In [388]:
import pandas as pd
movies = pd.read_csv('movies.csv')
ratings = pd.read_csv('rating.csv')

In [28]:
movies.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [29]:
ratings.head()

Unnamed: 0,userId,movie_Id,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


## 1. Human-readable datetime Format

In [32]:
pd.to_datetime(ratings['timestamp'], unit='s').head()

0   2000-07-30 18:45:03
1   2000-07-30 18:20:47
2   2000-07-30 18:37:04
3   2000-07-30 19:03:35
4   2000-07-30 18:48:51
Name: timestamp, dtype: datetime64[ns]

In [34]:
ratings['timestamp'].head()

0    964982703
1    964981247
2    964982224
3    964983815
4    964982931
Name: timestamp, dtype: int64

## 2. Top 5 most-rated movies

In [55]:
highestRated = ratings['movie_Id'].value_counts(ascending=False).reset_index().head()
highestRated

Unnamed: 0,movie_Id,count
0,356,329
1,318,317
2,296,307
3,593,279
4,2571,278


In [56]:
highestRated.columns = ['movieId', 'ratingCount']

In [57]:
highestRated

Unnamed: 0,movieId,ratingCount
0,356,329
1,318,317
2,296,307
3,593,279
4,2571,278


In [115]:
maxRated = pd.merge(movies[['movieId', 'title']], highestRated, on='movieId', how='right')
maxRated.sort_values(by='ratingCount', ascending=False).reset_index(drop=True)

Unnamed: 0,movieId,title,ratingCount
0,356,Forrest Gump (1994),329
1,318,"Shawshank Redemption, The (1994)",317
2,296,Pulp Fiction (1994),307
3,593,"Silence of the Lambs, The (1991)",279
4,2571,"Matrix, The (1999)",278


In [116]:
#Another Method
topRated = ratings.value_counts('movie_Id', ascending=False).reset_index().head()
topRated

Unnamed: 0,movie_Id,count
0,356,329
1,318,317
2,296,307
3,593,279
4,2571,278


In [119]:
topFive = pd.merge(movies[['movieId', 'title']], topRated, left_on='movieId', right_on='movie_Id', how='right').reset_index(drop=True).head()

In [120]:
top = topFive.sort_values(by='count', ascending=False).reset_index(drop=True)
top.drop('movie_Id', axis=1)

Unnamed: 0,movieId,title,count
0,356,Forrest Gump (1994),329
1,318,"Shawshank Redemption, The (1994)",317
2,296,Pulp Fiction (1994),307
3,593,"Silence of the Lambs, The (1991)",279
4,2571,"Matrix, The (1999)",278


## 3. Identifying the most common genres

In [192]:
movies['genreList'] = movies['genres'].str.split('|')
explodedList = movies.explode('genreList')

In [193]:
genreCounts = explodedList['genreList'].value_counts(ascending=False)

In [200]:
genre = genreCounts.reset_index()

In [201]:
genre.columns = ['Genres', 'Genre Counts']

In [202]:
genre

Unnamed: 0,Genres,Genre Counts
0,Drama,4361
1,Comedy,3756
2,Thriller,1894
3,Action,1828
4,Romance,1596
5,Adventure,1263
6,Crime,1199
7,Sci-Fi,980
8,Horror,978
9,Fantasy,779


In [220]:
genre.merge(movies[['title']], left_index=True, right_index=True)

Unnamed: 0,Genres,Genre Counts,title
0,Drama,4361,Toy Story (1995)
1,Comedy,3756,Jumanji (1995)
2,Thriller,1894,Grumpier Old Men (1995)
3,Action,1828,Waiting to Exhale (1995)
4,Romance,1596,Father of the Bride Part II (1995)
5,Adventure,1263,Heat (1995)
6,Crime,1199,Sabrina (1995)
7,Sci-Fi,980,Tom and Huck (1995)
8,Horror,978,Sudden Death (1995)
9,Fantasy,779,GoldenEye (1995)


## 4. Average rating for each movie

In [243]:
averageRating = ratings.groupby('movie_Id')['rating'].mean().reset_index()

In [310]:
averageRating.tail()

Unnamed: 0,movie_Id,rating
9719,193581,4.0
9720,193583,3.5
9721,193585,3.5
9722,193587,3.5
9723,193609,4.0


In [328]:
movieRatingCount = ratings['movie_Id'].value_counts()
movieRatingCount.reset_index()

Unnamed: 0,movie_Id,count
0,356,329
1,318,317
2,296,307
3,593,279
4,2571,278
...,...,...
9719,86279,1
9720,86922,1
9721,5962,1
9722,87660,1


In [326]:
eligibleMovie = movieRatingCount[movieRatingCount>=50].index
eligibleMovie

Index([   356,    318,    296,    593,   2571,    260,    480,    110,    589,
          527,
       ...
        96079,  34048,  88125, 116797,   8464,    333,   3785,   8361,   2105,
        33166],
      dtype='int64', name='movie_Id', length=450)

In [334]:
highestRatedMovie = averageRating[averageRating['movie_Id'].isin(eligibleMovie)].sort_values(by='rating', ascending=False).head(5)
highestRatedMovie

Unnamed: 0,movie_Id,rating
277,318,4.429022
659,858,4.289062
2224,2959,4.272936
974,1276,4.27193
602,750,4.268041


In [335]:
highestRatedMovie = highestRatedMovie.merge(movies[['movieId', 'title']], left_on='movie_Id', right_on='movieId')

In [336]:
highestRatedMovie

Unnamed: 0,movie_Id,rating,movieId,title
0,318,4.429022,318,"Shawshank Redemption, The (1994)"
1,858,4.289062,858,"Godfather, The (1972)"
2,2959,4.272936,2959,Fight Club (1999)
3,1276,4.27193,1276,Cool Hand Luke (1967)
4,750,4.268041,750,Dr. Strangelove or: How I Learned to Stop Worr...


## 5. Group movies by release_year and average rating

In [362]:
movies

Unnamed: 0,movieId,title,genres,genreList,Movie Name,Year
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,"[Adventure, Animation, Children, Comedy, Fantasy]",Toy Story,1995
1,2,Jumanji (1995),Adventure|Children|Fantasy,"[Adventure, Children, Fantasy]",Jumanji,1995
2,3,Grumpier Old Men (1995),Comedy|Romance,"[Comedy, Romance]",Grumpier Old Men,1995
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance,"[Comedy, Drama, Romance]",Waiting to Exhale,1995
4,5,Father of the Bride Part II (1995),Comedy,[Comedy],Father of the Bride Part II,1995
...,...,...,...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy,"[Action, Animation, Comedy, Fantasy]",Black Butler: Book of the Atlantic,2017
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy,"[Animation, Comedy, Fantasy]",No Game No Life: Zero,2017
9739,193585,Flint (2017),Drama,[Drama],Flint,2017
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation,"[Action, Animation]",Bungo Stray Dogs: Dead Apple,2018


In [363]:
movies[['Movie Name', 'Year']]=movies['title'].str.extract(r"^(.*)\s\((\d{4})\)$")

In [364]:
moviesWithRating = movies.merge(ratings, left_on='movieId', right_on='movie_Id', how='inner')

In [365]:
yearlyStatus = moviesWithRating.groupby('Year').agg(
    totalMovies = ('movieId', 'nunique'),
    avgRating = ('rating', 'mean')).reset_index()

In [366]:
yearlyStatus

Unnamed: 0,Year,totalMovies,avgRating
0,1902,1,3.500000
1,1903,1,2.500000
2,1908,1,4.000000
3,1915,1,2.000000
4,1916,4,3.600000
...,...,...,...
101,2014,277,3.512140
102,2015,274,3.410386
103,2016,218,3.387261
104,2017,147,3.578091
