# Data Cleaning and Feature Engineering On MovieLens Dataset

In [1]:
#importing libraries
import pandas as pd
import numpy as np

### loading datasets

In [2]:
moviesdf=pd.read_csv('movie.csv')
moviesdf=pd.DataFrame(moviesdf)
moviesdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  27278 non-null  int64 
 1   title    27278 non-null  object
 2   genres   27278 non-null  object
dtypes: int64(1), object(2)
memory usage: 639.5+ KB


In [3]:
moviesdf

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
27273,131254,Kein Bund für's Leben (2007),Comedy
27274,131256,"Feuer, Eis & Dosenbier (2002)",Comedy
27275,131258,The Pirates (2014),Adventure
27276,131260,Rentun Ruusu (2001),(no genres listed)


In [4]:
ratingdf=pd.read_csv('rating.csv')
ratingdf=pd.DataFrame(ratingdf)
ratingdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1048575 entries, 0 to 1048574
Data columns (total 4 columns):
 #   Column     Non-Null Count    Dtype  
---  ------     --------------    -----  
 0   userId     1048575 non-null  int64  
 1   movieId    1048575 non-null  int64  
 2   rating     1048575 non-null  float64
 3   timestamp  1048575 non-null  object 
dtypes: float64(1), int64(2), object(1)
memory usage: 32.0+ MB


In [5]:
ratingdf

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,02-04-2005 23:53
1,1,29,3.5,02-04-2005 23:31
2,1,32,3.5,02-04-2005 23:33
3,1,47,3.5,02-04-2005 23:32
4,1,50,3.5,02-04-2005 23:29
...,...,...,...,...
1048570,7120,168,5.0,02-04-2007 19:44
1048571,7120,253,4.0,02-04-2007 19:30
1048572,7120,260,5.0,02-04-2007 19:27
1048573,7120,261,4.0,02-04-2007 19:49


In [6]:
tagsdf=pd.read_csv('tag.csv',sep=',')
tagsdf=pd.DataFrame(tagsdf)
tagsdf.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 465564 entries, 0 to 465563
Data columns (total 4 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   userId     465564 non-null  int64 
 1   movieId    465564 non-null  int64 
 2   tag        465548 non-null  object
 3   timestamp  465564 non-null  object
dtypes: int64(2), object(2)
memory usage: 14.2+ MB


In [7]:
tagsdf

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,2009-04-24 18:19:40
1,65,208,dark hero,2013-05-10 01:41:18
2,65,353,dark hero,2013-05-10 01:41:19
3,65,521,noir thriller,2013-05-10 01:39:43
4,65,592,dark hero,2013-05-10 01:41:18
...,...,...,...,...
465559,138446,55999,dragged,2013-01-23 23:29:32
465560,138446,55999,Jason Bateman,2013-01-23 23:29:38
465561,138446,55999,quirky,2013-01-23 23:29:38
465562,138446,55999,sad,2013-01-23 23:29:32


### checking for null values and checking shape of data sets

In [8]:
moviesdf.isnull().value_counts()

movieId  title  genres
False    False  False     27278
Name: count, dtype: int64

In [9]:
moviesdf.isna().value_counts()

movieId  title  genres
False    False  False     27278
Name: count, dtype: int64

In [10]:
tagsdf.shape

(465564, 4)

In [11]:
ratingdf.isnull().value_counts()

userId  movieId  rating  timestamp
False   False    False   False        1048575
Name: count, dtype: int64

In [12]:
ratingdf.isna().value_counts()

userId  movieId  rating  timestamp
False   False    False   False        1048575
Name: count, dtype: int64

In [13]:
ratingdf.shape

(1048575, 4)

In [14]:
tagsdf.isnull().value_counts()

userId  movieId  tag    timestamp
False   False    False  False        465548
                 True   False            16
Name: count, dtype: int64

### dropping null values

In [15]:
tagsdf=tagsdf.dropna()

In [16]:
tagsdf.isnull().value_counts()

userId  movieId  tag    timestamp
False   False    False  False        465548
Name: count, dtype: int64

### checking for unique tags 

In [17]:
uniquetags=tagsdf['tag'].unique().tolist()
len(uniquetags)

38643

In [18]:
#getting total number of rating per movies
ratingdf['movieId'].value_counts()

movieId
296      3498
356      3476
593      3247
318      3216
480      3129
         ... 
5978        1
5918        1
5807        1
8660        1
65651       1
Name: count, Length: 14026, dtype: int64

In [19]:
#check for duplicate values
moviesdf['movieId'].duplicated().value_counts()

movieId
False    27278
Name: count, dtype: int64

### adding random null values to datasets for data cleaning for making it more challenging(can be skipped)

In [20]:
probability_of_null = 0.0005
mask = np.random.rand(len(moviesdf)) < probability_of_null
moviesdf.loc[mask] = np.NaN
print(moviesdf)

        movieId                               title  \
0           1.0                    Toy Story (1995)   
1           2.0                      Jumanji (1995)   
2           3.0             Grumpier Old Men (1995)   
3           4.0            Waiting to Exhale (1995)   
4           5.0  Father of the Bride Part II (1995)   
...         ...                                 ...   
27273  131254.0        Kein Bund für's Leben (2007)   
27274  131256.0       Feuer, Eis & Dosenbier (2002)   
27275  131258.0                  The Pirates (2014)   
27276  131260.0                 Rentun Ruusu (2001)   
27277  131262.0                    Innocence (2014)   

                                            genres  
0      Adventure|Animation|Children|Comedy|Fantasy  
1                       Adventure|Children|Fantasy  
2                                   Comedy|Romance  
3                             Comedy|Drama|Romance  
4                                           Comedy  
...                  

In [21]:
probability_of_null = 0.0005
c='rating'
mask = np.random.rand(len(ratingdf)) < probability_of_null
ratingdf.loc[mask,c] = np.NaN

print(ratingdf)

         userId  movieId  rating         timestamp
0             1        2     3.5  02-04-2005 23:53
1             1       29     3.5  02-04-2005 23:31
2             1       32     3.5  02-04-2005 23:33
3             1       47     3.5  02-04-2005 23:32
4             1       50     3.5  02-04-2005 23:29
...         ...      ...     ...               ...
1048570    7120      168     5.0  02-04-2007 19:44
1048571    7120      253     4.0  02-04-2007 19:30
1048572    7120      260     5.0  02-04-2007 19:27
1048573    7120      261     4.0  02-04-2007 19:49
1048574    7120      266     3.5  02-04-2007 19:34

[1048575 rows x 4 columns]


In [22]:
#deleing timestamp section from ratings dataset as it is not relevenat 
del ratingdf['timestamp']

In [23]:
#deleing timestamp and userId section from tags dataset as it is not relevenat 
tagsdf = tagsdf.drop(columns=['timestamp', 'userId'])

In [24]:
probability_of_null = 0.005
c='movieId', 'userId'
mask = np.random.rand(len(ratingdf)) < probability_of_null
ratingdf.loc[mask,c] = np.NaN

print(ratingdf)

         userId  movieId  rating
0           1.0      2.0     3.5
1           1.0     29.0     3.5
2           1.0     32.0     3.5
3           1.0     47.0     3.5
4           1.0     50.0     3.5
...         ...      ...     ...
1048570  7120.0    168.0     5.0
1048571  7120.0    253.0     4.0
1048572  7120.0    260.0     5.0
1048573  7120.0    261.0     4.0
1048574  7120.0    266.0     3.5

[1048575 rows x 3 columns]


### Checking for null values

In [25]:
moviesdf.isnull().value_counts()

movieId  title  genres
False    False  False     27264
True     True   True         14
Name: count, dtype: int64

In [26]:
ratingdf.isnull().value_counts()

userId  movieId  rating
False   False    False     1042782
True    True     False        5248
False   False    True          543
True    True     True            2
Name: count, dtype: int64

In [27]:
ratingdf.isnull().sum()

userId     5250
movieId    5250
rating      545
dtype: int64

In [28]:
moviesdf.isnull().sum()

movieId    14
title      14
genres     14
dtype: int64

### dropping null rows from movies dataset

In [29]:
moviesdf.dropna(inplace=True)

In [30]:
moviesdf.isnull().sum()

movieId    0
title      0
genres     0
dtype: int64

### Making a list of movie Ids and filtering ratings of movies which are not in the list from rating dataset

In [31]:
movie_ids = moviesdf['movieId']
filter_ratingdf = ratingdf[ratingdf['movieId'].isin(movie_ids)]

In [32]:
filter_ratingdf.isnull().value_counts()

userId  movieId  rating
False   False    False     1042605
                 True          543
Name: count, dtype: int64

In [33]:
filter_ratingdf

Unnamed: 0,userId,movieId,rating
0,1.0,2.0,3.5
1,1.0,29.0,3.5
2,1.0,32.0,3.5
3,1.0,47.0,3.5
4,1.0,50.0,3.5
...,...,...,...
1048570,7120.0,168.0,5.0
1048571,7120.0,253.0,4.0
1048572,7120.0,260.0,5.0
1048573,7120.0,261.0,4.0


### Putting median value of rating per differnt movie in null rows of rating dataset

In [34]:
movie_medians = filter_ratingdf.groupby('movieId')['rating'].median().to_dict()
def fill_rating(row):
  movie_id = row['movieId']
  if pd.isnull(row['rating']):
    row['rating'] = movie_medians.get(movie_id)
  return row

filter_ratingdf = filter_ratingdf.apply(fill_rating, axis=1)

In [35]:
filter_ratingdf.isnull().value_counts()

userId  movieId  rating
False   False    False     1043148
Name: count, dtype: int64

In [36]:
#Getting index of null values if any
filter_ratingdf[filter_ratingdf['rating'].isnull()].index

Index([], dtype='int64')

### Checking for outliers using z-score and iqr

In [65]:
zscore=(filter_ratingdf['rating']-filter_ratingdf['rating'].mean())/filter_ratingdf['rating'].std()
t=3
outliers=filter_ratingdf[zscore.abs()>t]
print(outliers[['movieId','rating']])

Empty DataFrame
Columns: [movieId, rating]
Index: []


In [38]:
q1 = filter_ratingdf['rating'].quantile(0.25)
q3 = filter_ratingdf['rating'].quantile(0.75)
iqr = q3 - q1
lbound = q1 - 1.5 * iqr
ubound = q3 + 1.5 * iqr
outlier = filter_ratingdf[(filter_ratingdf['rating'] < lbound) | (filter_ratingdf['rating'] > ubound)]
print(outliers[['movieId', 'rating']])

Empty DataFrame
Columns: [movieId, rating]
Index: []


In [39]:
#checking of there is any rating below 0.5
ratingdf.loc[ratingdf['rating'] < 0.5, 'rating'] = 0.5


In [40]:
#checking of there is any rating above 5
ratingdf.loc[ratingdf['rating'] > 5, 'rating'] = 5

In [41]:
#counting differrnt genres
moviesdf['genres'].value_counts()

genres
Drama                                                  4518
Comedy                                                 2294
Documentary                                            1940
Comedy|Drama                                           1263
Drama|Romance                                          1075
                                                       ... 
Action|Comedy|Crime|Western                               1
Action|Fantasy|Sci-Fi|Thriller                            1
Action|Drama|Fantasy|Horror|Mystery|Sci-Fi|Thriller       1
Action|Drama|Fantasy|Romance                              1
Animation|Children|Comedy|Western                         1
Name: count, Length: 1342, dtype: int64

### making title string lowercase

In [42]:
moviesdf['title'] = moviesdf['title'].str.lower()

In [43]:
print(filter_ratingdf)

         userId  movieId  rating
0           1.0      2.0     3.5
1           1.0     29.0     3.5
2           1.0     32.0     3.5
3           1.0     47.0     3.5
4           1.0     50.0     3.5
...         ...      ...     ...
1048570  7120.0    168.0     5.0
1048571  7120.0    253.0     4.0
1048572  7120.0    260.0     5.0
1048573  7120.0    261.0     4.0
1048574  7120.0    266.0     3.5

[1043148 rows x 3 columns]


### seperating years from title

In [113]:
moviesdf['year'] =moviesdf['title'].str.extract('.*\\((.*)\\).*',expand = True)
print(moviesdf['year'])

0        1995
1        1995
2        1995
3        1995
4        1995
         ... 
27273    2007
27274    2002
27275    2014
27276    2001
27277    2014
Name: year, Length: 27264, dtype: object


In [114]:
missing_year_movies = moviesdf[moviesdf['year'].isnull()]
movie_ids = missing_year_movies['movieId'].tolist()
print(movie_ids)

[]


### making a new dataset with both movies dataset and rating dataset

In [115]:
moviesdf_ratingdf=moviesdf.merge(filter_ratingdf,on = 'movieId',how = 'inner')
moviesdf_ratingdf

Unnamed: 0,movieId,title,genres,year,userId,rating
0,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.0,4.0
1,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,6.0,5.0
2,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,8.0,4.0
3,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,10.0,4.0
4,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,11.0,4.5
...,...,...,...,...,...,...
1043143,130219.0,the dark knight (2011),Action|Crime|Drama|Thriller,2011,1339.0,4.5
1043144,130462.0,the boy (2015),(no genres listed),2015,5731.0,4.0
1043145,130490.0,insurgent (2015),Action|Romance|Sci-Fi,2015,2423.0,1.0
1043146,130490.0,insurgent (2015),Action|Romance|Sci-Fi,2015,3397.0,3.5


In [116]:
#checking how many movies have drama as their genre
drama_movies=moviesdf['genres'].str.contains('Drama')
moviesdf[drama_movies]

Unnamed: 0,movieId,title,genres,year
3,4.0,waiting to exhale (1995),Comedy|Drama|Romance,1995
10,11.0,"american president, the (1995)",Comedy|Drama|Romance,1995
13,14.0,nixon (1995),Drama,1995
15,16.0,casino (1995),Crime|Drama,1995
16,17.0,sense and sensibility (1995),Drama|Romance,1995
...,...,...,...,...
27256,131162.0,por un puñado de besos (2014),Drama|Romance,2014
27259,131168.0,phoenix (2014),Drama,2014
27262,131174.0,gentlemen (2014),Drama|Romance|Thriller,2014
27263,131176.0,a second chance (2014),Drama,2014


### geting average rating per movie from overall rating

In [117]:
average_ratings = moviesdf_ratingdf.groupby('movieId')['rating'].mean().reset_index()
moviesdf_ratingdf=pd.merge(moviesdf_ratingdf,average_ratings,on='movieId',how='left')


In [118]:
#renaming columns 
moviesdf_ratingdf.rename(columns={'rating_x':'rating','rating_y':'avg_rating'},inplace=True)
moviesdf_ratingdf

Unnamed: 0,movieId,title,genres,year,userId,rating,avg_rating
0,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.0,4.0,3.960383
1,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,6.0,5.0,3.960383
2,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,8.0,4.0,3.960383
3,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,10.0,4.0,3.960383
4,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,11.0,4.5,3.960383
...,...,...,...,...,...,...,...
1043143,130219.0,the dark knight (2011),Action|Crime|Drama|Thriller,2011,1339.0,4.5,4.500000
1043144,130462.0,the boy (2015),(no genres listed),2015,5731.0,4.0,4.000000
1043145,130490.0,insurgent (2015),Action|Romance|Sci-Fi,2015,2423.0,1.0,2.250000
1043146,130490.0,insurgent (2015),Action|Romance|Sci-Fi,2015,3397.0,3.5,2.250000


### getting how many users have rated the movie

In [119]:
average_ratings_count = moviesdf_ratingdf.groupby('movieId')['rating'].count().reset_index()
moviesdf_ratingdf=pd.merge(moviesdf_ratingdf,average_ratings_count,on='movieId',how='left')

In [120]:
average_ratings_count

Unnamed: 0,movieId,rating
0,1.0,2562
1,2.0,1152
2,3.0,684
3,4.0,138
4,5.0,654
...,...,...
13989,130073.0,1
13990,130219.0,1
13991,130462.0,1
13992,130490.0,2


In [121]:
#renaming columns
moviesdf_ratingdf.rename(columns={'rating_x':'rating','rating_y':'no.user_rating/movie'},inplace=True)
moviesdf_ratingdf

Unnamed: 0,movieId,title,genres,year,userId,rating,avg_rating,no.user_rating/movie
0,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,3.0,4.0,3.960383,2562
1,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,6.0,5.0,3.960383,2562
2,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,8.0,4.0,3.960383,2562
3,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,10.0,4.0,3.960383,2562
4,1.0,toy story (1995),Adventure|Animation|Children|Comedy|Fantasy,1995,11.0,4.5,3.960383,2562
...,...,...,...,...,...,...,...,...
1043143,130219.0,the dark knight (2011),Action|Crime|Drama|Thriller,2011,1339.0,4.5,4.500000,1
1043144,130462.0,the boy (2015),(no genres listed),2015,5731.0,4.0,4.000000,1
1043145,130490.0,insurgent (2015),Action|Romance|Sci-Fi,2015,2423.0,1.0,2.250000,2
1043146,130490.0,insurgent (2015),Action|Romance|Sci-Fi,2015,3397.0,3.5,2.250000,2


In [122]:
#deleting rating and user ID columns
moviesdf_ratingdf = moviesdf_ratingdf.drop(columns=['rating', 'userId'])

### Dropping duplicate rows based on Movie Id

In [123]:
moviesdf_ratingdf = moviesdf_ratingdf.drop_duplicates(subset='movieId', keep='first')
print(moviesdf_ratingdf)

          movieId                               title  \
0             1.0                    toy story (1995)   
2562          2.0                      jumanji (1995)   
3714          3.0             grumpier old men (1995)   
4398          4.0            waiting to exhale (1995)   
4536          5.0  father of the bride part ii (1995)   
...           ...                                 ...   
1043142  130073.0                   cinderella (2015)   
1043143  130219.0              the dark knight (2011)   
1043144  130462.0                      the boy (2015)   
1043145  130490.0                    insurgent (2015)   
1043147  130642.0                  backcountry (2014)   

                                              genres  year  avg_rating  \
0        Adventure|Animation|Children|Comedy|Fantasy  1995    3.960383   
2562                      Adventure|Children|Fantasy  1995    3.269965   
3714                                  Comedy|Romance  1995    3.182749   
4398               

### Contructing a column Named Popularity Based on formula (Rating x No_of_ratings)/200

In [124]:
def popularity(rating, no_of_ratings):
  popularity = (rating * no_of_ratings) / 200
  return popularity
moviesdf_ratingdf['popularity'] =moviesdf_ratingdf.apply(lambda row: popularity(row['avg_rating'], row['no.user_rating/movie']), axis=1)
print(moviesdf_ratingdf)

          movieId                               title  \
0             1.0                    toy story (1995)   
2562          2.0                      jumanji (1995)   
3714          3.0             grumpier old men (1995)   
4398          4.0            waiting to exhale (1995)   
4536          5.0  father of the bride part ii (1995)   
...           ...                                 ...   
1043142  130073.0                   cinderella (2015)   
1043143  130219.0              the dark knight (2011)   
1043144  130462.0                      the boy (2015)   
1043145  130490.0                    insurgent (2015)   
1043147  130642.0                  backcountry (2014)   

                                              genres  year  avg_rating  \
0        Adventure|Animation|Children|Comedy|Fantasy  1995    3.960383   
2562                      Adventure|Children|Fantasy  1995    3.269965   
3714                                  Comedy|Romance  1995    3.182749   
4398               

### getting descriptive statistics

In [125]:
moviesdf_ratingdf[['avg_rating','no.user_rating/movie','popularity']].describe()

Unnamed: 0,avg_rating,no.user_rating/movie,popularity
count,13994.0,13994.0,13994.0
mean,3.216392,74.542518,1.315516
std,0.790597,217.313412,4.099943
min,0.5,1.0,0.0025
25%,2.833333,2.0,0.035
50%,3.363636,9.0,0.14
75%,3.75,43.0,0.6975
max,5.0,3481.0,72.29


### most 20 most rated titles/movies

In [126]:
most_rated=moviesdf_ratingdf.sort_values(by='no.user_rating/movie',ascending=False)[['title','no.user_rating/movie']][:20]
most_rated


Unnamed: 0,title,no.user_rating/movie
101473,pulp fiction (1994),3481
128388,forrest gump (1994),3459
208986,"silence of the lambs, the (1991)",3230
111362,"shawshank redemption, the (1994)",3201
167201,jurassic park (1993),3111
88613,star wars: episode iv - a new hope (1977),2859
43731,braveheart (1995),2787
201615,terminator 2: judgment day (1991),2703
578868,"matrix, the (1999)",2689
181425,schindler's list (1993),2579


### function for seperaing movie genre and counting them

In [127]:
def count_word(df, ref_col, liste):
    keyword_count = dict()
    for s in liste: keyword_count[s] = 0
    for liste_keywords in df[ref_col].str.split('|'):
        if type(liste_keywords) == float and pd.isnull(liste_keywords): continue
        for s in liste_keywords: 
            if pd.notnull(s): keyword_count[s] += 1
    keyword_occurences = []
    for k,v in keyword_count.items():
        keyword_occurences.append([k,v])
    keyword_occurences.sort(key = lambda x:x[1], reverse = True)
    return keyword_occurences, keyword_count

In [128]:
genre_labels = set()
for s in moviesdf['genres'].str.split('|').values:
    genre_labels = genre_labels.union(set(s))

### Getting Enteries per Genres

In [129]:
keyword_occurences, dum = count_word(moviesdf, 'genres', genre_labels)
keyword_occurences

[['Drama', 13339],
 ['Comedy', 8371],
 ['Thriller', 4177],
 ['Romance', 4126],
 ['Action', 3519],
 ['Crime', 2939],
 ['Horror', 2609],
 ['Documentary', 2469],
 ['Adventure', 2328],
 ['Sci-Fi', 1742],
 ['Mystery', 1513],
 ['Fantasy', 1411],
 ['War', 1194],
 ['Children', 1138],
 ['Musical', 1034],
 ['Animation', 1026],
 ['Western', 676],
 ['Film-Noir', 330],
 ['(no genres listed)', 246],
 ['IMAX', 196]]

### storing the data into new file for future use

In [134]:
moviesdf_ratingdf.to_csv("movies_with_ratings.csv", index=False)

In [131]:
moviesdf_ratingdf.isnull().value_counts()

movieId  title  genres  year   avg_rating  no.user_rating/movie  popularity
False    False  False   False  False       False                 False         13994
Name: count, dtype: int64

### pivot chart for rating vs average rating to find any discrepancies

In [132]:
pivot_table = moviesdf_ratingdf.pivot_table(
    values='avg_rating', index=['movieId','popularity'], aggfunc='mean')
pivot_table

Unnamed: 0_level_0,Unnamed: 1_level_0,avg_rating
movieId,popularity,Unnamed: 2_level_1
1.0,50.7325,3.960383
2.0,18.8350,3.269965
3.0,10.8850,3.182749
4.0,2.0700,3.000000
5.0,10.2825,3.144495
...,...,...
130073.0,0.0125,2.500000
130219.0,0.0225,4.500000
130462.0,0.0200,4.000000
130490.0,0.0225,2.250000


### Getting Top 4 Popular movies 

In [133]:
a=moviesdf_ratingdf['popularity']>60
high_rated_movies = moviesdf_ratingdf[a]
print(high_rated_movies)

        movieId                             title  \
101473    296.0               pulp fiction (1994)   
111362    318.0  shawshank redemption, the (1994)   
128388    356.0               forrest gump (1994)   
208986    593.0  silence of the lambs, the (1991)   

                             genres  year  avg_rating  no.user_rating/movie  \
101473  Comedy|Crime|Drama|Thriller  1994    4.153404                  3481   
111362                  Crime|Drama  1994    4.469385                  3201   
128388     Comedy|Drama|Romance|War  1994    4.054640                  3459   
208986        Crime|Horror|Thriller  1991    4.184056                  3230   

        popularity  
101473     72.2900  
111362     71.5325  
128388     70.1250  
208986     67.5725  
