# Exploratory data analysis for movie recommendation

In [1]:
pip install neattext

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import pandas as pd
import numpy as np
import warnings
import neattext.functions as nfx
from scipy.sparse import csr_matrix
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [2]:
warnings.filterwarnings(action='ignore')

In [6]:
# Load the movie catalogue from the zipped CSV in the working directory.
movies = pd.read_csv(filepath_or_buffer='movies.csv.zip')

# Quick visual check of the loaded frame.
movies

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
34203,151697,Grand Slam (1967),Thriller
34204,151701,Bloodmoney (2010),(no genres listed)
34205,151703,The Butterfly Circus (2009),Drama
34206,151709,Zero (2015),Drama|Sci-Fi


In [8]:
# Column dtypes of the movies frame, kept in a variable and printed as plain text.
data_types = movies.dtypes
print(data_types)

movieId     int64
title      object
genres     object
dtype: object


In [9]:
# Per-column flag: True when the column contains at least one missing value.
movies.isna().any()

movieId    False
title      False
genres     False
dtype: bool

In [10]:
# Load the user ratings from the zipped CSV in the working directory.
ratings = pd.read_csv(filepath_or_buffer='ratings.csv.zip')

# Quick visual check of the loaded frame.
ratings

Unnamed: 0,userId,movieId,rating,timestamp
0,1,169,2.5,1204927694
1,1,2471,3.0,1204927438
2,1,48516,5.0,1204927435
3,2,2571,3.5,1436165433
4,2,109487,4.0,1436165496
...,...,...,...,...
22884372,247753,49530,5.0,1430437962
22884373,247753,69481,3.0,1430437984
22884374,247753,74458,4.0,1430437968
22884375,247753,76093,5.0,1430437811


In [11]:
    ratings.dtypes

userId         int64
movieId        int64
rating       float64
timestamp      int64
dtype: object

In [12]:
# Per-column flag: True when the column contains at least one missing value.
ratings.isna().any()

userId       False
movieId      False
rating       False
timestamp    False
dtype: bool

In [14]:
# Number of distinct movieId values present in the ratings.
# dropna=False matches unique(), which would also count a missing value once.
ratings['movieId'].nunique(dropna=False)

33670

In [15]:
# Number of distinct userId values present in the ratings.
# dropna=False matches unique(), which would also count a missing value once.
ratings['userId'].nunique(dropna=False)

247753

In [16]:
# (lowest, highest) rating value observed in the data.
ratingscale = (
    ratings['rating'].min(),
    ratings['rating'].max(),
)

In [17]:
# Display the (min, max) tuple of observed rating values.
ratingscale

(0.5, 5.0)

In [18]:
# Keep only movieId and rating. DataFrame.drop already returns a new frame, so
# `ratings` stays untouched without an explicit full copy or an in-place
# mutation, and re-running this cell always yields the same result.
mean_ratings_df = ratings.drop(columns=['userId', 'timestamp'])

# Quick visual check of the reduced frame.
mean_ratings_df

Unnamed: 0,movieId,rating
0,169,2.5
1,2471,3.0
2,48516,5.0
3,2571,3.5
4,109487,4.0
...,...,...
22884372,49530,5.0
22884373,69481,3.0
22884374,74458,4.0
22884375,76093,5.0


In [19]:
# Collapse to one row per movie holding the mean of its ratings;
# movieId becomes the index of the result.
mean_ratings_df = mean_ratings_df.groupby(by='movieId').mean()

mean_ratings_df

Unnamed: 0_level_0,rating
movieId,Unnamed: 1_level_1
1,3.894802
2,3.221086
3,3.180094
4,2.879727
5,3.080811
...,...
151697,3.000000
151701,4.000000
151703,5.000000
151709,3.500000


In [20]:
# Move movieId out of the index and back into a regular column.
mean_ratings_df = mean_ratings_df.reset_index()

In [21]:
# Display the per-movie mean ratings frame.
mean_ratings_df

Unnamed: 0,movieId,rating
0,1,3.894802
1,2,3.221086
2,3,3.180094
3,4,2.879727
4,5,3.080811
...,...,...
33665,151697,3.000000
33666,151701,4.000000
33667,151703,5.000000
33668,151709,3.500000


In [22]:
# Select every movie whose mean rating equals the highest mean rating.
# Series.max() is vectorised and NaN-aware, unlike the Python builtin max(),
# which iterates the Series element by element.
max_movies = mean_ratings_df.loc[
    mean_ratings_df['rating'] == mean_ratings_df['rating'].max()
]

max_movies

Unnamed: 0,movieId,rating
9008,26718,5.0
9563,27914,5.0
10571,40404,5.0
14433,72235,5.0
14636,73141,5.0
...,...,...
33649,151657,5.0
33651,151663,5.0
33663,151691,5.0
33664,151695,5.0


In [23]:
# Renumber the rows from 0 and discard the old index labels.
max_movies = max_movies.reset_index(drop=True)

max_movies

Unnamed: 0,movieId,rating
0,26718,5.0
1,27914,5.0
2,40404,5.0
3,72235,5.0
4,73141,5.0
...,...,...
266,151657,5.0
267,151663,5.0
268,151691,5.0
269,151695,5.0


In [24]:

# Collect the ids of the top-rated movies and look up their catalogue rows.
max_ids = max_movies['movieId'].tolist()
best_rated = movies.loc[movies['movieId'].isin(max_ids)]

best_rated

Unnamed: 0,movieId,title,genres
9011,26718,Life On A String (Bian chang Bian Zou) (1991),Adventure|Drama|Fantasy|Musical
9567,27914,"Hijacking Catastrophe: 9/11, Fear & the Sellin...",Documentary
10577,40404,Al otro lado (2004),Drama
14445,72235,Between the Devil and the Deep Blue Sea (1995),Drama
14650,73141,Moana (1926),Documentary
...,...,...,...
34187,151657,iMurders (2008),Drama|Horror|Mystery|Thriller
34189,151663,"Semen, a Love Sample (2005)",Comedy|Romance
34201,151691,Hollywood High (1976),Comedy
34202,151695,The Survivalist (2015),Drama|Sci-Fi|Thriller


In [25]:
# Select every movie whose mean rating equals the lowest mean rating.
# Series.min() is vectorised and NaN-aware, unlike the Python builtin min(),
# which iterates the Series element by element.
min_movies = mean_ratings_df.loc[
    mean_ratings_df['rating'] == mean_ratings_df['rating'].min()
]

min_movies

Unnamed: 0,movieId,rating
5707,5805,0.5
12556,58760,0.5
12699,59775,0.5
14653,73196,0.5
14663,73230,0.5
...,...,...
33557,150904,0.5
33587,151365,0.5
33595,151443,0.5
33644,151641,0.5


In [26]:
# Renumber the rows from 0 and discard the old index labels in one step.
# This replaces reset_index() followed by an in-place drop of the 'index'
# column, and matches how max_movies is handled.
min_movies = min_movies.reset_index(drop=True)

min_movies

Unnamed: 0,movieId,rating
0,5805,0.5
1,58760,0.5
2,59775,0.5
3,73196,0.5
4,73230,0.5
...,...,...
337,150904,0.5
338,151365,0.5
339,151443,0.5
340,151641,0.5


In [27]:
# Collect the ids of the lowest-rated movies and look up their catalogue rows.
min_ids = min_movies['movieId'].tolist()
worst_rated = movies.loc[movies['movieId'].isin(min_ids)]

worst_rated

Unnamed: 0,movieId,title,genres
5707,5805,Besotted (2001),Drama
12562,58760,Joysticks (1983),Comedy
12705,59775,Allan Quatermain and the Temple of Skulls (2008),Action|Adventure
14667,73196,My Love Has Been Burning (Waga koi wa moenu) (...,Drama
14677,73230,"Last Warrior, The (Last Patrol, The) (2000)",Action|Drama
...,...,...,...
34076,150904,"Dzień dobry, kocham cię! (2014)",Comedy
34125,151365,Weasels Rip My Flesh (1979),Horror|Sci-Fi
34133,151443,Sisters in Law (2005),(no genres listed)
34182,151641,90 Minutes in Heaven (2015),Drama
