In [29]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics  import silhouette_score
from scipy.spatial.distance import cdist
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [7]:
basics_df = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", sep="\t", low_memory=False)

In [8]:
basics_df =  basics_df.drop('endYear',axis=1)

In [9]:
movies = basics_df.loc[basics_df['titleType']=='movie']

In [10]:
movies = movies.drop('titleType',axis=1)

In [11]:
movies_ok = movies.loc[movies['isAdult']=='0']
movies_ok=movies_ok.drop('isAdult',axis=1)

In [12]:
movies_ok_nat = movies_ok.replace('\\N', pd.NaT)
movies_clean = movies_ok_nat.dropna()

In [13]:
movies_clean = movies_clean.astype({'runtimeMinutes': int})
movies_clean = movies_clean.astype({'startYear': int})

In [14]:
movies_clean = movies_clean[(movies_clean['runtimeMinutes'] >= 58) & (movies_clean['runtimeMinutes'] <= 270)]

In [15]:
movies_clean = movies_clean[(movies_clean['startYear'] >= 1918) & (movies_clean['startYear'] <= 2021)]

In [16]:
ratings_df = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", sep="\t")

In [17]:
movies = pd.merge(movies_clean,ratings_df,how='inner')

In [18]:
movies_7 = movies.loc[movies['averageRating']>=7.0]

In [19]:
movies_7_rating = movies_7[movies_7['numVotes'] >= 1000]


In [20]:
movie_genre = movies_7_rating[['genres']]

In [21]:
movie_genre[['genre1', 'genre2','genre3']] = movie_genre['genres'].str.split(',', n=2, expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [22]:
movie_genre.drop(columns='genres',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [23]:
movie_genre = pd.concat([movie_genre , movie_genre['genre1'].str.get_dummies(), movie_genre['genre2'].str.get_dummies(), movie_genre['genre3'].str.get_dummies()], 
          axis = 1)

In [24]:
movie_genre.drop(['genre1','genre2','genre3'],axis=1,inplace=True)

In [48]:
movie_for_ml = pd.concat([movies_7_rating, movie_genre.groupby(lambda x:x, axis=1).sum()], axis = 1)

In [49]:
movie_for_ml.drop(columns=['runtimeMinutes','genres','averageRating','numVotes'],axis =1,inplace=True)

In [27]:
#movie_for_ml.to_csv('../data/movies_for_ml.csv.zip', index=False, compression='zip')

In [50]:
movie_for_ml.drop(columns='primaryTitle',axis=1,inplace=True)

In [56]:
movie_for_ml = movie_for_ml.reset_index(drop=True)

**ML**

In [57]:
cols = movie_for_ml.columns[2:]
X = movie_for_ml[cols]

scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

In [58]:
movie_scaled = pd.DataFrame()

In [59]:
movie_scaled['tconst']=movie_for_ml['tconst']
movie_scaled['originalTitle']=movie_for_ml['originalTitle']
movie_scaled

Unnamed: 0,tconst,originalTitle
0,tt0008879,Berg-Ejvind och hans hustru
1,tt0009893,Die Austernprinzessin
2,tt0009937,Blind Husbands
3,tt0009968,Broken Blossoms or The Yellow Man and the Girl
4,tt0010247,Herr Arnes pengar
...,...,...
11642,tt9882084,Chasing Happiness
11643,tt9886872,Munthiri Monchan
11644,tt9900782,Kaithi
11645,tt9902160,Herself


In [60]:
scaled_df = pd.DataFrame(X_scaled,columns=cols)
scaled_df

Unnamed: 0,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,Drama,Family,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,-2.973615,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.437060,-0.357061,0.706242,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
1,-2.933570,-0.391659,-0.325535,-0.215743,-0.314612,1.588056,-0.437060,-0.357061,-1.415946,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
2,-2.933570,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.437060,-0.357061,0.706242,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,2.158559,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
3,-2.933570,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.437060,-0.357061,0.706242,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,2.158559,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
4,-2.933570,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.437060,-0.357061,0.706242,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
11642,1.071007,-0.391659,-0.325535,-0.215743,3.178515,-0.629701,-0.437060,2.800640,-1.415946,-0.197441,...,4.561892,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
11643,1.071007,-0.391659,-0.325535,-0.215743,-0.314612,1.588056,-0.437060,-0.357061,-1.415946,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,2.158559,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515
11644,1.071007,2.553242,-0.325535,-0.215743,-0.314612,-0.629701,2.288015,-0.357061,-1.415946,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,3.139206,-0.21487,-0.116515
11645,1.111052,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.437060,-0.357061,0.706242,-0.197441,...,-0.219207,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515


In [62]:
movie_scaled_concat = pd.concat([movie_scaled,scaled_df],axis=1)

In [63]:
cols = movie_scaled_concat.columns[2:]
X = movie_scaled_concat[cols]

modelKM = KMeans(n_clusters=10).fit(X)

movie_scaled_concat['cluster']=modelKM.labels_

In [77]:
cols = movie_scaled_concat.columns[2:]
X = movie_scaled_concat[cols]

modelKM = KMeans(n_clusters=35).fit(X)

movie_scaled_concat['cluster']=modelKM.labels_

In [78]:
movie_scaled_concat[movie_scaled_concat['originalTitle'].str.contains('Lord of')]

Unnamed: 0,tconst,originalTitle,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western,cluster
4806,tt0120737,The Lord of the Rings: The Fellowship of the Ring,0.350183,2.553242,3.071869,-0.215743,-0.314612,-0.629701,-0.43706,-0.357061,...,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,26
5095,tt0167260,The Lord of the Rings: The Return of the King,0.430274,2.553242,3.071869,-0.215743,-0.314612,-0.629701,-0.43706,-0.357061,...,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,26
5096,tt0167261,The Lord of the Rings: The Two Towers,0.390229,2.553242,3.071869,-0.215743,-0.314612,-0.629701,-0.43706,-0.357061,...,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,26
6423,tt0399295,Lord of War,0.510366,2.553242,-0.325535,-0.215743,-0.314612,-0.629701,2.288015,-0.357061,...,-0.149313,-0.244108,-0.044482,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,21


In [91]:
movie_scaled_concat[movie_scaled_concat['cluster']==7]

Unnamed: 0,tconst,originalTitle,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western,cluster
6377,tt0388789,Born Into Brothels: Calcutta's Red Light Kids,0.47032,-0.391659,-0.325535,-0.215743,3.178515,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
6985,tt0497116,An Inconvenient Truth,0.550412,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8081,tt1286537,"Food, Inc.",0.630503,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8106,tt1300563,The Age of Stupid,0.670549,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8170,tt1333634,Burma VJ: Reporter i et lukket land,0.630503,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8608,tt1618448,Racing Extinction,0.910824,-0.391659,3.071869,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8684,tt1671513,Four Horsemen,0.790686,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8818,tt1778338,The Culture High,0.870778,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
8836,tt1789083,The Weight of Chains,0.710595,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7
9118,tt2084953,Terms and Conditions May Apply,0.830732,-0.391659,-0.325535,-0.215743,-0.314612,-0.629701,-0.43706,2.80064,...,-0.149313,-0.244108,22.48091,-0.463272,-0.150212,-0.160923,-0.318552,-0.21487,-0.116515,7


In [79]:
movie_scaled_concat['cluster'].value_counts()

5     1564
2     1416
4      935
16     710
25     455
21     427
29     419
20     402
6      344
0      338
17     334
1      307
19     291
23     268
22     247
26     239
18     231
30     219
24     217
15     213
13     193
11     190
10     179
32     174
12     174
33     161
14     156
9      145
34     144
31     137
8      114
3      107
28      92
27      82
7       23
Name: cluster, dtype: int64

In [88]:
film = movie_scaled_concat[movie_scaled_concat['originalTitle']=='The Lord of the Rings: The Fellowship of the Ring']
n_cluster = movie_scaled_concat['cluster'][movie_scaled_concat['originalTitle']=='The Lord of the Rings: The Fellowship of the Ring'].values
df_for_KNN=movie_scaled_concat[movie_scaled_concat['cluster']==n_cluster[0]]

[26]


In [84]:
cols = movie_scaled_concat.columns[2:-1]
X = df_for_KNN[cols]

distanceKNN = NearestNeighbors(n_neighbors=100).fit(X)

In [85]:
list_film = distanceKNN.kneighbors(film[cols].values)
items = list_film[1].reshape(1,100)[0]

In [90]:
suggestion = movie_for_ml.iloc[items]
suggestion.tail()

Unnamed: 0,tconst,originalTitle,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
193,tt0022735,The Cabin in the Cotton,1932,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
203,tt0023049,If I Had a Million,1932,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
105,tt0018742,The Cameraman,1928,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
106,tt0018770,La chute de la maison Usher,1928,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
94,tt0018217,Oktyabr,1927,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [75]:
score = []
K = range(20, 40)
for k in K:
    modelKM = KMeans(n_clusters=k)
    modelKM.fit(X)
    score.append(silhouette_score(X, modelKM.labels_))

In [76]:
fig = px.line(x=K,y=score,
              labels={'x':'# of clusters','y':'score'},
              title = 'silhouette')

fig.update_layout(width = 1000,height = 500)
fig.show()