In [None]:
import pandas as pd
import numpy as np
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics  import silhouette_score
from scipy.spatial.distance import cdist
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
basics_df = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", sep="\t", low_memory=False)

In [3]:
basics_df =  basics_df.drop('endYear',axis=1)

In [4]:
movies = basics_df.loc[basics_df['titleType']=='movie']

In [5]:
movies = movies.drop('titleType',axis=1)

In [6]:
movies_ok = movies.loc[movies['isAdult']=='0']
movies_ok=movies_ok.drop('isAdult',axis=1)

In [7]:
movies_ok_nat = movies_ok.replace('\\N', pd.NaT)
movies_clean = movies_ok_nat.dropna()

In [8]:
movies_clean = movies_clean.astype({'runtimeMinutes': int})
movies_clean = movies_clean.astype({'startYear': int})

In [9]:
movies_clean = movies_clean[(movies_clean['runtimeMinutes'] >= 58) & (movies_clean['runtimeMinutes'] <= 270)]

In [10]:
movies_clean = movies_clean[(movies_clean['startYear'] >= 1918) & (movies_clean['startYear'] <= 2021)]

In [11]:
ratings_df = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", sep="\t")

In [12]:
movies = pd.merge(movies_clean,ratings_df,how='inner')

In [13]:
movies_7 = movies.loc[movies['averageRating']>=7.0]

In [14]:
movies_7_rating = movies_7[movies_7['numVotes'] >= 1000]


In [15]:
crew_df = pd.read_csv("https://datasets.imdbws.com/title.crew.tsv.gz", sep="\t")

In [16]:
principals_df = pd.read_csv("https://datasets.imdbws.com/title.principals.tsv.gz", sep="\t")

In [17]:
name_df = pd.read_csv("https://datasets.imdbws.com/name.basics.tsv.gz", sep="\t")

In [18]:
movie_dir = pd.merge(movies_7_rating,crew_df,how='inner',on='tconst')

In [19]:
movie_dir.drop('writers',axis=1,inplace=True)

In [20]:
movie_dir.drop('primaryTitle',axis=1,inplace=True)

In [21]:
movie_dir[['mainDirector', 'director2']] = movie_dir['directors'].str.split(',', n=1, expand=True)

In [22]:
movie_dir.drop('directors',axis=1,inplace=True)
movie_dir.drop('director2',axis=1,inplace=True)

In [23]:
movie_dir = movie_dir.replace('\\N', pd.NaT)

In [24]:
movie_dir.dropna(subset=['mainDirector'], inplace=True)


In [25]:
movie_dir = pd.concat([movie_dir[['tconst', 'originalTitle', 'startYear', 'runtimeMinutes','genres','averageRating','numVotes']] , movie_dir['mainDirector'].str.get_dummies()], 
          axis = 1)

In [27]:
movie_genre = movie_dir[['genres']]

In [29]:
movie_genre[['genre1', 'genre2','genre3']] = movie_genre['genres'].str.split(',', n=2, expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [31]:
movie_genre.drop(columns='genres',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [33]:
movie_genre = pd.concat([movie_genre , movie_genre['genre1'].str.get_dummies(), movie_genre['genre2'].str.get_dummies(), movie_genre['genre3'].str.get_dummies()], 
          axis = 1)

In [38]:
movie_genre.drop(['genre1','genre2','genre3'],axis=1,inplace=True)

In [40]:
movie_for_ml = pd.concat([movie_dir, movie_genre.groupby(lambda x:x, axis=1).sum()], axis = 1)

In [44]:
movie_for_ml.drop(columns=['runtimeMinutes','genres','averageRating','numVotes'],axis =1,inplace=True)

In [46]:
movie_for_ml.head()

Unnamed: 0,tconst,originalTitle,startYear,nm0000005,nm0000008,nm0000019,nm0000033,nm0000036,nm0000037,nm0000040,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
0,tt0008879,Berg-Ejvind och hans hustru,1918,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,tt0009893,Die Austernprinzessin,1919,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,tt0009937,Blind Husbands,1919,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,tt0009968,Broken Blossoms or The Yellow Man and the Girl,1919,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
4,tt0010247,Herr Arnes pengar,1919,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [51]:
cols = movie_for_ml.columns.tolist()
del cols[:2]

In [52]:
X=movie_for_ml[cols]

In [53]:
distortions = []
inertias = []
mapping1 = {}
mapping2 = {}
K = range(20, 41)
 
for k in K:
    # Building and fitting the model
    kmeanModel = KMeans(n_clusters=k).fit(X)
    kmeanModel.fit(X)
 
    distortions.append(sum(np.min(cdist(X, kmeanModel.cluster_centers_,
                                        'euclidean'), axis=1)) / X.shape[0])
    inertias.append(kmeanModel.inertia_)
 
    mapping1[k] = sum(np.min(cdist(X, kmeanModel.cluster_centers_,
                                   'euclidean'), axis=1)) / X.shape[0]
    mapping2[k] = kmeanModel.inertia_

In [None]:
fig = px.line(x=K,y=inertias,
              labels={'x':'Values of K','y':'inertias'},
              title = 'The Elbow Method using Inertia')

fig.update_layout(width = 1000,height = 500)
fig.show()

In [None]:
score = []
K = range(20, 41)
for k in K:
    modelKM = KMeans(n_clusters=k)
    modelKM.fit(X)
    score.append(silhouette_score(X, modelKM.labels_))

In [None]:
fig = px.line(x=K,y=score,
              labels={'x':'# of clusters','y':'score'},
              title = 'silhouette')

fig.update_layout(width = 1000,height = 500)
fig.show()