In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestNeighbors
from sklearn.cluster import KMeans
from sklearn import metrics
from sklearn.metrics  import silhouette_score
from scipy.spatial.distance import cdist
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go

In [2]:
basics_df = pd.read_csv("https://datasets.imdbws.com/title.basics.tsv.gz", sep="\t", low_memory=False)

In [3]:
basics_df =  basics_df.drop('endYear',axis=1)

In [4]:
movies = basics_df.loc[basics_df['titleType']=='movie']

In [5]:
movies = movies.drop('titleType',axis=1)

In [6]:
movies_ok = movies.loc[movies['isAdult']=='0']
movies_ok=movies_ok.drop('isAdult',axis=1)

In [7]:
movies_ok_nat = movies_ok.replace('\\N', pd.NaT)
movies_clean = movies_ok_nat.dropna()

In [8]:
movies_clean = movies_clean.astype({'runtimeMinutes': int})
movies_clean = movies_clean.astype({'startYear': int})

In [9]:
movies_clean = movies_clean[(movies_clean['runtimeMinutes'] >= 58) & (movies_clean['runtimeMinutes'] <= 270)]

In [10]:
movies_clean = movies_clean[(movies_clean['startYear'] >= 1918) & (movies_clean['startYear'] <= 2021)]

In [11]:
ratings_df = pd.read_csv("https://datasets.imdbws.com/title.ratings.tsv.gz", sep="\t")

In [12]:
movies = pd.merge(movies_clean,ratings_df,how='inner')

In [13]:
movies_7 = movies.loc[movies['averageRating']>=7.0]

In [14]:
movies_7_rating = movies_7[movies_7['numVotes'] >= 1000]


In [15]:
movies_7_rating

Unnamed: 0,tconst,primaryTitle,originalTitle,startYear,runtimeMinutes,genres,averageRating,numVotes
11,tt0008879,The Outlaw and His Wife,Berg-Ejvind och hans hustru,1918,136,Drama,7.2,1784
82,tt0009893,The Oyster Princess,Die Austernprinzessin,1919,60,Comedy,7.2,1817
88,tt0009937,Blind Husbands,Blind Husbands,1919,99,"Drama,Romance",7.0,1470
91,tt0009968,Broken Blossoms,Broken Blossoms or The Yellow Man and the Girl,1919,90,"Drama,Romance",7.3,10029
124,tt0010247,Sir Arne's Treasure,Herr Arnes pengar,1919,122,"Drama,History",7.2,1357
...,...,...,...,...,...,...,...,...
223058,tt9882084,Chasing Happiness,Chasing Happiness,2019,96,"Biography,Documentary,Music",7.8,2138
223066,tt9886872,Munthiri Monchan,Munthiri Monchan,2019,130,"Comedy,Romance",7.8,1345
223130,tt9900782,Kaithi,Kaithi,2019,145,"Action,Crime,Thriller",8.5,19014
223131,tt9902160,Herself,Herself,2020,97,Drama,7.0,2854


In [16]:
movie_genre = movies_7_rating[['genres']]

In [17]:
movie_genre[['genre1', 'genre2','genre3']] = movie_genre['genres'].str.split(',', n=2, expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self[k1] = value[k2]


In [18]:
movie_genre.drop(columns='genres',axis=1,inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(


In [19]:
movie_genre = pd.concat([movie_genre , movie_genre['genre1'].str.get_dummies(), movie_genre['genre2'].str.get_dummies(), movie_genre['genre3'].str.get_dummies()], 
          axis = 1)

In [20]:
movie_genre.drop(['genre1','genre2','genre3'],axis=1,inplace=True)

In [21]:
movie_for_ml = pd.concat([movies_7_rating, movie_genre.groupby(lambda x:x, axis=1).sum()], axis = 1)

In [22]:
movie_for_ml.drop(columns=['runtimeMinutes','genres','averageRating','numVotes'],axis =1,inplace=True)

In [23]:
movie_for_ml.drop(columns='primaryTitle',axis=1,inplace=True)

In [24]:
movie_for_ml

Unnamed: 0,tconst,originalTitle,startYear,Action,Adventure,Animation,Biography,Comedy,Crime,Documentary,...,Music,Musical,Mystery,News,Romance,Sci-Fi,Sport,Thriller,War,Western
11,tt0008879,Berg-Ejvind och hans hustru,1918,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
82,tt0009893,Die Austernprinzessin,1919,0,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
88,tt0009937,Blind Husbands,1919,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
91,tt0009968,Broken Blossoms or The Yellow Man and the Girl,1919,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
124,tt0010247,Herr Arnes pengar,1919,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
223058,tt9882084,Chasing Happiness,2019,0,0,0,1,0,0,1,...,1,0,0,0,0,0,0,0,0,0
223066,tt9886872,Munthiri Monchan,2019,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
223130,tt9900782,Kaithi,2019,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
223131,tt9902160,Herself,2020,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [27]:
movie_for_ml.to_csv('../data/movies_for_ml.csv.zip', index=False, compression='zip')