# Cleaning and Preparing data

**https://pandas.pydata.org/docs/user_guide/index.html#user-guide**

In [7]:
import pandas as pd

**Load datasets**

In [73]:
# Load the three tab-separated IMDb dumps.
# na_values="\\N" makes pandas treat the literal two-character marker \N as missing;
# low_memory=False parses each file in one pass instead of in internal chunks.
name_basics = pd.read_csv("name.basics.tsv", sep="\t", low_memory=False, na_values="\\N")
title_basics = pd.read_csv("title.basics.tsv", sep="\t", low_memory=False, na_values="\\N")
title_ratings = pd.read_csv("title.ratings.tsv", sep="\t", low_memory=False, na_values="\\N")

**Debugging**

In [56]:
# Show the column labels parsed for name_basics (DataFrame.keys() returns the columns Index).
print(name_basics.keys())

Index(['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession',
       'knownForTitles'],
      dtype='object')


In [59]:
# Remove leading/trailing whitespace from every column label.
name_basics.columns = [label.strip() for label in name_basics.columns]

**Cleaning the data name.basics**

Keep only the useful columns.

In [61]:
# Keep the person id, the display name and the comma-separated title ids;
# the remaining columns are dropped.
name_basics = name_basics[
    [
        "nconst",
        "primaryName",
        "knownForTitles",
    ]
]
name_basics.head()

Unnamed: 0,nconst,primaryName,knownForTitles
0,nm0000001,Fred Astaire,"tt0072308,tt0050419,tt0027125,tt0031983"
1,nm0000002,Lauren Bacall,"tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,"tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,"tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,"tt0050986,tt0069467,tt0050976,tt0083922"


**Cleaning the data title.basics**

In [63]:
# Keep the title id, the primary title and the comma-separated genres.
title_basics = title_basics[
    [
        "tconst",
        "primaryTitle",
        "genres",
    ]
]
title_basics.head()

Unnamed: 0,tconst,primaryTitle,genres
0,tt0000001,Carmencita,"Documentary,Short"
1,tt0000002,Le clown et ses chiens,"Animation,Short"
2,tt0000003,Poor Pierrot,"Animation,Comedy,Romance"
3,tt0000004,Un bon bock,"Animation,Short"
4,tt0000005,Blacksmith Scene,Short


**Cleaning the data title.ratings**

In [65]:
# Keep the title id and its average rating.
title_ratings = title_ratings[
    [
        "tconst",
        "averageRating",
    ]
]
title_ratings.head()

Unnamed: 0,tconst,averageRating
0,tt0000001,5.7
1,tt0000002,5.5
2,tt0000003,6.4
3,tt0000004,5.3
4,tt0000005,6.2


**Merging Datasets title_basics with title_ratings**

In [67]:
# Left join on tconst: every row of title_basics is kept, and averageRating
# is NaN where title_ratings has no matching tconst.
movies_df = title_basics.merge(
    title_ratings,
    how="left",
    on="tconst",
)
movies_df.head()

Unnamed: 0,tconst,primaryTitle,genres,averageRating
0,tt0000001,Carmencita,"Documentary,Short",5.7
1,tt0000002,Le clown et ses chiens,"Animation,Short",5.5
2,tt0000003,Poor Pierrot,"Animation,Comedy,Romance",6.4
3,tt0000004,Un bon bock,"Animation,Short",5.3
4,tt0000005,Blacksmith Scene,Short,6.2


**Merging with name_basics (Actors and Directors)**

Explode knownForTitles column to match tconst

Merge actors with movies

Drop knownForTitles column (as we already merged it)

In [69]:
# Replace missing title lists with "" so that .str.split(",") yields a list for every row.
# .assign builds a new frame instead of writing a column into name_basics in place;
# name_basics is a column slice of the loaded frame, and an in-place column write
# on such a slice triggers pandas' SettingWithCopyWarning.
name_basics = name_basics.assign(
    knownForTitles=name_basics["knownForTitles"].fillna("")
)

# One row per (person, title id): split the comma-separated ids into lists,
# then explode each list element into its own row.
name_basics_expanded = name_basics.assign(
    knownForTitles=name_basics["knownForTitles"].str.split(",")
).explode("knownForTitles")

# Left join keeps every row of movies_df; rows without a matching person get
# NaN in nconst/primaryName. knownForTitles is the join key from the right side
# (matched against tconst), so it is dropped right after the merge.
# Chaining .drop avoids mutating the merged frame with inplace=True.
final_df = pd.merge(
    movies_df,
    name_basics_expanded,
    left_on="tconst",
    right_on="knownForTitles",
    how="left",
).drop(columns=["knownForTitles"])

final_df.head()

Unnamed: 0,tconst,primaryTitle,genres,averageRating,nconst,primaryName
0,tt0000001,Carmencita,"Documentary,Short",5.7,nm1588970,Carmencita
1,tt0000002,Le clown et ses chiens,"Animation,Short",5.5,,
2,tt0000003,Poor Pierrot,"Animation,Comedy,Romance",6.4,nm0721526,Émile Reynaud
3,tt0000003,Poor Pierrot,"Animation,Comedy,Romance",6.4,nm1335271,Gaston Paulin
4,tt0000003,Poor Pierrot,"Animation,Comedy,Romance",6.4,nm17045636,Louis Morin


**Saving the merged dataset**

In [106]:
# Persist the merged frame as UTF-8 CSV; index=False leaves the row index out of the file.
final_df.to_csv(
    path_or_buf="merged_imdb_data.csv",
    encoding="utf-8",
    index=False,
)

In [75]:
import numpy as np

In [100]:
# Read the merged CSV back from disk as UTF-8.
df = pd.read_csv(
    filepath_or_buffer="merged_imdb_data.csv",
    encoding="utf-8",
)

**Handling missing values**

`TODO:` Handle missing ratings dynamically during recommendations

In [110]:
# Replace missing genres and person names with the placeholder "Unknown".
# averageRating and nconst are not filled here and keep their NaN values.
df["genres"] = df["genres"].fillna("Unknown")
df["primaryName"] = df["primaryName"].fillna("Unknown")

# DataFrame.info() writes its report to stdout itself and returns None,
# so wrapping it in print() only appended a stray "None" line to the output.
df.info()
print(df.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32762392 entries, 0 to 32762391
Data columns (total 6 columns):
 #   Column         Dtype  
---  ------         -----  
 0   tconst         object 
 1   primaryTitle   object 
 2   genres         object 
 3   averageRating  float64
 4   nconst         object 
 5   primaryName    object 
dtypes: float64(1), object(5)
memory usage: 1.5+ GB
None
      tconst            primaryTitle                    genres  averageRating  \
0  tt0000001              Carmencita         Documentary,Short            5.7   
1  tt0000002  Le clown et ses chiens           Animation,Short            5.5   
2  tt0000003            Poor Pierrot  Animation,Comedy,Romance            6.4   
3  tt0000003            Poor Pierrot  Animation,Comedy,Romance            6.4   
4  tt0000003            Poor Pierrot  Animation,Comedy,Romance            6.4   

       nconst    primaryName  
0   nm1588970     Carmencita  
1         NaN        Unknown  
2   nm0721526  Émile Reyn

**Display missing data summary**

In [108]:
# Count missing values per column (isna is the method isnull aliases).
print(df.isna().sum())

tconst                  0
primaryTitle           38
genres             942642
averageRating    16995992
nconst            9431932
primaryName       9431941
dtype: int64


**Checking for special characters in names**

In [120]:
# Print the first 50 names to eyeball how accented characters survived the CSV round trip.
print(df["primaryName"].iloc[:50])

0                   Carmencita
1                      Unknown
2                Émile Reynaud
3                Gaston Paulin
4                  Louis Morin
5                 Julien Pappé
6                 Tamara Pappé
7              Claire Lissalde
8               Sophie Nicolle
9                Gaston Paulin
10                     Unknown
11                     Unknown
12              Peter Courtney
13                    Fred Ott
14             Blanche Bayliss
15             Alexander Black
16           William Courtenay
17              Chauncey Depew
18                     Unknown
19                     Grunato
20        Mrs. Auguste Lumiere
21             Auguste Lumière
22           Madeleine Koehler
23             Suzanne Lumière
24                Rose Lumière
25              Marcel Koehler
26    Jeanne-Joséphine Lumière
27                     Unknown
28              François Clerc
29                Benoît Duval
30               Louis Lumière
31               Gaston Paulin
32      

## Content-Based Filtering (Finding Similar Movies by Genre)

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity