# Cleaning and Preparing data

**https://pandas.pydata.org/docs/user_guide/index.html#user-guide**

In [148]:
import csv

import pandas as pd

**Load datasets**

In [152]:
def load_imdb_tsv(path):
    """Read one IMDb TSV dump into a DataFrame.

    Only the ``\\N`` marker and empty fields are treated as missing. pandas'
    default NA strings are switched off so that literal text such as "None",
    "NA" or "null" is kept as a string instead of being turned into NaN.
    Quote handling is disabled as well, so a double quote inside a field is
    read as an ordinary character rather than as the start of a quoted field.

    Parameters
    ----------
    path : str
        Location of the tab-separated file.

    Returns
    -------
    pandas.DataFrame
        The parsed table, one column per header field.
    """
    return pd.read_csv(
        path,
        sep="\t",
        low_memory=False,
        na_values=["\\N", ""],
        keep_default_na=False,
        quoting=csv.QUOTE_NONE,
    )


name_basics = load_imdb_tsv("name.basics.tsv")
title_basics = load_imdb_tsv("title.basics.tsv")
title_ratings = load_imdb_tsv("title.ratings.tsv")

**Debugging**

In [154]:
# Debug aid: show the column labels pandas parsed for name_basics.
print(name_basics.columns)

Index(['nconst', 'primaryName', 'birthYear', 'deathYear', 'primaryProfession',
       'knownForTitles'],
      dtype='object')


In [156]:
# Remove leading/trailing whitespace from every column label of name_basics.
name_basics.columns = name_basics.columns.map(str.strip)

**Cleaning the data name.basics**

Keep only the useful columns

In [158]:
# Keep only the three columns listed below; every other column is dropped.
name_columns_to_keep = ["nconst", "primaryName", "knownForTitles"]
name_basics = name_basics[name_columns_to_keep]
name_basics.head()

Unnamed: 0,nconst,primaryName,knownForTitles
0,nm0000001,Fred Astaire,"tt0072308,tt0050419,tt0027125,tt0031983"
1,nm0000002,Lauren Bacall,"tt0037382,tt0075213,tt0038355,tt0117057"
2,nm0000003,Brigitte Bardot,"tt0057345,tt0049189,tt0056404,tt0054452"
3,nm0000004,John Belushi,"tt0072562,tt0077975,tt0080455,tt0078723"
4,nm0000005,Ingmar Bergman,"tt0050986,tt0069467,tt0050976,tt0083922"


**Cleaning the data title.basics**

In [168]:
# Keep only rows whose titleType is "movie", restricted to the four listed columns.
is_movie = title_basics["titleType"] == "movie"
title_basics = title_basics.loc[is_movie, ["tconst", "primaryTitle", "titleType", "genres"]]
title_basics.head()

Unnamed: 0,tconst,primaryTitle,titleType,genres
8,tt0000009,Miss Jerry,movie,Romance
144,tt0000147,The Corbett-Fitzsimmons Fight,movie,"Documentary,News,Sport"
498,tt0000502,Bohemios,movie,
570,tt0000574,The Story of the Kelly Gang,movie,"Action,Adventure,Biography"
587,tt0000591,The Prodigal Son,movie,Drama


**Cleaning the data title.ratings**

In [170]:
# Keep only the join key and the rating; all rows are retained.
title_ratings = title_ratings.loc[:, ["tconst", "averageRating"]]
title_ratings.head()

Unnamed: 0,tconst,averageRating
0,tt0000001,5.7
1,tt0000002,5.5
2,tt0000003,6.4
3,tt0000004,5.3
4,tt0000005,6.2


**Merging Datasets title_basics with title_ratings**

In [172]:
# Left join on tconst: every row of title_basics is kept, and averageRating is
# NaN where title_ratings has no matching tconst.
movies_df = title_basics.merge(title_ratings, how="left", on="tconst")
movies_df.head()

Unnamed: 0,tconst,primaryTitle,titleType,genres,averageRating
0,tt0000009,Miss Jerry,movie,Romance,5.4
1,tt0000147,The Corbett-Fitzsimmons Fight,movie,"Documentary,News,Sport",5.3
2,tt0000502,Bohemios,movie,,4.0
3,tt0000574,The Story of the Kelly Gang,movie,"Action,Adventure,Biography",6.0
4,tt0000591,The Prodigal Son,movie,Drama,5.6


**Merging with name_basics (Actors and Directors)**

Explode knownForTitles column to match tconst

Merge actors with movies

Drop knownForTitles column (as we already merged it)

In [174]:
# Replace missing knownForTitles with "" so that .str.split always yields a list.
# `assign` builds a new frame instead of writing a column into `name_basics`,
# which is itself a column subset of the loaded table; writing into such a
# subset in place can raise pandas' SettingWithCopyWarning.
name_basics = name_basics.assign(
    knownForTitles=name_basics["knownForTitles"].fillna("")
)

# Split the comma-separated title ids and explode them to one row per
# (person, title id) pair so they can be matched against tconst.
name_basics_expanded = (
    name_basics
    .assign(knownForTitles=name_basics["knownForTitles"].str.split(","))
    .explode("knownForTitles")
)

# Left join keeps every movie row; nconst/primaryName are NaN where no person
# lists the movie. The right-hand key duplicates tconst after the join, so it
# is dropped (without inplace=True, so the cell produces a fresh final_df).
final_df = pd.merge(
    movies_df,
    name_basics_expanded,
    left_on="tconst",
    right_on="knownForTitles",
    how="left",
).drop(columns=["knownForTitles"])

final_df.head()

Unnamed: 0,tconst,primaryTitle,titleType,genres,averageRating,nconst,primaryName
0,tt0000009,Miss Jerry,movie,Romance,5.4,nm0063086,Blanche Bayliss
1,tt0000009,Miss Jerry,movie,Romance,5.4,nm0085156,Alexander Black
2,tt0000009,Miss Jerry,movie,Romance,5.4,nm0183823,William Courtenay
3,tt0000009,Miss Jerry,movie,Romance,5.4,nm1309758,Chauncey Depew
4,tt0000147,The Corbett-Fitzsimmons Fight,movie,"Documentary,News,Sport",5.3,nm0280615,Bob Fitzsimmons


**Saving the merged dataset**

In [176]:
# Write the merged frame to disk as UTF-8 CSV, without the row index column.
final_df.to_csv(
    path_or_buf="merged_imdb_data.csv",
    encoding="utf-8",
    index=False,
)

In [178]:
import numpy as np

In [180]:
# Reload merged_imdb_data.csv. pandas' default NA strings are switched off so
# that text values such as "None", "NA" or "null" stay strings; only empty
# fields (how DataFrame.to_csv writes missing values by default) become NaN.
df = pd.read_csv(
    "merged_imdb_data.csv",
    encoding="utf-8",
    keep_default_na=False,
    na_values=[""],
)

**Handling missing values**

`TODO:` Handle missing ratings dynamically during recommendations

In [181]:
# Replace missing genres and person names with the placeholder "Unknown".
text_columns = ["genres", "primaryName"]
df[text_columns] = df[text_columns].fillna("Unknown")

# DataFrame.info() prints its report itself and returns None, so it must not be
# wrapped in print() (that emits a stray "None" after the report).
df.info()

# Last expression of the cell: rendered with the notebook's rich table display.
df.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9772826 entries, 0 to 9772825
Data columns (total 7 columns):
 #   Column         Dtype  
---  ------         -----  
 0   tconst         object 
 1   primaryTitle   object 
 2   titleType      object 
 3   genres         object 
 4   averageRating  float64
 5   nconst         object 
 6   primaryName    object 
dtypes: float64(1), object(6)
memory usage: 521.9+ MB
None
      tconst                   primaryTitle titleType                  genres  \
0  tt0000009                     Miss Jerry     movie                 Romance   
1  tt0000009                     Miss Jerry     movie                 Romance   
2  tt0000009                     Miss Jerry     movie                 Romance   
3  tt0000009                     Miss Jerry     movie                 Romance   
4  tt0000147  The Corbett-Fitzsimmons Fight     movie  Documentary,News,Sport   

   averageRating     nconst        primaryName  
0            5.4  nm0063086    Blanche Ba

**Display missing data summary**

In [184]:
# Count the missing values in each column of df and print the totals.
print(df.isna().sum())

tconst                 0
primaryTitle          12
titleType              0
genres                 0
averageRating    2105625
nconst             96975
primaryName            0
dtype: int64


**Checking for special characters in names**

In [186]:
# Print the first 50 primaryName values for a visual check.
print(df["primaryName"].iloc[:50])

0                    Blanche Bayliss
1                    Alexander Black
2                  William Courtenay
3                     Chauncey Depew
4                    Bob Fitzsimmons
5                    Enoch J. Rector
6                   John L. Sullivan
7                       George Siler
8                       Billy Madden
9                   Antonio del Pozo
10                       El Mochuelo
11                  Guillermo Perrín
12                      Godfrey Cass
13                        Bella Cola
14                        Sam Crewes
15                       W.A. Gibson
16                   Millard Johnson
17                       Vera Linden
18                       Frank Mills
19                       Orrie Perry
20                         Reg Perry
21                      Charles Tait
22                    Elizabeth Tait
23                         John Tait
24                        Nevin Tait
25                      Ollie Wilson
26                 Nicholas Brierley
2

## Content-Based Filtering (Finding Similar Movies by Genre)

In [128]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity