# Cleaning and Preparing data

**https://pandas.pydata.org/docs/user_guide/index.html#user-guide**

In [None]:
import csv

import pandas as pd

**Load datasets**

In [None]:
# Read the three IMDb TSV dumps with identical parser settings.
#   * quoting=csv.QUOTE_NONE: the files are plain tab-separated text with no
#     quoting, so a `"` inside a name or title must be kept as data. With the
#     default quoting, a field that starts with `"` swallows the following tabs
#     and newlines until the next `"`, and quotes around text are stripped.
#   * keep_default_na=False with explicit na_values: only the `\N` marker and
#     empty fields count as missing. pandas' default NA list would also turn
#     literal text such as "NA", "None" or "null" into NaN.
#   * low_memory=False: parse each file in one pass (kept from the original).
IMDB_READ_OPTIONS = dict(
    sep="\t",
    low_memory=False,
    quoting=csv.QUOTE_NONE,
    keep_default_na=False,
    na_values=["\\N", ""],
)

name_basics = pd.read_csv("name.basics.tsv", **IMDB_READ_OPTIONS)
title_basics = pd.read_csv("title.basics.tsv", **IMDB_READ_OPTIONS)
title_ratings = pd.read_csv("title.ratings.tsv", **IMDB_READ_OPTIONS)

**Debugging**

In [None]:
# Debug aid: dump the header labels of name_basics exactly as pandas parsed them.
raw_header = name_basics.columns
print(raw_header)

In [None]:
# Trim leading/trailing whitespace from every column label of name_basics.
name_basics.columns = [label.strip() for label in name_basics.columns]

**Cleaning the data name.basics**

Keep only the useful columns.

In [None]:
# Columns of name.basics that the later cells of this notebook read.
kept_name_columns = ["nconst", "primaryName", "knownForTitles"]
name_basics = name_basics[kept_name_columns]
name_basics.head()

**Cleaning the data title.basics**

In [None]:
# Keep only rows whose titleType is exactly "movie", and only the four columns
# listed below, in a single .loc selection.
is_movie = title_basics["titleType"].eq("movie")
title_basics = title_basics.loc[is_movie, ["tconst", "primaryTitle", "titleType", "genres"]]
title_basics.head()

**Cleaning the data title.ratings**

In [None]:
# Reduce the ratings table to the join key and the rating itself.
title_ratings = title_ratings.loc[:, ["tconst", "averageRating"]]
title_ratings.head()

**Merging Datasets title_basics with title_ratings**

In [None]:
# Left join on tconst: every row of title_basics is kept, and averageRating is
# NaN where title_ratings has no matching tconst.
movies_df = title_basics.merge(title_ratings, how="left", on="tconst")
movies_df.head()

**Merging with name_basics (Actors and Directors)**

Explode knownForTitles column to match tconst

Merge actors with movies

Drop knownForTitles column (as we already merged it)

In [None]:
# Replace missing knownForTitles with "" so that .str.split always yields a list.
# assign() builds a new frame instead of writing a column into the column-subset
# frame created earlier, which is what triggers pandas' SettingWithCopyWarning.
name_basics = name_basics.assign(
    knownForTitles=name_basics["knownForTitles"].fillna("")
)

# One row per (person, title id): split the comma-separated ids into a list,
# then explode the list into separate rows.
name_basics_expanded = (
    name_basics
    .assign(knownForTitles=name_basics["knownForTitles"].str.split(","))
    .explode("knownForTitles")
)

# Left join keeps every movie row; movies with no matching person get NaN in the
# name columns. knownForTitles duplicates tconst after the join, so it is dropped
# (without inplace=True, so re-running the cell cannot fail on a missing column).
final_df = (
    movies_df
    .merge(name_basics_expanded, left_on="tconst", right_on="knownForTitles", how="left")
    .drop(columns=["knownForTitles"])
)

final_df.head()

**Saving the merged dataset**

In [None]:
# Persist the merged table as a UTF-8 CSV; index=False leaves the row index out
# of the file.
final_df.to_csv(
    "merged_imdb_data.csv",
    encoding="utf-8",
    index=False,
)

In [None]:
import numpy as np

In [None]:
# Reload the merged table from disk. Only empty fields are treated as missing
# (to_csv writes NaN as an empty field by default). pandas' default NA list
# would also convert literal text such as "NA", "None" or "null" into NaN.
movie_dataset = pd.read_csv(
    "merged_imdb_data.csv",
    encoding="utf-8",
    keep_default_na=False,
    na_values=[""],
)

**Handling missing values**

`TODO:` Handle missing ratings dynamically during recommendations.

In [None]:
# Fill missing genres and person names with the "Unknown" placeholder.
# averageRating is left as NaN on purpose (see the TODO above this cell).
movie_dataset = movie_dataset.fillna({"genres": "Unknown", "primaryName": "Unknown"})

# info() prints its report itself and returns None, so wrapping it in print()
# would add a stray "None" line to the output.
movie_dataset.info()

# Last expression of the cell: shown with the notebook's rich table display.
movie_dataset.head()

**Display missing data summary**

In [None]:
# Count of missing values in each column of movie_dataset.
print(movie_dataset.isna().sum())

**Checking for special characters in names**

In [None]:
# Show the first 50 primaryName values for a manual look at the characters used.
name_column = movie_dataset["primaryName"]
print(name_column.head(50))