# <p style="background-color:#003672;font-family:newtimeroman;color:#FFF9ED;font-size:150%;text-align:center;border-radius:10px 10px;">Movie Recommendation - EDA and Processing</p>

In [134]:
import pandas as pd
import numpy as np

main_color = "#003672"
three_colors_palette = ["#003672", "#943400", "#ED8B75"]
five_colors_palette = ["#003672", "#943400", "#ED8B75", "#F2DC5D", "#0E9594"]
eight_colors_palette = ["#003672", "#943400", "#ED8B75", "#25044B", "#63145B", "#993365", "#C85C6C", "#ED8B75"]

S = "\033[1m" + '\033[96m' # Used for colored strings
E = "\033[0m"

## <p style="background-color:#003672;font-family:newtimeroman;color:#FFF9ED;font-size:150%;text-align:center;border-radius:10px 10px;">Movie DataFrame</p>

In [135]:
movie_df = pd.read_csv("IMDB-Dataset\movies.csv", index_col="movieId")
movie_df.head()

Unnamed: 0_level_0,title,genres
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,Jumanji (1995),Adventure|Children|Fantasy
3,Grumpier Old Men (1995),Comedy|Romance
4,Waiting to Exhale (1995),Comedy|Drama|Romance
5,Father of the Bride Part II (1995),Comedy


In [136]:
print(S+"Seems we have a lot of unique values for the genres, we need to do some feature engineering"+E)
movie_df.genres.unique().shape[0] , movie_df.shape[0]

[1m[96mSeems we have a lot of unique values for the genres, we need to do some feature engineering[0m


(938, 10329)

## Working with genres

In [137]:
genres = []
for string in movie_df.genres.unique():
    tmp_genres = string.split("|")
    for genre in tmp_genres:
        if genre not in genres:
            genres.append(genre)

genres

['Adventure',
 'Animation',
 'Children',
 'Comedy',
 'Fantasy',
 'Romance',
 'Drama',
 'Action',
 'Crime',
 'Thriller',
 'Horror',
 'Mystery',
 'Sci-Fi',
 'IMAX',
 'War',
 'Musical',
 'Documentary',
 'Western',
 'Film-Noir',
 '(no genres listed)']

In [138]:
print(S+"IMAX isn't really a genre, but we'll keep it to see if we get more from it.")
genres.pop() # get rid of "(no genres listed)"

[1m[96mIMAX isn't really a genre, but we'll keep it to see if we get more from it.


'(no genres listed)'

In [139]:
movie_df.isna().sum()

title     0
genres    0
dtype: int64

In [140]:
movie_df = movie_df.reindex(columns=list(movie_df.columns)+genres)
movie_df.head()

Unnamed: 0_level_0,title,genres,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,...,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,,,,,,,,,...,,,,,,,,,,
2,Jumanji (1995),Adventure|Children|Fantasy,,,,,,,,,...,,,,,,,,,,
3,Grumpier Old Men (1995),Comedy|Romance,,,,,,,,,...,,,,,,,,,,
4,Waiting to Exhale (1995),Comedy|Drama|Romance,,,,,,,,,...,,,,,,,,,,
5,Father of the Bride Part II (1995),Comedy,,,,,,,,,...,,,,,,,,,,


In [141]:
ids = movie_df.index

for i, string in zip(ids, movie_df.genres):
    tmp_genres = string.split("|")
    for genre in genres:
        if genre  in tmp_genres:
            movie_df.at[i, genre] = 1
        else :
            movie_df.at[i, genre] = 0

movie_df.drop(columns="genres", axis=1, inplace=True)

In [142]:
for genre in genres:
    movie_df[genre] = pd.to_numeric(movie_df[genre], downcast="integer")

movie_df.head()

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


## Working with dates

In [143]:
movie_df.head()

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,Thriller,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1
1,Toy Story (1995),1,1,1,1,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
2,Jumanji (1995),1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0
3,Grumpier Old Men (1995),0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0
4,Waiting to Exhale (1995),0,0,0,1,0,1,1,0,0,0,0,0,0,0,0,0,0,0,0
5,Father of the Bride Part II (1995),0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0


In [144]:
movie_df["Year"] = movie_df.title.str[-5:-1]
movie_df.head()

Unnamed: 0_level_0,title,Adventure,Animation,Children,Comedy,Fantasy,Romance,Drama,Action,Crime,...,Horror,Mystery,Sci-Fi,IMAX,War,Musical,Documentary,Western,Film-Noir,Year
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,Toy Story (1995),1,1,1,1,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
2,Jumanji (1995),1,0,1,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
3,Grumpier Old Men (1995),0,0,0,1,0,1,0,0,0,...,0,0,0,0,0,0,0,0,0,1995
4,Waiting to Exhale (1995),0,0,0,1,0,1,1,0,0,...,0,0,0,0,0,0,0,0,0,1995
5,Father of the Bride Part II (1995),0,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1995


In [145]:
movie_df.drop(columns="title",axis=1, inplace=True)

In [146]:
print(S+"The only things missing are standardization and reducing memory but we'll do it after the EDA"+E)

[1m[96mThe only things missing are standardization and reducing memory but we'll do it after the EDA[0m


## <p style="background-color:#003672;font-family:newtimeroman;color:#FFF9ED;font-size:150%;text-align:center;border-radius:10px 10px;">Ratings DataFrame</p>

In [147]:
ratings_df = pd.read_csv("IMDB-Dataset\\ratings.csv")
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,16,4.0,1217897793
1,1,24,1.5,1217895807
2,1,32,4.0,1217896246
3,1,47,4.0,1217896556
4,1,50,4.0,1217896523
