# Read data

In [None]:
import pandas as pd

# Movie metadata: ';'-separated CSV whose column names come from the file itself.
movies = pd.read_csv(
    'data/movies_meta_data.csv',
    sep=';',
    engine='python',
)

# Users and ratings: '::'-separated files. The multi-character separator needs
# the python parser engine, and the column names are supplied explicitly.
users = pd.read_csv(
    'data/users.dat',
    sep='::',
    engine='python',
    names=['userId', 'gender', 'age', 'occupation', 'zip-code'],
)
ratings = pd.read_csv(
    'data/ratings.dat',
    sep='::',
    engine='python',
    names=['userId', 'movieId', 'rating', 'timestamp'],
)

# Cleaning

In [None]:
# Keep only the metadata columns used by the rest of the notebook.
# .copy() makes movies_dropped an independent frame, so the cleaning steps that
# modify it later do not raise SettingWithCopyWarning or touch `movies`.
movies_dropped = movies[['ml_movieId', 'Title', 'Year', 'Released', 'Runtime', 'Genre', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'BoxOffice']].copy()
movies_dropped.head()

In [None]:
import re


def _strip_to_float(value):
    """Parse a formatted numeric string into a float.

    Every character other than a digit or '.' is removed before parsing.
    Non-string values (already numeric, or missing) are returned unchanged.
    A string left empty after stripping yields NaN instead of raising.
    """
    if not isinstance(value, str):
        return value
    digits = re.sub(r'[^\d.]', '', value)
    return float(digits) if digits else float('nan')


def _runtime_to_hours(value):
    """Convert a runtime string to hours by dividing its digits by 60.

    Non-string values are returned unchanged (they are NOT divided by 60).
    A string without any digit yields NaN instead of raising.
    """
    if not isinstance(value, str):
        return value
    digits = re.sub(r'\D', '', value)
    return int(digits) / 60 if digits else float('nan')


def _fill_with_mean(series):
    """Return `series` with its missing values replaced by the series mean."""
    return series.fillna(series.mean())


# Build the cleaned columns and attach them with .assign(), which returns a new
# frame. The previous `df[col].fillna(..., inplace=True)` pattern modifies a
# temporary Series: it warns on recent pandas and stops updating the frame once
# Copy-on-Write is enabled, leaving the NaNs in place.
movies_dropped = movies_dropped.assign(
    # Mean-impute, then round to one decimal place.
    imdbRating=_fill_with_mean(movies_dropped['imdbRating']).round(decimals=1),
    # Parse to float, mean-impute, then round to whole numbers.
    imdbVotes=_fill_with_mean(movies_dropped['imdbVotes'].apply(_strip_to_float)).round(decimals=0),
    BoxOffice=_fill_with_mean(movies_dropped['BoxOffice'].apply(_strip_to_float)).round(decimals=0),
    # Runtime strings are converted to hours, then mean-imputed (no rounding).
    Runtime=_fill_with_mean(movies_dropped['Runtime'].apply(_runtime_to_hours)),
)

In [None]:
# Drop the rows with missing values in the 'Released' column.
# A single vectorized dropna replaces the row-by-row loop, which mutated the
# frame while iterating over it and re-built it once per dropped row.
movies_dropped = movies_dropped.dropna(subset=['Released'])

In [None]:
# Drop the rows with missing values in the 'Genre' column.
# Rebinding the result (instead of inplace=True) avoids SettingWithCopyWarning
# when movies_dropped was derived from a slice of another frame.
movies_dropped = movies_dropped.dropna(subset=['Genre'])
movies_dropped.head()

# Transformation

In [None]:
# Text label for each numeric occupation code used in users['occupation'].
occupation_labels = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}

# Replace the numeric codes with text labels. The guard makes the cell safe to
# re-run: once the column holds labels, mapping it again would look the labels
# up as keys, find none, and turn the whole column into NaN.
if pd.api.types.is_numeric_dtype(users['occupation']):
    users['occupation'] = users['occupation'].map(occupation_labels)

In [None]:
# Create dummy variables for the genres: the 'Genre' string is split on ', '
# and each distinct genre becomes its own 0/1 indicator column.
movies_dropped = movies_dropped.join(movies_dropped['Genre'].str.get_dummies(sep=', '))

# Show the frame that was just extended (the raw `movies` frame has no dummy columns).
movies_dropped.head()

In [None]:
# Remove the original 'Genre' text column now that it has been one-hot encoded.
# errors='ignore' keeps the cell re-runnable after the column is already gone.
movies_dropped = movies_dropped.drop(columns=['Genre'], errors='ignore')

# Merge

In [None]:
# Attach the movie metadata to every rating. The inner join keeps only ratings
# whose movieId matches an ml_movieId in movies_dropped.
merged = movies_dropped.merge(
    ratings,
    how='inner',
    left_on='ml_movieId',
    right_on='movieId',
)
merged.head()

In [None]:
# List the column labels of the merged frame.
merged.columns

In [None]:
# Drop the rating timestamp and the duplicate join key. DataFrame.drop returns
# a new frame, so the result must be assigned back; previously it was discarded
# and both columns stayed in `merged`.
# errors='ignore' keeps the cell re-runnable once the columns are gone.
merged = merged.drop(columns=['timestamp', 'movieId'], errors='ignore')
merged.head()

# Create Pivot table

In [None]:
# User x title matrix of ratings: one row per userId, one column per Title.
# Cells with several ratings are averaged (the pivot_table default), and
# combinations with no rating are left as NaN.
pivot = pd.pivot_table(
    merged,
    values='rating',
    index=['userId'],
    columns=['Title'],
)
pivot.head()

In [None]:
# Keep only the title columns holding at least 10 non-missing ratings.
pivot = pivot.dropna(axis='columns', thresh=10)

# Treat every remaining missing rating as 0.
pivot = pivot.fillna(0)
pivot.head()

In [None]:
# Pairwise Pearson correlation between the columns of `pivot`; the result is a
# square frame indexed and labelled by those column names.
similarity_df = pivot.corr(method='pearson')
similarity_df.head()

In [None]:
# The ten highest correlations in the 'Jurassic Park' row, strongest first.
similarity_df.loc['Jurassic Park'].sort_values(ascending=False).iloc[:10]

In [None]:
def get_similar_movies(title, user_rating, neutral_rating=2.5):
    """Score every movie by its similarity to `title`, weighted by the user's rating.

    The similarity column for `title` is read from the notebook-level
    `similarity_df` and multiplied by `user_rating - neutral_rating`, so a
    rating above the neutral point yields positive scores for positively
    correlated movies and a rating below it flips the sign.

    Parameters
    ----------
    title : label of a column in `similarity_df`.
    user_rating : numeric rating the user gave to `title`.
    neutral_rating : rating treated as neither liked nor disliked. Defaults to
        2.5, the value that was previously hard-coded.

    Returns
    -------
    pandas.Series
        Weighted scores indexed like `similarity_df`, sorted in descending order.

    Raises
    ------
    KeyError
        If `title` is not a column of `similarity_df`.
    """
    if title not in similarity_df.columns:
        raise KeyError(f"{title!r} is not a column of similarity_df")

    weight = user_rating - neutral_rating
    return (similarity_df[title] * weight).sort_values(ascending=False)

In [None]:
# Example user profile as (title, rating) pairs.
action_lover = [("Jurassic Park", 5), ("The Lost World: Jurassic Park", 5), ('Titanic', 3), ('Forrest Gump', 5)]

# Compute one score Series per rated movie, then build the frame in one step.
# DataFrame.append was removed in pandas 2.0, and growing a frame row by row
# re-copies it on every iteration.
score_rows = [get_similar_movies(movie, rating) for movie, rating in action_lover]

# One row per rated movie, one column per candidate title. reset_index gives the
# same 0..n-1 row labels that append(..., ignore_index=True) produced.
similar_movies = pd.DataFrame(score_rows).reset_index(drop=True)

# Sum the weighted scores per title and show the 20 strongest recommendations.
# Only the last expression of a cell is rendered, so the former mid-cell
# `similar_movies.head()` has been removed.
similar_movies.sum().sort_values(ascending=False).head(20)