# Read data

In [None]:
import pandas as pd

movies = pd.read_csv('data/movies_meta_data.csv', sep=';', engine='python')
users = pd.read_csv('data/users.dat', sep='::', engine='python', names=['userId', 'gender', 'age', 'occupation', 'zip-code'])
ratings = pd.read_csv('data/ratings.dat', sep='::', engine='python', names=['userId', 'movieId', 'rating', 'timestamp'])

In [None]:
users.groupby('gender').size().plot(kind='pie', y=0, figsize=(10, 10), autopct='%1.1f%%', title='Gender distribution')




In [None]:
ratings.groupby('userId').size()

In [None]:
ratings.groupby('userId').size().plot(kind='hist', bins=100, figsize=(5, 5), title='Number of ratings per user', xlabel='Number of ratings', ylabel='Number of users')

# Cleaning

As we don't plan on doing anything with the zip code of the user we will drop this feature.

In [None]:
users_dropped = users.drop(['zip-code'], axis=1)

Because the timestamp adds no value to our model we are going to remove the timestamp from all ratings.

In [None]:
ratings_dropped = ratings.drop('timestamp', axis=1)

In [None]:
movies_dropped = movies[['ml_movieId', 'Title', 'Year', 'Released', 'Runtime', 'Genre', 'Ratings', 'Metascore', 'imdbRating', 'imdbVotes', 'BoxOffice']]
movies_dropped.head()

In [None]:
import re

# Fill missing 'imdbRating' values with the mean
movies_dropped['imdbRating'].fillna(movies_dropped['imdbRating'].mean(), inplace=True)

# Round the 'imdbRating' values to one decimal place
movies_dropped['imdbRating'] = movies_dropped['imdbRating'].round(decimals=1)

# Convert the 'imdbVotes' column to float type using regular expressions
movies_dropped['imdbVotes'] = movies_dropped['imdbVotes'].apply(lambda x: float(re.sub(r'[^\d.]', '', x)) if isinstance(x, str) else x)

# Fill in missing values with mean
movies_dropped['imdbVotes'].fillna(movies_dropped['imdbVotes'].mean(), inplace=True)

# Round the 'imdbVotes' values to zero decimal places
movies_dropped['imdbVotes'] = movies_dropped['imdbVotes'].round(decimals=0)

# Convert the 'BoxOffice' column to float type using regular expressions
movies_dropped['BoxOffice'] = movies_dropped['BoxOffice'].apply(lambda x: float(re.sub(r'[^\d.]', '', x)) if isinstance(x, str) else x)

# Fill in missing values with mean
movies_dropped['BoxOffice'].fillna(movies_dropped['BoxOffice'].mean(), inplace=True)

# Round the 'BoxOffice' values to zero decimal places
movies_dropped['BoxOffice'] = movies_dropped['BoxOffice'].round(decimals=0)

# Convert the 'Runtime' column from minutes to hours
movies_dropped['Runtime'] = movies_dropped['Runtime'].apply(lambda x: int(re.sub(r'\D', '', x)) / 60 if isinstance(x, str) else x)

# Fill in missing values with mean
movies_dropped['Runtime'].fillna(movies_dropped['Runtime'].mean(), inplace=True)

In [None]:
# Drop rows with missing values by selecting only rows without NaN values in the 'Released' column
movies_dropped = movies_dropped.loc[~pd.isna(movies_dropped['Released'])]
movies_dropped['Released'].isna().sum()

In [None]:
# Drop the rows with missing values in the 'Genre' column
movies_dropped.dropna(subset=['Genre'], inplace=True)
movies_dropped['Genre'].isna().sum()

# Transformation

In [None]:
# Define a dictionary to map numeric occupation values to text labels
occupation_labels = {
    0: "other or not specified",
    1: "academic/educator",
    2: "artist",
    3: "clerical/admin",
    4: "college/grad student",
    5: "customer service",
    6: "doctor/health care",
    7: "executive/managerial",
    8: "farmer",
    9: "homemaker",
    10: "K-12 student",
    11: "lawyer",
    12: "programmer",
    13: "retired",
    14: "sales/marketing",
    15: "scientist",
    16: "self-employed",
    17: "technician/engineer",
    18: "tradesman/craftsman",
    19: "unemployed",
    20: "writer",
}

# Replace the numeric occupation values with text labels using the map method
users_dropped["occupation"] = users["occupation"].map(occupation_labels)

In [None]:
age_labels = {
    1: "Under 18",
    18: "18-24",
    25: "25-34",
    35: "35-44",
    45: "45-49",
    50: "50-55",
    56: "56+",
}

# Replace the numeric age values with text labels using the map method
users_dropped["age"] = users["age"].map(age_labels)

In [None]:
users_dropped.head()

# Merge

In [None]:
# Rename the 'ml_movieId' column to 'movieId'
movies_dropped.rename(columns={'ml_movieId': 'movieId'}, inplace=True)

In [None]:
merged = movies_dropped.merge(ratings_dropped, on='movieId', how='left')
# merged['userId'] = merged['userId'].astype('int32')
merged.isna().sum()
# merged.head()

In [None]:
merged.shape

# SVD approach with Surprise

In [None]:
from surprise import Reader, Dataset, SVD, accuracy, KNNBasic
from surprise.model_selection import train_test_split

model = SVD()
data = Dataset.load_from_df(ratings_dropped, Reader(rating_scale=(1, 5)))
trainset, testset = train_test_split(data, test_size=.25)

model.fit(trainset)
predictions = model.test(testset)

accuracy.rmse(predictions)

### Exporting the model

In [362]:
from surprise import dump

# Dump algorithm and reload it.
file_name = 'models/SVD_Model'
dump.dump(file_name, algo=model)
_, loaded_algo = dump.load(file_name)

FileNotFoundError: [Errno 2] No such file or directory: '/models/SVD_Model'

### Handling a new user

In [None]:

new_user_id = 9999
new_user_movies = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
new_user_ratings = [5, 4, 3, 2, 1, 5, 4, 3, 2, 1]

new_user = pd.DataFrame({'userId': new_user_id, 'movieId': new_user_movies, 'rating': new_user_ratings})
ratings_combined = pd.concat([ratings_dropped, new_user], ignore_index=True)

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(ratings_combined, reader)

# Retrain the model with the combined ratings data
model = SVD()
trainset = data.build_full_trainset()
model.fit(trainset)

model.predict(uid=new_user_id, iid=3)


# Recommendation with pearson similarity matrix

In [None]:
pivot = merged.pivot_table(index=['userId'], columns=['Title'], values='rating')
pivot.head()

In [None]:
pivot = pivot.dropna(thresh=10, axis=1).fillna(0)
pivot.head()

In [None]:
similarity_df = pivot.corr(method='pearson')
similarity_df.head()
similarity_df.to_csv('data/similarity.csv')

In [None]:
def get_similar_movies(title, user_rating):
    similar_score = similarity_df[title]*(user_rating-2.5)
    similar_score = similar_score.sort_values(ascending=False)
    
    return similar_score

In [None]:
action_lover = [("Jurassic Park", 5), ("The Lost World: Jurassic Park", 5), ('Titanic', 3), ('Forrest Gump', 5)]
similar_movies = pd.DataFrame()

for movie, rating in action_lover:
    similar_movies = similar_movies.append(get_similar_movies(movie, rating), ignore_index=True)

similar_movies.head()
similar_movies.sum().sort_values(ascending=False).head(20)