In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

In [2]:
# Load the data
movies_df = pd.read_csv('movies.csv')
ratings_df = pd.read_csv('ratings.csv')

In [3]:
movies_df

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy
...,...,...,...
9737,193581,Black Butler: Book of the Atlantic (2017),Action|Animation|Comedy|Fantasy
9738,193583,No Game No Life: Zero (2017),Animation|Comedy|Fantasy
9739,193585,Flint (2017),Drama
9740,193587,Bungo Stray Dogs: Dead Apple (2018),Action|Animation


In [4]:
ratings_df

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931
...,...,...,...,...
100831,610,166534,4.0,1493848402
100832,610,168248,5.0,1493850091
100833,610,168250,5.0,1494273047
100834,610,168252,5.0,1493846352


In [5]:
# Merge ratings and movies dataframes
ratings_movies_df = pd.merge(ratings_df, movies_df, on="movieId")

In [6]:
# Convert genres column to list of binary variables using one-hot encoding
genres_list = ["Action", "Adventure", "Animation", "Children", "Comedy", "Crime", "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical", "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western"]
for genre in genres_list:
    ratings_movies_df[genre] = ratings_movies_df["genres"].str.contains(genre).astype(int)

In [7]:
# Split data into training and testing sets
#X_train, X_test, y_train, y_test = train_test_split(ratings_movies_df.drop(["userId", "movieId", "title", "genres", "rating"], axis=1), ratings_movies_df["rating"], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(ratings_movies_df.drop(["title", "genres", "rating"], axis=1), ratings_movies_df["rating"], test_size=0.2, random_state=42)


In [8]:
X_train

Unnamed: 0,userId,movieId,timestamp,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Fantasy,Film-Noir,Horror,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
80568,275,2917,1049077233,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
50582,295,1221,1320064798,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0
8344,140,1240,942841243,1,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
99603,606,5613,1171501480,0,0,0,0,1,1,0,...,0,0,0,1,1,0,0,0,0,0
71701,182,2231,1055157566,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6265,249,1089,1346757745,0,0,0,0,0,1,0,...,0,0,0,0,1,0,0,1,0,0
54886,75,44199,1158968109,0,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
76820,132,3186,1157997280,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
860,17,110,1305696470,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0


In [9]:
y_train

80568    5.0
50582    4.5
8344     3.0
99603    4.0
71701    3.0
        ... 
6265     4.0
54886    3.0
76820    4.5
860      4.5
15795    3.5
Name: rating, Length: 80668, dtype: float64

In [10]:
# Train random forest regressor model
rf = RandomForestRegressor(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

# Test model on testing set
y_pred = rf.predict(X_test)

# Create an input data point to predict the rating
user_id = 1
movie_id = 2
input_data = X_train.sample(1, random_state=42).copy()
input_data.loc[:, "user_id"] = user_id
input_data.loc[:, "movie_id"] = movie_id

# Remove user_id and movie_id from input_data
input_data.drop(["user_id", "movie_id"], axis=1, inplace=True)

# Fill NaN values with zero
input_data.fillna(0, inplace=True)

# Predict the rating using the trained model
predicted_rating = rf.predict(input_data)[0]

print("Predicted rating for user", user_id, "and movie", movie_id, "is", predicted_rating)

Predicted rating for user 1 and movie 2 is 3.165


In [11]:
from sklearn.metrics import mean_squared_error, mean_absolute_error
# Calculate root mean squared error (RMSE) and mean absolute error (MAE)
rmse = mean_squared_error(y_test, y_pred, squared=False)
mae = mean_absolute_error(y_test, y_pred)
print("RMSE:", rmse)
print("MAE:", mae)


RMSE: 0.8744141058506043
MAE: 0.6654628619595399
