In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
import warnings
warnings.filterwarnings('ignore')

In [2]:
train_data = pd.read_csv('movie_ratings_train.csv')
test_data = pd.read_csv('movie_ratings_test.csv')
movies_data = pd.read_csv('movies.csv')

In [3]:
train_data.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,1,4.0,964982703
1,1,3,4.0,964981247
2,1,6,4.0,964982224
3,1,47,5.0,964983815
4,1,50,5.0,964982931


In [4]:
test_data.head()

Unnamed: 0,userId,movieId,timestamp
0,1,349,964982563
1,1,592,964982271
2,1,780,964984086
3,1,1196,964981827
4,1,1208,964983250


In [5]:
movies_data.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
train_data.drop(columns=['timestamp'],inplace=True)
test_data.drop(columns=['timestamp'],inplace=True)

In [7]:
train_data.shape

(90836, 3)

In [8]:
test_data.shape

(10000, 2)

In [9]:
movies_data.shape

(9742, 3)

In [10]:
round(train_data.isnull().sum() / train_data.shape[0] * 100.00,10)

userId     0.0
movieId    0.0
rating     0.0
dtype: float64

In [11]:
round(test_data.isnull().sum() / test_data.shape[0] * 100.00,10)

userId     0.0
movieId    0.0
dtype: float64

In [12]:
round(movies_data.isnull().sum() / movies_data.shape[0] * 100.00,10)

movieId    0.0
title      0.0
genres     0.0
dtype: float64

In [13]:
# Merge the movies data with the training and test data
train_data_merged = pd.merge(train_data, movies_data, on='movieId')
test_data_merged = pd.merge(test_data, movies_data, on='movieId')

In [14]:
train_data_merged.head()

Unnamed: 0,userId,movieId,rating,title,genres
0,1,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,5,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
2,7,1,4.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
3,15,1,2.5,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
4,19,1,4.0,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy


In [15]:
test_data_merged.head()

Unnamed: 0,userId,movieId,title,genres
0,1,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller
1,11,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller
2,91,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller
3,99,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller
4,117,349,Clear and Present Danger (1994),Action|Crime|Drama|Thriller


In [16]:
train_data_merged.drop(columns=['title'],inplace=True)
test_data_merged.drop(columns=['title'],inplace=True)

In [17]:
# One-hot encoding of genres
train_data_merged = train_data_merged.join(train_data_merged['genres'].str.get_dummies('|'))
test_data_merged = test_data_merged.join(test_data_merged['genres'].str.get_dummies('|'))

In [18]:
train_data_merged.head()

Unnamed: 0,userId,movieId,rating,genres,(no genres listed),Action,Adventure,Animation,Children,Comedy,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,5,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,7,1,4.5,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,15,1,2.5,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,19,1,4.0,Adventure|Animation|Children|Comedy|Fantasy,0,0,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [19]:
test_data_merged.head()

Unnamed: 0,userId,movieId,genres,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,...,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,349,Action|Crime|Drama|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
1,11,349,Action|Crime|Drama|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
2,91,349,Action|Crime|Drama|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
3,99,349,Action|Crime|Drama|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0
4,117,349,Action|Crime|Drama|Thriller,1,0,0,0,0,1,0,...,0,0,0,0,0,0,0,1,0,0


In [20]:
X = train_data_merged.drop(['userId', 'movieId', 'rating', 'genres'], axis=1)
y = train_data_merged['rating']

In [21]:
X

Unnamed: 0,(no genres listed),Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
1,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
2,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
3,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
4,0,0,1,1,1,1,0,0,0,1,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
90831,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0
90832,0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0
90833,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0
90834,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,1,0,0


In [22]:
y

0        4.0
1        4.0
2        4.5
3        2.5
4        4.0
        ... 
90831    2.5
90832    4.5
90833    3.0
90834    3.5
90835    3.5
Name: rating, Length: 90836, dtype: float64

In [23]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

In [24]:
linear_model = LinearRegression()

In [25]:
linear_model.fit(X_train,y_train)

LinearRegression()

In [26]:
val_predictions = linear_model.predict(X_val)

In [27]:
val_predictions

array([3.25685095, 3.25685095, 3.60734004, ..., 3.60069819, 3.47485038,
       3.5948342 ])

In [28]:
mse = mean_squared_error(y_val, val_predictions)
rmse = np.sqrt(mse)

In [29]:
print('Mean Squared Error on Validation Set:', mse)
print('Root Mean Squared Error on Validation Set:', rmse)

Mean Squared Error on Validation Set: 1.048530632106445
Root Mean Squared Error on Validation Set: 1.0239778474686085


In [30]:
model = RandomForestRegressor(n_estimators=100, random_state=42)
model.fit(X_train, y_train)

RandomForestRegressor(random_state=42)

In [31]:
val_predictions = model.predict(X_val)

In [32]:
mse = mean_squared_error(y_val, val_predictions)
rmse = np.sqrt(mse)

In [33]:
print('Mean Squared Error on Validation Set:', mse)
print('Root Mean Squared Error on Validation Set:', rmse)

Mean Squared Error on Validation Set: 1.0134972868405911
Root Mean Squared Error on Validation Set: 1.0067260237227362


In [34]:
X_test = test_data_merged.drop(['userId', 'movieId', 'genres'], axis=1)

In [35]:
X_test

Unnamed: 0,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Film-Noir,Horror,IMAX,Musical,Mystery,Romance,Sci-Fi,Thriller,War,Western
0,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
1,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
2,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
3,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
4,1,0,0,0,0,1,0,1,0,0,0,0,0,0,0,0,1,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
9995,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
9996,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
9997,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0
9998,1,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,1,0,0


In [36]:
X_test = X_test.reindex(columns = X_train.columns, fill_value=0)

In [37]:
test_predictions = model.predict(X_test)

In [38]:
test_predictions

array([3.7329344 , 3.7329344 , 3.7329344 , ..., 3.28522913, 3.28522913,
       3.28522913])

In [39]:
predictions_df = test_data_merged[['userId', 'movieId']]
predictions_df['predicted_rating'] = test_predictions

# Specify the file path for the CSV
output_file_path = 'save_predictions.csv'

# Export to CSV
predictions_df.to_csv(output_file_path, index=False)

In [42]:
l = list(test_predictions)

In [45]:
l

[3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.7329343957767454,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643360643,
 3.4645501643