In [9]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [10]:
# Load the merged dataset from the preprocessing directory (path is relative to this notebook).
merged_csv_path = "../../preprocessing/merged_data.csv"
df = pd.read_csv(merged_csv_path)

In [11]:
# Preview the first five rows of the loaded frame.
df.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Gender,Age,Title,Genres,Year
0,1,1193,5,0,1,One Flew Over the Cuckoo's Nest,[8],1975
1,1,661,3,0,1,James and the Giant Peach,"[3, 4, 12]",1996
2,1,914,3,0,1,My Fair Lady,"[12, 14]",1964
3,1,3408,4,0,1,Erin Brockovich,[8],2000
4,1,2355,5,0,1,"Bug's Life, A","[3, 4, 5]",1998


In [12]:
# Print the column dtypes, non-null counts and memory usage of the loaded frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000209 entries, 0 to 1000208
Data columns (total 8 columns):
 #   Column   Non-Null Count    Dtype 
---  ------   --------------    ----- 
 0   UserID   1000209 non-null  int64 
 1   MovieID  1000209 non-null  int64 
 2   Rating   1000209 non-null  int64 
 3   Gender   1000209 non-null  int64 
 4   Age      1000209 non-null  int64 
 5   Title    1000209 non-null  object
 6   Genres   1000209 non-null  object
 7   Year     1000209 non-null  int64 
dtypes: int64(6), object(2)
memory usage: 61.0+ MB


In [13]:
# Drop the five listed columns; the result is a new frame and `df` is left as loaded.
unused_columns = ["Gender", "Age", "Title", "Year", "Genres"]
filtered_df = df.drop(unused_columns, axis="columns")
filtered_df.head(n=5)

Unnamed: 0,UserID,MovieID,Rating
0,1,1193,5
1,1,661,3
2,1,914,3
3,1,3408,4
4,1,2355,5


In [14]:
# Number of rows per distinct rating value, most frequent first.
rating_counts = filtered_df["Rating"].value_counts()
rating_counts

Rating
4    348971
3    261197
5    226310
2    107557
1     56174
Name: count, dtype: int64

In [15]:
from sklearn.linear_model import LinearRegression, Lasso, LogisticRegression
from sklearn.svm import SVR
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error,r2_score, roc_auc_score
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.pipeline import Pipeline
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder , StandardScaler

In [16]:
# Standardize the ratings on an independent copy of `filtered_df`.
# Working on a copy (instead of aliasing the same object) keeps `filtered_df`
# unchanged and makes this cell idempotent: re-running it always starts from the
# raw ratings, so 'Original_Rating' and the scaler never see already-scaled values.
scaler = StandardScaler()
scaled_df = filtered_df.copy()
# Keep the raw rating before 'Rating' is overwritten with its standardized value.
scaled_df['Original_Rating'] = scaled_df['Rating']
# NOTE(review): the scaler is fit on every row, i.e. before the train/test split
# performed in a later cell — confirm this is acceptable for the evaluation.
scaled_df['Rating'] = scaler.fit_transform(scaled_df[['Rating']])
# Create a lookup dictionary for OriginalRating (O(1) access)
# Key: (UserID, MovieID), Value: OriginalRating
rating_lookup = scaled_df.set_index(['UserID', 'MovieID'])['Original_Rating'].to_dict()

In [17]:
# Preview the frame after scaling: 'Rating' is standardized, 'Original_Rating' keeps the raw value.
scaled_df.head(n=5)

Unnamed: 0,UserID,MovieID,Rating,Original_Rating
0,1,1193,1.269747,5
1,1,661,-0.520601,3
2,1,914,-0.520601,3
3,1,3408,0.374573,4
4,1,2355,1.269747,5


In [18]:
# Tell surprise the value range of the (standardized) ratings, then wrap the
# user / item / rating columns in a surprise Dataset.
scaled_rating_min = scaled_df['Rating'].min()
scaled_rating_max = scaled_df['Rating'].max()
reader = Reader(rating_scale=(scaled_rating_min, scaled_rating_max))

surprise_columns = ['UserID', 'MovieID', 'Rating']
data = Dataset.load_from_df(scaled_df[surprise_columns], reader)

In [19]:
# Hold out 20% of the ratings for evaluation, with a fixed seed.
# NOTE: `train_test_split` here is the one from `surprise.model_selection`; that
# import comes after the sklearn import of the same name in the import cell and
# therefore shadows it. `data` is the surprise Dataset built in the previous cell.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

In [20]:
# Fit a 10-factor SVD model on the training split; `fit` returns the fitted
# algorithm object, which is bound to `model` and displayed as the cell output.
model = SVD(n_factors=10, random_state=42).fit(trainset)
model

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x21c827fb170>

In [21]:
# Predict every held-out rating and report the RMSE.
# The model was trained on the standardized 'Rating' column, so this RMSE is in
# standardized units, not in stars.
test_preds = model.test(testset)
accuracy.rmse(predictions=test_preds, verbose=True)

RMSE: 0.7871


0.7870733495976184

In [22]:
# Map one standardized prediction (user 1, movie 661) back to the original rating scale.
single_prediction = model.predict(uid=1, iid=661)
scaled_pred = single_prediction.est
# inverse_transform expects a 2-D input, hence the 1x1 nesting.
original_pred = scaler.inverse_transform([[scaled_pred]])[0, 0]
print(f"Predicted rating (1-5): {original_pred:.2f}")

Predicted rating (1-5): 3.74


In [23]:
# Map all test-set predictions and their true values back to the original rating scale.
test_preds = model.test(testset)

# Column vectors (n, 1), the 2-D layout the scaler expects.
scaled_preds = np.array([p.est for p in test_preds])[:, np.newaxis]
scaled_actuals = np.array([p.r_ui for p in test_preds])[:, np.newaxis]

# Undo the standardization and go back to flat 1-D arrays.
original_preds = scaler.inverse_transform(scaled_preds).flatten()
original_actuals = scaler.inverse_transform(scaled_actuals).flatten()

In [24]:
# Get actual ratings using the precomputed dictionary (O(1) per lookup).
# Pairs missing from the lookup yield None.
actual_ratings = [rating_lookup.get((int(pred.uid), int(pred.iid)), None) for pred in test_preds]

# Compare results (skip missing entries).
# Only a short preview is printed: emitting one line per test prediction floods
# the notebook output and bloats the saved file.
max_rows_shown = 30
rows_shown = 0
print("Original vs Predicted:")
for actual, pred in zip(actual_ratings, original_preds):
    if actual is None:
        continue
    print(f"  Actual: {actual:.2f}, Predicted: {pred:.2f}")
    rows_shown += 1
    if rows_shown >= max_rows_shown:
        break

Original vs Predicted:
  Actual: 1.00, Predicted: 1.98
  Actual: 3.00, Predicted: 2.44
  Actual: 4.00, Predicted: 3.89
  Actual: 3.00, Predicted: 3.27
  Actual: 3.00, Predicted: 2.27
  Actual: 3.00, Predicted: 2.64
  Actual: 1.00, Predicted: 2.04
  Actual: 3.00, Predicted: 2.97
  Actual: 4.00, Predicted: 3.70
  Actual: 4.00, Predicted: 3.68
  Actual: 5.00, Predicted: 4.51
  Actual: 4.00, Predicted: 3.95
  Actual: 4.00, Predicted: 3.46
  Actual: 2.00, Predicted: 3.19
  Actual: 4.00, Predicted: 4.49
  Actual: 4.00, Predicted: 3.73
  Actual: 4.00, Predicted: 2.76
  Actual: 3.00, Predicted: 3.29
  Actual: 5.00, Predicted: 3.80
  Actual: 3.00, Predicted: 3.55
  Actual: 3.00, Predicted: 2.62
  Actual: 4.00, Predicted: 2.80
  Actual: 5.00, Predicted: 3.47
  Actual: 3.00, Predicted: 3.43
  Actual: 3.00, Predicted: 2.70
  Actual: 2.00, Predicted: 3.69
  Actual: 4.00, Predicted: 2.87
  Actual: 4.00, Predicted: 2.87
  Actual: 4.00, Predicted: 3.76
  Actual: 5.00, Predicted: 3.95
  Actual: 3.00, P

In [25]:
# Define tolerance (e.g., predictions within ±1 stars are "correct")
tolerance = 1
# Boolean mask: True where the de-standardized prediction is within `tolerance`
# of the de-standardized true rating.
correct = np.abs(original_preds - original_actuals) <= tolerance
# Stored under its own name on purpose: assigning to `accuracy` would overwrite
# the `surprise.accuracy` module and break `accuracy.rmse(...)` on any re-run.
within_tolerance_pct = np.mean(correct) * 100

print(f"Accuracy (Within ±{tolerance} Stars): {within_tolerance_pct:.2f}%")

Accuracy (Within ±1 Stars): 75.66%
