In [None]:
import csv
import pandas as pd
import numpy as np
import seaborn as sns

from pydrive.auth import GoogleAuth
from google.colab import drive
from pydrive.drive import GoogleDrive
from google.colab import auth
from oauth2client.client import GoogleCredentials

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.metrics import roc_curve, roc_auc_score
import matplotlib.pyplot as plt

In [None]:
# Authenticate the Colab user, then build a PyDrive client that uses the
# application-default Google credentials.
auth.authenticate_user()
gauth = GoogleAuth()
gauth.credentials = GoogleCredentials.get_application_default()
# NOTE: this rebinds `drive`, shadowing the `google.colab.drive` module imported
# at the top of the notebook; from here on `drive` is the PyDrive client.
drive = GoogleDrive(gauth)

# Google Drive file ids of the input CSVs.
test_id = '1OCmcnJbQ7yKMFD8di8QIUTdZrFtjWCGt'
rating_df_id = '1TgkqAqWX2nXn4w8fNIRAO07G42klFwKM'
movie_id = '1hiv4rOCwSlzP_JfoNPUKSAZVhwgpHhYw'
link_id = '1quhfEIeGyJKDsqoC5J9iT053aSbLpmLX'
genome_tags_id = '156oC_Sm9ZVGCZ1a997mR2lTqT2vzCxiM'
genome_scores_id = '1K5yHeEUUU-C7Xzpy_Kezf85uSAU2yASi'
tag_id = '1bh4y_xLRPCnifHfpiCUrC3Vc0lnGRdWF'


# One PyDrive file handle per input file.
test_download = drive.CreateFile({'id': test_id})
rating_df_download = drive.CreateFile({'id': rating_df_id})
movie_download = drive.CreateFile({'id': movie_id})
# Fixed: this handle previously used genome_tags_id, so the "link" file was
# actually a second copy of the genome-tags file and link_id was never used.
# (The misspelled variable name is kept because the download step refers to it.)
link_dowmload = drive.CreateFile({'id': link_id})
genome_tags_download = drive.CreateFile({'id': genome_tags_id})
genome_scores_download = drive.CreateFile({'id': genome_scores_id})
tag_download = drive.CreateFile({'id': tag_id})

# Save every Drive file to the local working directory under a fixed CSV name.
for drive_file, local_csv in (
    (test_download, 'test_file.csv'),
    (rating_df_download, 'rating_df_file.csv'),
    (movie_download, 'movie_file.csv'),
    (link_dowmload, 'link_file.csv'),
    (genome_tags_download, 'genome_tags_file.csv'),
    (genome_scores_download, 'genome_scores_file.csv'),
    (tag_download, 'tag_file.csv'),
):
    drive_file.GetContentFile(local_csv)

# Force the problematic column to be read as a string; it can be converted later
# once its real type is known.
# NOTE(review): key 6 is presumably a zero-based column position — confirm it
# applies to every file it is passed to.
dtype_dict = {6: 'str'}


def _read_table(csv_path):
    """Read one downloaded CSV with the shared dtype override and low_memory disabled."""
    return pd.read_csv(csv_path, dtype=dtype_dict, low_memory=False)


test_df = _read_table("test_file.csv")
rating_df = _read_table("rating_df_file.csv")
movie_df = _read_table("movie_file.csv")
link_df = _read_table("link_file.csv")
genome_tags_df = _read_table("genome_tags_file.csv")
genome_scores_df = _read_table("genome_scores_file.csv")
tag_df = _read_table("tag_file.csv")

In [None]:
!pip install surprise

Collecting surprise
  Downloading surprise-0.1-py2.py3-none-any.whl (1.8 kB)
Collecting scikit-surprise (from surprise)
  Downloading scikit-surprise-1.1.3.tar.gz (771 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m772.0/772.0 kB[0m [31m10.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: scikit-surprise
  Building wheel for scikit-surprise (setup.py) ... [?25l[?25hdone
  Created wheel for scikit-surprise: filename=scikit_surprise-1.1.3-cp310-cp310-linux_x86_64.whl size=3163496 sha256=be09fa0d12117e830f41960f46d3a9f30ef009717a712f83982df246ff612df2
  Stored in directory: /root/.cache/pip/wheels/a5/ca/a8/4e28def53797fdc4363ca4af740db15a9c2f1595ebc51fb445
Successfully built scikit-surprise
Installing collected packages: scikit-surprise, surprise
Successfully installed scikit-surprise-1.1.3 surprise-0.1


In [None]:
from collections import defaultdict #acts just like dictionary except for the fact that it never raises a KeyError. It displays a default value for the Key that doesn't exist. Useful as many movies wouldn't have ratings as users did not watch them so this would avoid KeyErrors.
from surprise import SVD, Dataset #surprise is a scikit library used for recommendation systems

import pandas as pd

from surprise.prediction_algorithms.matrix_factorization import SVD
from sklearn.model_selection import train_test_split
import surprise

# SVD

In [None]:
from surprise import SVD
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import train_test_split
from surprise.accuracy import rmse, mae

# Draw a 0.1% sample of rating_df (assumed to be the full ratings table);
# the fixed seed makes the sample repeatable.
sample_df = rating_df.sample(frac=0.001, random_state=42)
# NOTE(review): rating_scale=(1, 5) assumes no rating is below 1 — confirm
# against rating_df['rating'].min() before relying on predictions near the floor.
reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(sample_df[['userId', 'movieId', 'rating']], reader)

# Split the data into a training set (80%) and a validation set (20%).
# Seeding the split keeps trainset/testset — and every metric computed from
# them — the same across kernel restarts.
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

Baseline: Predict the item's average rating across all users.


In [None]:
# Calculate the mean rating for each item using the trainset.
# trainset.ir is keyed by Surprise's *inner* item ids, while the validation
# testset carries *raw* ids (the original movieId values). Key the lookup table
# by raw id so that the baseline finds the right item instead of an unrelated
# one (or silently falling back to the global mean).
item_means = {
    trainset.to_raw_iid(inner_iid): np.mean([rating for (_, rating) in ratings])
    for inner_iid, ratings in trainset.ir.items()
}

# Baseline predictor: an item's stored mean rating, or the global mean if it has none
def predict_item_mean(item_id, item_means, global_mean):
    """Return the mean rating recorded for ``item_id``.

    Args:
        item_id: Key to look up in ``item_means``.
        item_means: Mapping from item id to that item's mean rating.
        global_mean: Value returned when ``item_id`` is not a key of ``item_means``.

    Returns:
        ``item_means[item_id]`` when present, otherwise ``global_mean``.
    """
    if item_id in item_means:
        return item_means[item_id]
    return global_mean

# Score the item-mean baseline on the validation set.
global_mean = trainset.global_mean  # fallback for items without a stored mean

# One baseline prediction per (user, item, rating) entry, in testset order
baseline_predictions = [
    predict_item_mean(item, item_means, global_mean) for (_, item, _) in testset
]

# Ground-truth ratings in the same order
actual_ratings = [rating for (_, _, rating) in testset]

# Per-entry signed error, shared by both metrics
errors = [truth - guess for truth, guess in zip(actual_ratings, baseline_predictions)]
baseline_rmse = np.sqrt(np.mean([err**2 for err in errors]))
baseline_mae = np.mean([abs(err) for err in errors])

print(f'Item Mean Baseline RMSE: {baseline_rmse}')
print(f'Item Mean Baseline MAE: {baseline_mae}')


Item Mean Baseline RMSE: 1.2779362977500783
Item Mean Baseline MAE: 0.9881728871182761


Apply SVD

In [None]:
# Create an SVD instance and train it on the training set.
# random_state seeds the model's initialisation so that, for a given trainset,
# the fitted model and the metrics below do not change between runs.
svd_model = SVD(random_state=42)
svd_model.fit(trainset)

# Make predictions on the validation (test) set
predictions = svd_model.test(testset)

# Calculate RMSE and MAE on the validation set
print("RMSE on Validation set: ", rmse(predictions, verbose=False))
print("MAE on Validation set: ", mae(predictions, verbose=False))


RMSE on Validation set:  0.996943502276251
MAE on Validation set:  0.7834825081510493


 RMSE: An RMSE of 0.9969 means that the typical prediction error, with larger errors weighted more heavily, is just under 1 rating point.
 MAE: This represents the average absolute error between the predicted and actual ratings. An MAE of 0.7835 means that, on average, the model's predictions are about 0.78 rating points off from the true rating.

Prediction

In [None]:
print(test_df.shape)

# Build the Surprise test set from test_df's 'userId' and 'movieId' columns.
# Surprise expects (user, item, rating) triples; the true ratings are unknown,
# so a dummy 0 is used — it does not influence the estimate, it only has to be present.
# NOTE: this rebinds `testset`, replacing the validation split created earlier.
testset = [
    (uid, iid, 0)
    for uid, iid in zip(test_df['userId'].values, test_df['movieId'].values)
]
num_testset = len(testset)

print(f'There are {num_testset} items in the testset list.')

# Predict ratings for every pair with the fitted SVD model
predictions = svd_model.test(testset)

# Calculate and print the number of items in the predictions list
num_predictions = len(predictions)
print(f'There are {num_predictions} items in the predictions list.')

# Display the first 5 predictions. This has to be the cell's last expression:
# placed mid-cell (as before) it was evaluated but never rendered.
predictions[:5]


(12340, 2)
There are 12340 items in the testset list.
There are 12340 items in the predictions list.


In [None]:
# Show the column labels of test_df
test_df.columns

Index(['userId', 'movieId'], dtype='object')

In [None]:
# Preview the first rows of test_df
test_df.head()

Unnamed: 0,userId,movieId
0,49177.0,356
1,3184.0,541
2,3165.0,356
3,35462.0,356
4,25996.0,356


In [None]:
# Walk the predictions once, collecting the estimated rating and the
# (user, movie) pair for each; the user id is cast to int, the movie id is kept as-is.
predicted_ratings = []
user_movie_pairs = []
for pred in predictions:
    predicted_ratings.append(pred.est)
    user_movie_pairs.append((int(pred.uid), pred.iid))

# Submission key for each row: "<userId>_<movieId>"
unique_ids = [f'{uid}_{iid}' for uid, iid in user_movie_pairs]

# Assemble the two-column submission table
submission_df = pd.DataFrame(
    {'ID': unique_ids, 'predicted_rating': predicted_ratings}
)

# Write it to disk without the index column, then trigger a browser download from Colab
submission_df.to_csv('submission.csv', index=False)

from google.colab import files
files.download('submission.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# PCA