## Importing Data

In [6]:
# Import essential packages
import pandas as pd
import numpy as np

# Import surprise packages
from surprise import BaselineOnly
from surprise import Dataset
from surprise import Reader
from surprise.model_selection import cross_validate

# Import models
from surprise import NormalPredictor
from surprise import KNNBasic
from surprise import SVD
from surprise.model_selection import GridSearchCV

# Import utilities
import os

In [2]:
# Importing train and test datasets
ratings_df = pd.read_csv('train.csv').drop(columns=['timestamp'],axis=0)
test_df = pd.read_csv('test.csv')

In [3]:
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,5163,57669,4.0
1,106343,5,4.5
2,146790,5459,5.0
3,106362,32296,2.0
4,9041,366,3.0


In [4]:
test_df.head()

Unnamed: 0,userId,movieId
0,1,2011
1,1,4144
2,1,5767
3,1,6711
4,1,7318


In [5]:
# Subset of test data
test_df_subset = test_df[:1000]

## Modelling

In [7]:
# A reader is needed in order to input rating_scale param
reader = Reader(rating_scale=(1, 5))

# Load data into format that Surprise understands using Dataset class from surprise
# The columns must correspond to user id, item id and ratings (in that order).
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# We can now use this dataset as we please, e.g. calling cross_validate

In [8]:
# we are using the full train dataset to train the model here - should use test and train split in order to evaluate
trainset = data.build_full_trainset()

# Build an algorithm, and train it.
algo = SVD()
algo.fit(trainset)

<surprise.prediction_algorithms.matrix_factorization.SVD at 0x2789725c0b8>

In [None]:
# Run this cell if you would like to wait forever - GridSeach
'''
param_grid = {'n_factors': [25, 30, 35, 40], 'n_epochs': [15, 20, 25], 'lr_all': [0.001, 0.003, 0.005, 0.008],
              'reg_all': [0.08, 0.1, 0.15]}
gs = GridSearchCV(SVD, param_grid, measures=['rmse', 'mae'], cv=3, n_jobs = -1)
gs.fit(data)
algo = gs.best_estimator['rmse']
print(gs.best_score['rmse'])
print(gs.best_params['rmse'])

#Assigning values
t = gs.best_params
factors = t['rmse']['n_factors']
epochs = t['rmse']['n_epochs']
lr_value = t['rmse']['lr_all']
reg_value = t['rmse']['reg_all']
'''

In [11]:
# Example on how to predict using the algorithm
uid = str(1)  # raw user id (as in the ratings file). They are **strings**!
iid = str(2011)  # raw item id (as in the ratings file). They are **strings**!

# get a prediction for specific users and items.
pred = algo.predict(uid, iid, r_ui=4, verbose=True)

user: 1          item: 2011       r_ui = 4.00   est = 3.53   {'was_impossible': False}


In [10]:
# Predict on our test data
predcol = []
for _, row in test_df.iterrows():
    x = (algo.predict(row.userId, row.movieId))
    pred = x[3]
    predcol.append(pred)

In [13]:
len(predcol) == len(test_df)

True

In [14]:
# Convert columns to string in order to prep for concatenating
test_df['userId'] = test_df['userId'].astype(str)
test_df['movieId'] = test_df['movieId'].astype(str)

In [15]:
# Create submission column ID
test_df['Id'] =test_df['userId']+'_'+test_df['movieId']

In [16]:
kaggle_df = pd.DataFrame(
    {'Id': test_df['Id'],
     'rating': predcol
    })

In [17]:
kaggle_df.shape

(5000019, 2)

In [18]:
kaggle_df.head()

Unnamed: 0,Id,rating
0,1_2011,3.40498
1,1_4144,4.220578
2,1_5767,3.431247
3,1_6711,3.801502
4,1_7318,2.657059


In [51]:
# Export to csv
kaggle_df.to_csv("recommend_2.csv",index=False)