# SVM Model

Ben Burt

Ben Christensen

Jane Cox

In [None]:
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

import pandas as pd
from scipy import linalg as la
from scipy.sparse import csr_matrix
from pandas.api.types import CategoricalDtype
from scipy.sparse import linalg as spla

# Our Data Set

In [None]:
#Benjamin's path: path = "/Users/Armen/Desktop/SpringDataProject/"
#Ben C's path:
# NOTE(review): `path` is assigned here but the read_csv call below does not use it;
# the ratings file is loaded from the relative location "ml-20m/ratings.csv".
# Confirm whether the load was meant to be built from `path`.
path = "/Users/benchristensen/Desktop/Recommneder_System_Project/ml-20m/"
# Load at most the first 100000 rows of the ratings file into the global frame `r`,
# which data_setup reads.
r = pd.read_csv("ml-20m/ratings.csv",nrows=100000)

In [None]:
def data_setup(user_id, ratings=None, min_ratings=17, fill_value=3,
               like_threshold=3):
    """
    This function prepares our data for the train test split.

    Movies are the samples and the *other* users are the features: each row
    of the returned table holds every other user's rating of one movie.

    Inputs:
    user_id - id of the user whose ratings are held out and turned into labels
    ratings - DataFrame with userId, movieId and rating columns; when None the
              notebook-level ratings frame `r` is used
    min_ratings - keep only movies with MORE than this many ratings
    fill_value - value placed where a user has not rated a movie
    like_threshold - ratings above this are 'liked' (1), all others
                     'disliked' (-1)
    Outputs:
    dfs - movies x users DataFrame (index = movieId, columns = userId) of the
          other users' ratings, with unrated cells set to fill_value.
          This is a dense frame: pd.SparseDataFrame no longer exists in
          pandas >= 1.0, so memory grows with n_movies * n_users.
    movies_not_seen - sorted list of movieIds the user has not rated
    movies_seen - array of movieIds the user has rated, in ascending order
    known_ratings - float array of 1.0 / -1.0 labels aligned with movies_seen
    """
    if ratings is None:
        ratings = r  # fall back to the ratings table loaded by the notebook

    #get only those movies with more than min_ratings ratings
    #(with the default of 17 this cut out ~50% of movies)
    ratings_per_movie = ratings.groupby("movieId")["movieId"].transform("size")
    df = ratings[ratings_per_movie > min_ratings].sort_values(['userId', 'movieId'])

    #get our lucky user, and everybody else
    is_target = df.userId == user_id
    user1 = df[is_target]
    others = df[~is_target]

    #sorted ids plus the integer position of every rating in the matrix.
    #The movie axis is taken from df *before* the user is removed, so every
    #movie the user has rated is guaranteed to be a row of the result.
    user_ids, user_codes = np.unique(others.userId.values, return_inverse=True)
    movie_ids = np.unique(df.movieId.values)
    movie_codes = np.searchsorted(movie_ids, others.movieId.values)

    #this matrix is movies x users
    #we can't deal with missing ratings, so we're lazy and start every cell
    #at fill_value, then overwrite the cells we do have ratings for
    dense = np.full((movie_ids.size, user_ids.size), fill_value, dtype=float)
    dense[movie_codes, user_codes] = others.rating.values
    dfs = pd.DataFrame(dense, index=movie_ids, columns=user_ids)

    #we can only train on the movies that the user has seen
    movies_seen = user1.movieId.values
    movies_not_seen = sorted(set(movie_ids) - set(movies_seen))

    #turn user's rating into binary 'liked' (1) or 'disliked' (-1)
    #np.where builds a new array instead of writing into the frame's data
    known_ratings = np.where(user1.rating.values > like_threshold, 1.0, -1.0)

    return dfs, movies_not_seen, movies_seen, known_ratings

In [None]:
# Pick a single user and build that user's movie-by-user feature table,
# seen / unseen movie ids and liked / disliked labels.
user__id = 50
dfs, movies_not_seen, movies_seen, known_ratings = data_setup(user__id)

In [None]:
# Labels exist only for the movies this user has rated, so only those rows
# of the table are split into training and held-out sets.
x_train, x_test, y_train, y_test = train_test_split(
    dfs.loc[movies_seen].values,
    known_ratings,
)

# Rows for the movies the user has not rated: these are what we'll want to predict.
x_to_predict = dfs.loc[movies_not_seen]

In [None]:
#check the accuracy of the model
# `degree` only affects the 'poly' kernel (SVC ignores it for 'rbf'), so the
# search space is given as one grid per kernel. This avoids refitting the same
# rbf model once for every degree value.
parameters = [
    {'kernel': ['rbf'], 'C': [1, 50], 'gamma': [1, 2, 3, 'auto']},
    {'kernel': ['poly'], 'C': [1, 50], 'degree': [1, 2, 3, 10],
     'gamma': [1, 2, 3, 'auto']},
]
model = SVC()
# Cross-validated search over the grids above, fitted on the training split.
clf = GridSearchCV(model, parameters)
clf.fit(x_train,y_train)
# Score of the best estimator found on the held-out split (shown as the cell output).
clf.score(x_test,y_test)

In [None]:
# NOTE(review): `users` is not used anywhere in this cell; the loop below only
# covers user ids 1 through 4.
users = 138493 #from the errata file
scores = []
# `degree` only affects the 'poly' kernel (SVC ignores it for 'rbf'), so the
# search space is given as one grid per kernel. This avoids refitting the same
# rbf model once for every degree value.
parameters = [
    {'kernel': ['rbf'], 'C': [1, 50], 'gamma': [1, 2, 3, 'auto']},
    {'kernel': ['poly'], 'C': [1, 50], 'degree': [1, 2, 3, 10],
     'gamma': [1, 2, 3, 'auto']},
]
model = SVC()
clf = GridSearchCV(model, parameters)
# Repeat the single-user experiment for several users, collecting one
# held-out score per user.
for user_id in range(1,5):
    dfs, movies_not_seen, movies_seen, known_ratings = data_setup(user_id)
    x_train, x_test, y_train, y_test = train_test_split(dfs.loc[movies_seen].values,known_ratings)
    clf.fit(x_train,y_train)
    scores.append(clf.score(x_test,y_test))
#This is what we'll want to predict
# NOTE(review): this runs after the loop, so it only reflects the last user processed.
x_to_predict = dfs.loc[movies_not_seen]

## STILL TO DO
 - run it through an average model. Do we recommend or not? 
 - potentially compare if things run or not
 - NMF
 - K-means (on users... and movies/genome_scores?)
 - PCA (both or one?)
 - Unknown 