In [0]:
from google.colab import drive
drive.mount('/content/drive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression,Lasso
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from umap import UMAP
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from matplotlib import pyplot as plt

import warnings
warnings.filterwarnings("ignore")

data_path = "./drive/My Drive/cs472/data/data.pkl"
data_path2 = "./drive/My Drive/cs472/data/data2.pkl"

clean_data_path = "./drive/My Drive/cs472/data/clean_data.pkl"
clean_data_path2 = "./drive/My Drive/cs472/data/clean_data2.pkl"
embedded_data_path = "./drive/My Drive/cs472/data/em_data.pkl"
np_data_path = "./drive/My Drive/cs472/data/np_data.npy"

word_dict_path = "./drive/My Drive/cs472/data/word_dict.pkl"
name_dict_path = "./drive/My Drive/cs472/data/name_dict.pkl"

# Baselines

In [0]:
def get_baselines():

    df = pd.read_pickle(clean_data_path2)
    labels = df.rating_label.astype(float)
    avg = labels.mean()
    pred = [avg]*len(df)
    print("Average for Rating Label",avg)
    print("Regression MSE Error Baseline:",sum((labels - avg)**2)/len(df))

    RMSE = mean_squared_error(labels,pred,squared=False)
    R2 = r2_score(labels,pred)

    print("RSME", RMSE)
    print("R2", R2)

        
    df['rating_label'] = df['rating_label'].astype(float)
    def f(x):
        return round(x*2)/2
    df['rating_class']  = df.rating_label.apply(f)

    a = {}
    for i in range(21):
        a[i/2] = i

    def g(x):
        return a[x]
    df['rating_class'] = df['rating_class'].apply(g)
    df['rating_class'] = df['rating_class'].astype(int)
    pd.to_pickle(df, clean_data_path2)

    mode = df.rating_class.mode().values[0]
    print("Mode for most common class:",mode)
    print("Classification Accuracy Baseline:", sum(df.rating_class == mode)/len(df))

In [0]:
get_baselines()

Average for Rating Label 5.55789609813782
Regression MSE Error Baseline: 1.7932786211878171
RSME 1.3391335337403247
R2 3.3306690738754696e-16
Mode for most common class: 12
Classification Accuracy Baseline: 0.1557419450192137


# Preprocessing

In [0]:
def load_data():
    oc = OrdinalEncoder()
    df = pd.read_pickle(clean_data_path2)
    df = df.drop(["title"],axis=1)
    vect = CountVectorizer(min_df=5, ngram_range=(2, 2))
    text_train = df["description"].values
    df = df.drop(["description"],axis=1)
    actors = oc.fit_transform(df["actor"].values.reshape(-1,1))
    directors = oc.fit_transform(df["director"].values.reshape(-1,1))
    df["actor"] = actors.flatten()
    df["director"] = directors.flatten()
    text_data = vect.fit(text_train).transform(text_train)
    labels = df["rating_label"].astype(np.float64)
    
    df = df.drop(["rating_label",'rating_class'],axis=1)
    df = df.astype(np.int64)
    data = sp.hstack((df.values,text_data))
    
    trainX, testX, trainY, testY = train_test_split(data,labels,test_size=0.3)
    return trainX,testX, trainY,testY

In [0]:
trainX,testX,trainY,testY = load_data()

# Imporved Preprocessing

In [0]:
import torch
import torch.nn as nn
import pickle
import pandas as pd
from nltk.tokenize import sent_tokenize, word_tokenize
import nltk
import os
import numpy as np
import itertools
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [0]:
def get_embedding(path_):

    # get name embedding space
    dict_ = pickle.load( open(path_, "rb" ) ) 

    # make embedding space for names
    vocab_size = len(dict_) - 2
    embed_length = dict_['em_length']

    return nn.Embedding(vocab_size, embed_length), dict_


def load_text_numeric_data(load_=True, split=False):

    if load_ and os.path.exists(embedded_data_path):
        data = pd.read_pickle(embedded_data_path)
    else:
        data = pd.read_pickle(clean_data_path2)

        word_embeds, word_dict  = get_embedding(word_dict_path)
        name_embeds, name_dict = get_embedding(name_dict_path)

        def word_embed(x):
            a = [list(word_embeds(torch.tensor([word_dict[t]], dtype=torch.long)).detach().numpy()[0]) for t in word_tokenize(x)][:5]
            return sum(a, [])

        def name_embed(x):
            return list(name_embeds(torch.tensor([name_dict[x]], dtype=torch.long)).detach().numpy()[0])

        data['words'] = data.description + " " + data.title

        labels = data.rating_label

        data['actor_em'] = data.actor.apply(name_embed)
        data['director_em'] = data.director.apply(name_embed)
        data['word_em'] = data.words.apply(word_embed)

        pd.to_pickle(data, embedded_data_path)

    if load_ and os.path.exists(np_data_path):
        data_ = np.load(np_data_path, allow_pickle=True)
    else:
        data['temp'] = (data['actor_em'] + data['director_em'] + data['word_em']).apply(np.array)   
        data = data[data['temp'].apply(lambda x: len(x) == 138)]

        s_ratings = pd.get_dummies(data.s_rating,drop_first=True)
        genres = pd.get_dummies(data.genre, drop_first=True)

        temp = np.array([np.array(xi) for xi in data.temp.values])
        data_ = np.hstack((s_ratings.values, genres.values, np.array([data.runtime.values]).T, temp, np.array([data.rating_label.values]).T))

        np.save(np_data_path, data_,allow_pickle=True)

    if split:
        return train_test_split(data_[:,:-1].astype(np.float64), data_[:,-1].astype(np.float64))
    else:
        return data_[:,:-1], data_[:,-1]

In [0]:
train2X,test2X,train2Y,test2Y = load_text_numeric_data(split = True)
data,labels = load_text_numeric_data(split=False)

In [0]:
print(data.shape)

(26941, 160)


# Linear Regression

## Initial Preprocessing Results (Vanilla Model)



In [0]:
lr = LinearRegression(n_jobs=-1)
lr.fit(trainX,trainY)
pred = lr.predict(testX)
RMSE = mean_squared_error(testY,pred,squared=False)
R2 = r2_score(testY,pred)
print("LR RMSE", RMSE)
print("LR R2",R2)

LR RMSE 1.8928308659413582
LR R2 -1.0398336671628128


## Improved Preprocessing Results (Vanilla Model)

In [0]:
lr = LinearRegression(n_jobs=-1)
lr.fit(train2X,train2Y)
pred = lr.predict(test2X)
RMSE = mean_squared_error(test2Y, pred,squared=False)
R2 = r2_score(test2Y,pred)
print("LR RMSE", RMSE)
print("LR R2", R2)

LR RMSE 1.212214203665244
LR R2 0.1594041530839012


## Improved Preprocessing Grid Search Results without Dimension Reduction

In [0]:
lr = LinearRegression()
param_grid = {"fit_intercept":[True,False],"normalize":[True,False]}
lr_gs = GridSearchCV(lr,param_grid,cv=5,refit=True,verbose=1).fit(train2X,train2Y)
best = lr_gs.best_estimator_
pred = best.predict(test2X)
rmse = mean_squared_error(test2Y,pred,squared=False)
r2 = r2_score(test2Y,pred)
print("best RMSE",rmse)
print("best r2", r2)

Fitting 5 folds for each of 4 candidates, totalling 20 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  20 out of  20 | elapsed:    4.6s finished


best RMSE 1.2149709423726427
best r2 0.16668220147198043


## Improved Preprocessing Grid Search Results with PCA Dimension Reduction

In [0]:
lr = LinearRegression()
pca = PCA()
pipe = Pipeline(steps=[('pca',pca),('lr',lr)])
param_grid = {"pca__n_components":[2,5,10,20,50,100,150], "lr__fit_intercept":[True,False], "lr__normalize":[True,False]}
lr_gs = GridSearchCV(pipe,param_grid,cv=5,refit=True, verbose=1,n_jobs=-1).fit(train2X,train2Y)
best = lr_gs.best_estimator_
print("Best hyper parameters", lr_gs.best_params_)
pred = best.predict(test2X)
RMSE = mean_squared_error(test2Y, pred,squared=False)
R2 = r2_score(test2Y,pred)
print("Best RMSE", RMSE)
print("Best R2", R2)


Fitting 5 folds for each of 28 candidates, totalling 140 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   13.5s
[Parallel(n_jobs=-1)]: Done 140 out of 140 | elapsed:   43.4s finished


Best hyper parameters {'lr__fit_intercept': True, 'lr__normalize': False, 'pca__n_components': 150}
Best RMSE 1.232775116787417
Best R2 0.14620762345543037


# Lasso Regression

## Initial Preprocessing Results



In [0]:
lasso = Lasso(max_iter=1000)
param_grid = {"alpha":[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]}
l_gs = GridSearchCV(lasso,param_grid,cv=5,refit=True,verbose=1,n_jobs=-1).fit(trainX,trainY)
best = l_gs.best_estimator_
print("best parameters", l_gs.best_params_)
pred = best.predict(testX)
RMSE = mean_squared_error(testY,pred,squared=False)
R2 = r2_score(testY,pred)
print("Lasso", RMSE)
print("Lasso R2", R2)

Fitting 5 folds for each of 11 candidates, totalling 55 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 59.5min
[Parallel(n_jobs=-1)]: Done  55 out of  55 | elapsed: 76.1min finished


best parameters {'alpha': 0.1}
Lasso 1.3099146530497539
Lasso R2 0.04651073319688248


## Improved Preprocessing Results (Vanilla Implementation)

In [0]:
lasso = Lasso(max_iter=1000)
lasso.fit(train2X,train2Y)
pred = lasso.predict(test2X)
rmse = mean_squared_error(test2Y,pred,squared=False)
r2 = r2_score(test2Y,pred)
print("RMSE",rmse)
print("R2",r2)

RMSE 1.3240772087730381
R2 0.015056828501165076


## Improved Preprocessing Grid Search Results with No Dimension Reduction

In [0]:
lasso = Lasso(max_iter=1000)
param_grid = {"alpha":[.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0]}
l_gs = GridSearchCV(lasso,param_grid,cv=5,refit=True,verbose=1,n_jobs=-1).fit(train2X,train2Y)
best = l_gs.best_estimator_
print("best parameters", l_gs.best_params_)
pred = best.predict(test2X)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("Lasso", RMSE)
print("Lasso R2", R2)

Fitting 5 folds for each of 10 candidates, totalling 50 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.


best parameters {'alpha': 0.1}
Lasso 1.2982523512683772
Lasso R2 0.03584526407313571


[Parallel(n_jobs=-1)]: Done  50 out of  50 | elapsed:    2.0s finished


## Improved Preprocessing Grid Search Results with PCA Dimension Reduction

In [0]:
lasso = Lasso(max_iter=1000)
pca = PCA()
pipe = Pipeline(steps=[('pca',pca),('lasso',lasso)])
param_grid = {"lasso__alpha":[0,.1,.2,.3,.4,.5,.6,.7,.8,.9,1.0],"pca__n_components":[2,5,10,20,50,100,150]}
l_gs = GridSearchCV(pipe,param_grid,cv=5,verbose=1,n_jobs=-1).fit(train2X,train2Y)
print("best parameters", l_gs.best_params_)
pred = l_gs.predict(test2X)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("Lasso", RMSE)
print("Lasso R2", R2)

Fitting 5 folds for each of 77 candidates, totalling 385 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed:   47.7s
[Parallel(n_jobs=-1)]: Done 196 tasks      | elapsed:  1.5min
[Parallel(n_jobs=-1)]: Done 385 out of 385 | elapsed:  2.3min finished


best parameters {'lasso__alpha': 0, 'pca__n_components': 150}
Lasso 1.232775116787417
Lasso R2 0.14620762345543037


# KNN Regression

## Initial Preprocessing Results

In [0]:
knn  = KNeighborsRegressor(n_jobs=-1)
grid = {"n_neighbors": [5,7,9,11,13,15], "weights":["distance","uniform"], "algorithm": ["auto", "ball_tree", "kd_tree"]}
knn_gs = GridSearchCV(knn,grid,cv=5,refit=True,n_jobs=1,verbose=1).fit(trainX,trainY)
print("best parameters", knn_gs.best_params_)
best=knn_gs.best_estimator_
pred = best.predict(testX)
RMSE = mean_squared_error(testY,pred,squared=False)
R2 = r2_score(testY,pred)
print("KNN RMSE", RMSE)
print("KNN R2", R2)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 12.5min finished


best parameters {'algorithm': 'auto', 'n_neighbors': 15, 'weights': 'distance'}
KNN RMSE 1.027792569047347
KNN R2 0.4129964132986874


## Improved Preprocessing Results (Vanilla Model)

In [0]:
knn = KNeighborsRegressor(n_jobs=-1)
knn.fit(train2X,train2Y)
pred = knn.predict(test2X)
rmse = mean_squared_error(test2Y,pred)
r2 = r2_score(test2Y,pred)
print("RMSE",rmse)
print("R2",r2)

RMSE 1.3846627672209026
R2 0.22209140897332302


## Improved Preprocessing Grid Search Results with No Dimension Reduction

In [0]:
knn  = KNeighborsRegressor(n_jobs=-1)
grid = {"n_neighbors": [5,7,9,11,13,15], "weights":["distance","uniform"], "algorithm": ["auto", "ball_tree", "kd_tree"]}
knn_gs = GridSearchCV(knn,grid,cv=5,refit=True,n_jobs=1,verbose=1).fit(train2X,train2Y)
print("best parameters", knn_gs.best_params_)
best=knn_gs.best_estimator_
pred = best.predict(test2X)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("KNN RMSE", RMSE)
print("KNN R2", R2)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 180 out of 180 | elapsed: 35.5min finished


best parameters {'algorithm': 'ball_tree', 'n_neighbors': 9, 'weights': 'distance'}
KNN RMSE 0.9321924026400008
KNN R2 0.5029047406315674


## Improved Preprocessing Grid Search Results with PCA Dimension Reduction

In [0]:
knn  = KNeighborsRegressor(n_jobs=-1)
pca = PCA()
pipe = Pipeline(steps=[('pca',pca),('knn',knn)])
grid = {"pca__n_components":[2,5,10,20,50,100,150],"knn__n_neighbors": [5,7,9,11,13,15], "knn__weights":["distance","uniform"], "knn__algorithm": ["auto", "ball_tree", "kd_tree"]}
knn_gs = GridSearchCV(pipe,grid,cv=5,n_jobs=1,verbose=1).fit(train2X,train2Y)
print("best parameters", knn_gs.best_params_)
pred = knn_gs.predict(test2X)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("KNN RMSE", RMSE)
print("KNN R2", R2)

Fitting 5 folds for each of 252 candidates, totalling 1260 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done 1260 out of 1260 | elapsed: 78.9min finished


best parameters {'knn__algorithm': 'auto', 'knn__n_neighbors': 11, 'knn__weights': 'distance', 'pca__n_components': 100}
KNN RMSE 0.8905617772357917
KNN R2 0.554433371617911


# RF Regression

## Initial Preprocessing Results

In [0]:
rf = RandomForestRegressor(n_jobs=-1)
rf.fit(trainX,trainY)
pred = rf.predict(testX)
RMSE = mean_squared_error(testY,pred,squared=False)
R2 = r2_score(testY,pred)
print("RF RMSE", RMSE)
print("RF R2", R2)

RF RMSE 0.9931564635005324
RF R2 0.4384266579249585


## Improved Preprocessing Results (Vanilla Model)

In [0]:
rf = RandomForestRegressor(n_jobs=-1)
rf.fit(train2X,train2Y)
pred = rf.predict(test2X)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("RF RMSE", RMSE)
print("RF R2", R2)

RF RMSE 0.9134848720643921
RF R2 0.5226562701351739


## Improved Preprocessing Grid Search Results With No Dimension Reduction

In [0]:
rf = RandomForestRegressor(max_depth=3,n_jobs=-1)
param_grid = {"criterion":["mse","mae"], "max_features":["auto", "sqrt","log2"]}
rf_gs = GridSearchCV(rf,param_grid,cv=5,verbose=1,n_jobs=-1).fit(train2X,train2Y)
pred = rf_gs.predict(test2X)
print("best parameters",rf_gs.best_params_)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("RF RMSE", RMSE)
print("RF R2", R2)

Fitting 5 folds for each of 6 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 491.1min finished


best parameters {'criterion': 'mse', 'max_features': 'auto'}
RF RMSE 1.2534803949167963
RF R2 0.13133739945981182


## Improved Preprocessing Grid Search Results With PCA Dimension Reduction

In [0]:
rf = RandomForestRegressor(max_depth=3,n_jobs=-1)
pca = PCA()
pipe = Pipeline(steps=[("pca",pca),("rf",rf)])
param_grid = {"pca__n_components":[2,5,10,20,100,150],"rf__criterion":["mse","mae"], "rf__max_features":["auto", "sqrt","log2"]}
rf_gs = GridSearchCV(pipe,param_grid,cv=5,verbose=1,n_jobs=-1).fit(train2X,train2Y)
pred = rf_gs.predict(test2X)
print("best parameters",rf_gs.best_params_)
RMSE = mean_squared_error(test2Y,pred,squared=False)
R2 = r2_score(test2Y,pred)
print("RF RMSE", RMSE)
print("RF R2", R2)

Fitting 5 folds for each of 36 candidates, totalling 180 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  46 tasks      | elapsed: 30.8min
