# Primary Notebook

## Local Code Imports - Do not delete

In [None]:
# DO NOT REMOVE THESE
%load_ext autoreload
%autoreload 2

In [None]:
# DO NOT REMOVE This
%reload_ext autoreload

In [None]:
## Uncomment to filter warnings
#import warnings
#warnings.filterwarnings('ignore')

In [1]:
## DO NOT REMOVE
## import local src module -
## src in this project will contain all your local code
## clean_data.py, model.py, visualize.py, custom.py
#from src import make_data as mk
from src import visualize as viz
from src import model as mdl
from src import custom as cm
from src import recapp as ra
#from src import pandas_operators as po

#def test_src():
    #mk.test_make_data()
    #viz.test_viz()
    #mdl.test_model()
    #po.test_pandas()

#    return 1

In [None]:
#test_src()

## Code Imports

In [2]:
## Some standard imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.neighbors import KernelDensity
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import GridSearchCV
from joblib import dump
%matplotlib inline
import warnings
warnings.filterwarnings("ignore")

# Project Overview

# EDA, Data Cleaning & Feature Engineering

In [None]:
# Load the user factor table that the classifier section below uses as features.
user_factors = pd.read_csv(
    filepath_or_buffer='../data/processed/user_factors_unstacked.csv',
)

In [None]:
# Load the scaled user factors; this is the frame the KMeans model is fitted on.
user_factors_scaled = pd.read_csv(
    filepath_or_buffer='../data/processed/user_factors_scaled.csv',
)

In [None]:
# Remove the 'Unnamed: 0' column (presumably a row index saved with the CSV — confirm).
# Reassigning with errors='ignore' instead of inplace=True makes the cell idempotent:
# running it a second time no longer raises KeyError once the column is gone.
user_factors_scaled = user_factors_scaled.drop(columns=['Unnamed: 0'], errors='ignore')

# AWS KMeans Evaluation

In [None]:
# Hard-coded error values for the AWS KMeans evaluation, one per cluster count tried;
# they are plotted below against `cluster` (2..59).
# NOTE(review): the exact metric (e.g. within-cluster sum of squares) and how these
# numbers were exported are not shown in this notebook — confirm.
error = [8113978.666767631, 7354872.282449911, 6779693.947846839, 6494419.943265818, 6161394.18380805, 5969178.495847053, 5775171.418613084, 5586181.465087898, 5459016.548945165, 5320932.3011912545, 5191293.46220535, 5090035.762150701, 4986521.909023397, 4919589.194313965, 4845781.722909153, 4738299.132215889, 4697877.381008717, 4626969.254411039, 4562072.290577676, 4519194.064635965, 4451951.002486911, 4413293.651221422, 4361910.62076177, 4339251.383448499, 4277987.088258287, 4244595.896973764, 4199613.235502976, 4156505.1574725825, 4131992.707043184, 4095312.234922758, 4069049.097594653, 4033434.5349058285, 4011851.7686528624, 3976083.217075855, 3949592.2147446494, 3938039.9457209697, 3899577.4734264542, 3881519.333619226, 3850186.2378393123, 3823198.4054184584, 3808773.2587171667, 3784495.988544207, 3761565.3901316617, 3746975.2839278127, 3723729.6121418993, 3699745.0362992603, 3680091.71095213, 3667951.1618961715, 3653602.1354131224, 3632124.2721405267, 3608965.8957424895, 3593001.8085261723, 3577712.029614562, 3560315.3167280406, 3552552.115932628, 3532308.4953923486, 3511941.7114949734, 3498429.793736455]

In [None]:
# Display how many error values were recorded; the plot below needs this to equal
# the number of entries in `cluster`.
len(error)

In [None]:
# Cluster counts matching the entries of `error`, starting at 2. The upper bound is
# derived from len(error) rather than hard-coded, so the x-axis stays aligned if the
# error list changes (same pattern as `cluster_diff` further down).
cluster = list(range(2, len(error) + 2))

In [None]:
# Plot error against cluster count, with a title and axis labels so the figure can
# be read on its own.
fig, ax = plt.subplots()
ax.plot(cluster, error)
ax.set(
    title='AWS KMeans: error vs. number of clusters',
    xlabel='Number of clusters (k)',
    ylabel='Error',
);

In [None]:
# Drop in error between every pair of consecutive entries (error[i] - error[i + 1]).
# zip over the list and its one-step shift visits all len(error) - 1 adjacent pairs,
# so the final pair is included whatever the list length is (a fixed range(0, 56)
# stops one pair short for the 58 recorded values).
error_diff = [current - following for current, following in zip(error, error[1:])]

In [None]:
# X positions for the differences: one integer per entry of error_diff, starting at 2.
cluster_diff = [offset + 2 for offset in range(len(error_diff))]

In [None]:
# Plot how much the error falls from one cluster count to the next, with a title and
# axis labels so the figure can be read on its own.
fig, ax = plt.subplots()
ax.plot(cluster_diff, error_diff)
ax.set(
    title='AWS KMeans: error reduction per additional cluster',
    xlabel='Number of clusters (k)',
    ylabel='Error reduction from k to k + 1',
);

# KMeans Model

In [None]:
from sklearn.cluster import KMeans

# KMeans initialisation is random; without a fixed seed the cluster numbering and
# centroids change between runs, which would invalidate the saved centroids CSV and
# the classifier trained on these labels. 142 is the seed this notebook already uses
# for train_test_split.
kmeans = KMeans(n_clusters=9, random_state=142)

In [None]:
# Fit the clustering model on the scaled user factors (the fitted estimator is displayed).
kmeans.fit(X=user_factors_scaled)

In [None]:
# Cluster label for every row of the scaled user factors.
preds = kmeans.predict(X=user_factors_scaled)

In [None]:
# Cluster centres of the fitted KMeans model; reused below for the centroid CSV,
# cm.cluster_distances and cm.centroid_ratings.
centroids = kmeans.cluster_centers_ 

In [None]:
# Wrap the centroid array in a DataFrame so it can be written to CSV.
centroids_df = pd.DataFrame(data=centroids)

In [None]:
# Persist the centroids (the DataFrame index is written as the first column).
centroids_df.to_csv(path_or_buf='../data/processed/centroids.csv')

In [None]:
# Display (rows, columns) of the centroid table; rows should equal n_clusters (9).
centroids_df.shape

# Gradient Boosting Machine

In [None]:
# Attach each row's KMeans cluster label to a copy of `user_factors`.
preds_df = pd.DataFrame({'cluster': preds})
user_factors_with_preds = user_factors.copy()
user_factors_with_preds['cluster'] = preds_df

# Target is the cluster label; features are every column except 'id' and the label.
y = preds
X = user_factors_with_preds.drop(columns=['id', 'cluster'])

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    shuffle=True,
    random_state=142,
)

In [None]:
# Train a gradient boosting classifier to predict the KMeans cluster from the user
# factors. max_features=7 makes each split consider a random subset of features, so
# the estimator is seeded to keep the persisted model reproducible between runs
# (142 is the seed this notebook already uses for train_test_split).
gbc = GradientBoostingClassifier(random_state=142)

# Single-point grid: GridSearchCV here only cross-validates (cv=5) this one
# configuration and then refits it on the full training split.
param_grid = {'learning_rate':[.15], 
             'max_depth': [10],
             'min_samples_split': [25],
             'n_estimators': [42],
             'max_features': [7]}
gs_gbc = GridSearchCV(gbc, param_grid, cv=5)
gs_gbc.fit(X_train, y_train)

gbc_train_preds = gs_gbc.predict(X_train)
gbc_test_preds = gs_gbc.predict(X_test)

# Displayed tuple is (test accuracy, train accuracy), in that order.
accuracy_score(y_test, gbc_test_preds), accuracy_score(y_train, gbc_train_preds)

In [None]:
# Confusion matrices, test split first and training split second, one per print line group.
print(
    confusion_matrix(y_test, gbc_test_preds),
    confusion_matrix(y_train, gbc_train_preds),
    sep='\n',
)

In [None]:
# Persist the fitted grid-search object (not just its best estimator) with joblib.
dump(value=gs_gbc, filename='../models/fifp_classification.joblib')

# Cluster Distances

In [None]:
# Build the cluster distance table from the KMeans centroids via the project helper
# in src.custom, then save it for later use.
# NOTE(review): the distance metric is defined inside cm.cluster_distances — not visible here.
cluster_distances = cm.cluster_distances(centroids)
cluster_distances.to_csv(path_or_buf='../data/processed/cluster_distances_df.csv')

In [None]:
# Load the item factors, using the 'id' column as the index.
item_factors_unstacked = pd.read_csv(
    filepath_or_buffer='../data/processed/item_factors.csv',
    index_col=['id'],
)

In [None]:
# Display the number of rows in the item factor table.
len(item_factors_unstacked)

In [None]:
# Transposed view of the item factors (rows and columns swapped).
# NOTE(review): the centroid_ratings call below receives the untransposed frame;
# confirm whether this variable is still needed.
item_factors_unstacked_transposed = item_factors_unstacked.transpose()

In [None]:
# Combine the centroids with the item factors through the project helper in src.custom
# and preview the first five rows of the result.
# NOTE(review): presumably predicted ratings per centroid and item — confirm in cm.centroid_ratings.
centroid_ratings_T_df = cm.centroid_ratings(
    centroids,
    item_factors_unstacked,
)
centroid_ratings_T_df.head(5)

# Recommendations

In [3]:
# Run the interactive recommender from src.recapp. Per the recorded output it prompts
# for a 1-5 ranking of a series of movies (press enter to skip an unseen title), so
# this cell needs keyboard input and will block an unattended "Run All".
ra.recommendations()

Enter a ranking for Schindler's List (1993) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.5
Enter a ranking for Human (2015) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.
Enter a ranking for Black Mirror: White Christmas (2014) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.
Enter a ranking for The Adventures of Sherlock Holmes and Doctor Watson: The Treasures of Agra (1983) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.
Enter a ranking for Operation 'Y' & Other Shurik's Adventures (1965) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.
Enter a ranking for The Adventures of Sherlock Holmes and Dr. Watson: Bloody Signature (1979) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.
Enter a ranking for Double Indemnity (1944) from 1 (lowest) to 5 (highest). If you have not seen the movie, press enter.
Enter a ranking f