# Capstone Project Part 2: Building a Streaming Platform Recommender Using Classification Models (Modelling)

## Import libraries, install packages and load user preference dataset

In [251]:
# One-time setup: install the PyCaret and XGBoost packages needed to run the models.
# The commands are left commented out; uncomment them on a fresh environment.
# %pip install pycaret
# %pip install xgboost

In [253]:
#import relevant libraries to run the models
import pandas as pd
from pycaret.classification import *
import pickle

In [228]:
# Read the user preference CSV into a DataFrame
user_perference_df = pd.read_csv(filepath_or_buffer='data/user.csv')

In [230]:
# Preview the first five rows to check the columns and values inside the dataset
user_perference_df.head(n=5)

Unnamed: 0,Age,Price,Average_IMDB_ratings,Original_shows_and_movies,Asian_movies_and_shows,Japanese_anime,US_animation,Superheroes_shows_and_movies,Documentaries,Platform_popularity,Total_quantity_of_shows_and_movies,Old_movies_and_shows_before_year_2000,Recommended_platform
0,25,15,9,8,3,4,7,8,6,5,6,2,Disney+
1,27,14,8,0,7,3,8,10,6,4,1,3,Disney+
2,28,15,1,10,1,6,0,5,3,6,6,6,Netflix
3,57,7,6,5,3,5,6,0,9,7,10,8,Amazon_Prime_Video
4,35,13,7,8,5,8,6,2,4,10,6,0,Netflix


## Running Pycaret classification modelling

In [232]:
# Initialise the PyCaret classification experiment on the user preference data.
# 'Recommended_platform' is the label to predict; a fixed session_id seeds the
# experiment so that results are reproducible between runs.
setup(
    data=user_perference_df,
    target='Recommended_platform',
    session_id=1,
)

Unnamed: 0,Description,Value
0,Session id,1
1,Target,Recommended_platform
2,Target type,Multiclass
3,Target mapping,"Amazon_Prime_Video: 0, Disney+: 1, Netflix: 2"
4,Original data shape,"(60, 13)"
5,Transformed data shape,"(60, 13)"
6,Transformed train set shape,"(42, 13)"
7,Transformed test set shape,"(18, 13)"
8,Numeric features,12
9,Preprocess,True


<pycaret.classification.oop.ClassificationExperiment at 0x1bee4cdfd50>

In [233]:
# Cross-validate multiple classifiers and keep the top-ranked one.
# The results table is ranked by the default evaluation metric (Accuracy),
# so best_model is the model with the highest mean cross-validated accuracy.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.88,0.9642,0.88,0.8992,0.8703,0.8197,0.8406,0.037
rf,Random Forest Classifier,0.86,0.9575,0.86,0.8783,0.85,0.7876,0.8083,0.044
nb,Naive Bayes,0.81,0.93,0.81,0.8033,0.7917,0.6959,0.7137,0.009
gbc,Gradient Boosting Classifier,0.81,0.0,0.81,0.82,0.7853,0.7149,0.7551,0.055
lr,Logistic Regression,0.805,0.0,0.805,0.8325,0.7853,0.7025,0.7451,0.566
xgboost,Extreme Gradient Boosting,0.79,0.945,0.79,0.83,0.7767,0.6904,0.7246,0.018
dt,Decision Tree Classifier,0.785,0.8333,0.785,0.74,0.7392,0.6729,0.7338,0.008
ridge,Ridge Classifier,0.765,0.0,0.765,0.7325,0.722,0.6476,0.6923,0.008
lda,Linear Discriminant Analysis,0.765,0.0,0.765,0.7325,0.722,0.6476,0.6923,0.009
ada,Ada Boost Classifier,0.72,0.0,0.72,0.7417,0.697,0.5711,0.5988,0.022


Processing:   0%|          | 0/65 [00:00<?, ?it/s]

## Top 3 models (et, rf, nb)

1. [Extra Trees Classifier Model](#1.-Extra-Trees-Classifier-Model)
2. [Random Forest Classifier Model](#2.-Random-Forest-Classifier-Model)
3. [Naive Bayes Model](#3.-Naive-Bayes-Model)

#### 1. Extra Trees Classifier Model

In [235]:
# Fit an Extra Trees classifier; the per-fold cross-validation scores are displayed
et_model = create_model(estimator='et')
# No `data` argument is given, so predict_model scores the hold-out test split
# created by setup() and returns it with prediction_label / prediction_score added
predict_model(estimator=et_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.8,0.9333,0.8,0.8667,0.7867,0.6875,0.735
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.5,0.9167,0.5,0.625,0.5,0.2727,0.3
6,0.75,0.7917,0.75,0.875,0.75,0.6364,0.7
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.75,1.0,0.75,0.625,0.6667,0.6,0.6708


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9444,0.9838,0.9444,0.9524,0.9441,0.9167,0.9209


Unnamed: 0,Age,Price,Average_IMDB_ratings,Original_shows_and_movies,Asian_movies_and_shows,Japanese_anime,US_animation,Superheroes_shows_and_movies,Documentaries,Platform_popularity,Total_quantity_of_shows_and_movies,Old_movies_and_shows_before_year_2000,Recommended_platform,prediction_label,prediction_score
42,16,13,10,6,5,8,4,8,8,9,1,2,Disney+,Disney+,0.46
49,18,8,3,2,6,4,7,1,6,3,8,4,Amazon_Prime_Video,Amazon_Prime_Video,0.69
31,11,7,2,8,5,10,6,5,6,3,10,9,Amazon_Prime_Video,Amazon_Prime_Video,0.67
25,43,13,7,8,8,0,5,7,10,2,5,0,Netflix,Netflix,0.62
4,35,13,7,8,5,8,6,2,4,10,6,0,Netflix,Netflix,0.66
37,44,14,9,6,8,7,1,10,6,2,6,5,Disney+,Disney+,0.52
41,30,11,8,10,5,2,2,7,4,8,4,10,Netflix,Disney+,0.48
40,33,9,1,9,5,6,7,7,10,2,8,4,Amazon_Prime_Video,Amazon_Prime_Video,0.69
50,49,13,5,7,6,5,10,8,5,1,2,3,Disney+,Disney+,0.59
7,27,11,8,7,6,6,9,10,2,0,4,0,Disney+,Disney+,0.65


#### 2. Random Forest Classifier Model

In [236]:
# Fit a Random Forest classifier; the per-fold cross-validation scores are displayed
rf_model = create_model(estimator='rf')
# No `data` argument is given, so predict_model scores the hold-out test split
# created by setup() and returns it with prediction_label / prediction_score added
predict_model(estimator=rf_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.6,0.8667,0.6,0.7,0.6,0.4118,0.4375
2,1.0,1.0,1.0,1.0,1.0,1.0,1.0
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.5,0.9167,0.5,0.625,0.5,0.2727,0.3
6,0.75,0.7917,0.75,0.875,0.75,0.6364,0.7
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.75,1.0,0.75,0.5833,0.65,0.5556,0.6455


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Random Forest Classifier,0.9444,0.9954,0.9444,0.9524,0.9441,0.9167,0.9209


Unnamed: 0,Age,Price,Average_IMDB_ratings,Original_shows_and_movies,Asian_movies_and_shows,Japanese_anime,US_animation,Superheroes_shows_and_movies,Documentaries,Platform_popularity,Total_quantity_of_shows_and_movies,Old_movies_and_shows_before_year_2000,Recommended_platform,prediction_label,prediction_score
42,16,13,10,6,5,8,4,8,8,9,1,2,Disney+,Disney+,0.53
49,18,8,3,2,6,4,7,1,6,3,8,4,Amazon_Prime_Video,Amazon_Prime_Video,0.69
31,11,7,2,8,5,10,6,5,6,3,10,9,Amazon_Prime_Video,Amazon_Prime_Video,0.74
25,43,13,7,8,8,0,5,7,10,2,5,0,Netflix,Netflix,0.59
4,35,13,7,8,5,8,6,2,4,10,6,0,Netflix,Netflix,0.69
37,44,14,9,6,8,7,1,10,6,2,6,5,Disney+,Disney+,0.53
41,30,11,8,10,5,2,2,7,4,8,4,10,Netflix,Disney+,0.43
40,33,9,1,9,5,6,7,7,10,2,8,4,Amazon_Prime_Video,Amazon_Prime_Video,0.7
50,49,13,5,7,6,5,10,8,5,1,2,3,Disney+,Disney+,0.77
7,27,11,8,7,6,6,9,10,2,0,4,0,Disney+,Disney+,0.71


#### 3. Naive Bayes Model

In [237]:
# Fit a Naive Bayes classifier; the per-fold cross-validation scores are displayed
nb_model = create_model(estimator='nb')
# No `data` argument is given, so predict_model scores the hold-out test split
# created by setup() and returns it with prediction_label / prediction_score added
predict_model(estimator=nb_model)

Unnamed: 0_level_0,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
Fold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
1,0.6,0.8,0.6,0.7,0.6,0.4118,0.4375
2,0.75,0.875,0.75,0.875,0.75,0.6364,0.7
3,1.0,1.0,1.0,1.0,1.0,1.0,1.0
4,1.0,1.0,1.0,1.0,1.0,1.0,1.0
5,0.75,0.9167,0.75,0.625,0.6667,0.6,0.6708
6,0.5,0.7917,0.5,0.5,0.5,0.2,0.2
7,1.0,1.0,1.0,1.0,1.0,1.0,1.0
8,1.0,1.0,1.0,1.0,1.0,1.0,1.0
9,0.5,0.9167,0.5,0.3333,0.4,0.1111,0.1291


Processing:   0%|          | 0/4 [00:00<?, ?it/s]

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Naive Bayes,0.8889,0.9815,0.8889,0.8889,0.8889,0.8333,0.8333


Unnamed: 0,Age,Price,Average_IMDB_ratings,Original_shows_and_movies,Asian_movies_and_shows,Japanese_anime,US_animation,Superheroes_shows_and_movies,Documentaries,Platform_popularity,Total_quantity_of_shows_and_movies,Old_movies_and_shows_before_year_2000,Recommended_platform,prediction_label,prediction_score
42,16,13,10,6,5,8,4,8,8,9,1,2,Disney+,Disney+,0.5129
49,18,8,3,2,6,4,7,1,6,3,8,4,Amazon_Prime_Video,Amazon_Prime_Video,0.9936
31,11,7,2,8,5,10,6,5,6,3,10,9,Amazon_Prime_Video,Amazon_Prime_Video,1.0
25,43,13,7,8,8,0,5,7,10,2,5,0,Netflix,Netflix,0.9821
4,35,13,7,8,5,8,6,2,4,10,6,0,Netflix,Netflix,0.9997
37,44,14,9,6,8,7,1,10,6,2,6,5,Disney+,Netflix,0.5057
41,30,11,8,10,5,2,2,7,4,8,4,10,Netflix,Disney+,0.6261
40,33,9,1,9,5,6,7,7,10,2,8,4,Amazon_Prime_Video,Amazon_Prime_Video,0.9998
50,49,13,5,7,6,5,10,8,5,1,2,3,Disney+,Disney+,0.8609
7,27,11,8,7,6,6,9,10,2,0,4,0,Disney+,Disney+,0.9837


## Results
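Using the outputs of the 3 models above, the Train Accuracy (mean 10-fold cross-validation accuracy on the training split), Test Accuracy (accuracy on the hold-out test split), Discrepancy (the absolute difference between the test and train results) and the time taken (TT, in seconds) have been gathered into the table below for easy comparison.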

| Models                     | Train Accuracy | Test Accuracy | Discrepancy | TT(Sec)    |
|----------------------------|----------------|---------------|-------------|------------|
| **Extra Trees Classifier** | **0.8800**     | **0.9444**    | **0.0644**  | **0.0370** |
| Random Forest Classifier   | 0.8600         | 0.9444        | 0.0844      | 0.0440     |
| Naive Bayes                | 0.8100         | 0.8889        | 0.0789      | 0.0090     |

**Conclusion: After comparing the 3 models, the Extra Trees Classifier (et) is chosen as the best model. It has the highest train accuracy, the joint-highest test accuracy (tied with the Random Forest Classifier) and the lowest discrepancy between its train and test results. The model with the lowest discrepancy is the model we will use.**

## Export best performing model (et) into pickle file

In [246]:
# Serialise the fitted Extra Trees classifier to model.pkl with pickle.
# NOTE(review): this pickles only the object returned by create_model('et');
# confirm that whatever loads model.pkl does not also need PyCaret's
# preprocessing pipeline or target label mapping.
with open('model.pkl', 'wb') as f:
    pickle.Pickler(f).dump(et_model)

print("Extra Trees Classifier model saved as model.pkl")

Extra Trees Classifier model saved as model.pkl
