<h1>Import all libraries and reading explored data into Dataframe</h1>

In [5]:
import re
import io

#General libraries needed
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

#Libraries for data pre-processing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn import preprocessing

#Libraries for data pre-processing (Log Loss)
from sklearn.datasets import make_blobs
from sklearn.calibration import CalibratedClassifierCV
from sklearn.metrics import log_loss

#For Decision Tree implementation
from scipy.stats import entropy
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree

#For KNN implementation
from sklearn.neighbors import KNeighborsClassifier

#For Bagging implementation
from sklearn.ensemble import BaggingClassifier

#For AdaBoost implementation
from sklearn.ensemble import AdaBoostClassifier
from sklearn.naive_bayes import GaussianNB

#For Random Forest implementation
from sklearn.ensemble import RandomForestClassifier

#For Baseline implementation
from sklearn.dummy import DummyClassifier

#For Logistic Regression
from sklearn.linear_model import LogisticRegression

#For Ensemble
from sklearn.ensemble import VotingClassifier

#Settings
%matplotlib inline
pd.options.display.float_format = '{:.2f}'.format
np.set_printoptions(threshold=np.nan)
sns.set()

def printModelAccuracy(y_test, y_pred):
    # Find the confusion matrix of the result
    cm = pd.DataFrame(confusion_matrix(y_test, y_pred, labels=[1, 2, 3, 4, 5]), \
        index=['true:1', 'true:2', 'true:3', 'true:4', 'true:5'], 
        columns=['pred:1', 'pred:2', 'pred:3', 'pred:4', 'pred:5'])
    print("Confusion Matrix:")
    print(cm)

    # Find the accuracy and F1 score of the result
    asr = accuracy_score(y_test, y_pred)
    f1 = f1_score(y_test, y_pred, average='macro')
    print("Accuracy:", asr)
    print("F1:", f1)
    """
    # Log loss
    score = log_loss(y_test, y_pred)
    print("Log Loss:", score)
    """
    
# Read from dataframe
dfnum = pd.read_pickle("../3. Exploratory Data Analysis/explored_data")
dfnum = dfnum.replace([np.inf, -np.inf, np.nan], 0) #removing infinite/nan values
df = dfnum.drop(['id'], 1)

# Check the columns using dtypes
print(df.dtypes)
# Randomly sample 5 records with .sample(5)
df.sample(5)

budget                 float64
revenue                float64
weekday                float64
day                    float64
month                  float64
year                   float64
runtime                float64
vote_average           float64
vote_count             float64
weighted_rating        float64
log_revenue            float64
log_budget             float64
log_runtime            float64
log_vote_average       float64
log_vote_count         float64
log_weighted_rating    float64
bin                    float64
dtype: object


Unnamed: 0,budget,revenue,weekday,day,month,year,runtime,vote_average,vote_count,weighted_rating,log_revenue,log_budget,log_runtime,log_vote_average,log_vote_count,log_weighted_rating,bin
9938,0.0,52738200.0,1.0,27.0,4.0,1987.0,93.0,4.0,2.0,7.0,17.78,0.0,4.53,1.39,0.69,1.95,5.0
15554,2000000.0,1944150.0,1.0,1.0,1.0,2007.0,97.0,6.7,32.0,7.0,14.48,14.51,4.57,1.9,3.47,1.95,3.0
7903,618000.0,867000.0,5.0,2.0,3.0,2007.0,117.0,6.5,1.0,7.0,13.67,13.33,4.76,1.87,0.0,1.95,3.0
13027,0.0,6077816.0,4.0,18.0,9.0,2003.0,136.0,6.5,15.0,7.0,15.62,0.0,4.91,1.87,2.71,1.95,3.0
12178,28000000.0,16900000.0,5.0,25.0,10.0,1996.0,132.0,6.9,184.0,7.0,16.64,17.15,4.88,1.93,5.21,1.95,4.0


In [None]:
#Decision Tree in SKLearn don't take in string well. So we use a label encoder to change that string to a numeric value
"""
for column in df.columns:
    if df[column].dtype == type(object):
        #Create the label encoder
        le = preprocessing.LabelEncoder()
        #Convert the non numeric data to numeric
        df[column] = le.fit_transform(df[column])
"""

In [8]:
# Combine datasets into one df (without PCA)

# Read from text dataframes (before PCA)
print(dfnum.shape)

dfcasts = pd.read_pickle("../2. Data Preprocessing/dfcasts")
dfcasts.columns = [x[0] for x in dfcasts.columns]
dfcasts = dfcasts.sort_values('id')
dfcasts.drop(['id'], 1, inplace=True)
# print(dfcasts.sample(5))
print(dfcasts.shape)

dfdirectors = pd.read_pickle("../2. Data Preprocessing/dfdirectors")
dfdirectors.columns = [x[0] for x in dfdirectors.columns]
dfdirectors = dfdirectors.sort_values('id')
dfdirectors.drop(['id'], 1, inplace=True)
# print(dfdirectors.sample(5))
print(dfdirectors.shape)

dfgenres = pd.read_pickle("../2. Data Preprocessing/dfgenres")
dfgenres.columns = [x[0] for x in dfgenres.columns]
dfgenres = dfgenres.sort_values('id')
dfgenres.drop(['title'], 1, inplace=True) # keep id here as genres has no missing values
# print(dfgenres.sample(5))
print(dfgenres.shape)

dfoverview_s = pd.read_pickle("../2. Data Preprocessing/dfoverviewmostcommon")
dfoverview_s.columns = [x[0] for x in dfoverview_s.columns]
dfoverview_s = dfoverview_s.sort_values('id')
dfoverview_s.drop(['id', 'title'], 1, inplace=True)
# print(dfoverview_s.sample(5))
print(dfoverview_s.shape)

dfproductioncompanies_s = pd.read_pickle("../2. Data Preprocessing/dfproductioncompaniesmostcommon")
dfproductioncompanies_s.columns = [x[0] for x in dfproductioncompanies_s.columns]
dfproductioncompanies_s = dfproductioncompanies_s.sort_values('id')
dfproductioncompanies_s.drop(['id', 'title'], 1, inplace=True)
# print(dfproductioncompanies_s.sample(5))
print(dfproductioncompanies_s.shape)

# COMMENTED OUT overview & production companies because of MEMORY ERROR

# dfoverview = pd.read_pickle("../2. Data Preprocessing/dfoverview")
# dfoverview.columns = [x[0] for x in dfoverview.columns]
# dfoverview = dfoverview.sort_values('id')
# dfoverview.drop(['id', 'title'], 1, inplace=True)
# # print(dfoverview.sample(5))
# print(dfoverview.shape)

# dfproductioncompanies = pd.read_pickle("../2. Data Preprocessing/dfproductioncompanies")
# dfproductioncompanies.columns = [x[0] for x in dfproductioncompanies.columns]
# dfproductioncompanies = dfproductioncompanies.sort_values('id')
# dfproductioncompanies.drop(['id', 'title'], 1, inplace=True)
# # print(dfproductioncompanies.sample(5))
# print(dfproductioncompanies.shape)

# Combine dataframes - MEMORY ERROR
final_df = pd.concat([dfcasts, dfdirectors, dfgenres])
# final_df = pd.concat([dfcasts, dfdirectors, dfgenres, dfoverview_s, dfproductioncompanies_s])
final_df["id"] = pd.to_numeric(final_df["id"])
print(final_df.shape) # 29 cols disappeared?
# final_df = pd.concat([dfcasts, dfdirectors, dfgenres, dfoverview_s, dfproductioncompanies])
final_df = pd.merge(dfnum, final_df, on='id', how='left')
final_df = final_df.replace([np.inf, -np.inf, np.nan], 0) #removing infinite/nan values

# Check the columns using dtypes
print(final_df.dtypes)

# Randomly sample 5 records with .sample(5)
print(final_df.sample(5))
print(final_df.shape)

(19560, 18)
(22775, 942)
(22775, 507)
(23579, 20)


MemoryError: 

In [None]:
# PCA data
dfpca = pd.read_pickle("../3. Exploratory Data Analysis/pca_data")
# dfpca = dfpca.replace([np.inf, -np.inf, np.nan], 0) #removing infinite/nan values
dfpca = dfpca.drop(['id'], 1)

<h1>Find out the number of records per revenue bin. </h1>

In [None]:
# Using groupby, find out the number of reviews with
# positive and negative sentiment respectively.
df_target = df.groupby('bin').size().reset_index(name='n')
print(df_target)

# How many patients in the dataset have been diagnosed positive and negative for diabetes?
fig = plt.figure(figsize=(6, 6))
ax1 = fig.add_subplot(111)
df_target.plot(kind='bar', x='bin', y='n', title = "Target class count", ax=ax1)
ax1.set_ylabel("No. of Movies")
plt.xticks(np.arange(0,5), ["<35k", "35k to 650k", "650k to 800k", "800k to 45mil", ">45mil"])
plt.show()

<h1>Train-Test Split</h1>

In [None]:
#X = df.loc[:, df.columns != 'bin']
X = df[['budget', 'weekday', 'day', 'month', 'year', 'runtime', 'weighted_rating']]
#X = df[['log_budget', 'weekday', 'day', 'month', 'year', 'log_runtime', 'log_weighted_rating']]
y = df[['bin']]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=0)

<h1>Baseline Classifier (Decision Tree)</h1>

In [None]:
parameters = {
    'max_depth' : list(range(5, 10))
}

decisionTree = GridSearchCV(DecisionTreeClassifier(), cv=3, param_grid=parameters)
#Fit the training feature Xs and training label Ys
decisionTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = decisionTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:",decisionTree.best_params_)

<h1>1. k-Nearest Neighbor (KNN)</h1>

Refer to the following links on for detail explanation on the implementation:
- [kNN Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html)
- [DataCamp Implementation](https://www.datacamp.com/community/tutorials/k-nearest-neighbor-classification-scikit-learn)



In [None]:
#Create the kNN classifier and set the number of neighbors. Note that you can tune this number of neighbors
knn = KNeighborsClassifier()

parameters = {'n_neighbors':list(range(1, 10)),
              'leaf_size':[1,3,5],
              'algorithm':['auto', 'kd_tree'],
              'n_jobs':[-1]}

#Fit the training feature Xs and training label Ys
knn = GridSearchCV(
        knn, 
        cv=3, 
        param_grid=parameters, 
        scoring='f1_macro')

knn.fit(X_train,y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = knn.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:",knn.best_params_)

<h1>2. Bagging (with Decision Tree)</h1>

Refer to the following links on for detail explanation on the implementation:
- [Bagging Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.BaggingClassifier.html)

*Note that the default AdaBoost implementation in SKLearn is Decision Tree 

In [None]:
#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
parameters = {
    'base_estimator__max_depth' : list(range(5, 10)),
    'n_estimators' : [100, 200],
    'max_features' : [0.5, 0.6, 0.7],
    'max_samples' : [0.6, 0.7]
}

baggingTree = GridSearchCV(
                BaggingClassifier(DecisionTreeClassifier()), 
                cv=3,
                param_grid=parameters, 
                scoring='f1_macro')

#Fit the training feature Xs and training label Ys
baggingTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = baggingTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:",baggingTree.best_params_)

<h1>3. Bagging (with kNN)</h1>

In [None]:
#Create the kNN base classifier
parameters = {
    'n_estimators' : [100, 200],
    'max_features' : [0.5, 0.6, 0.7],
    'max_samples' : [0.6, 0.7]
}

baggingknn = GridSearchCV(
                BaggingClassifier(KNeighborsClassifier(algorithm='auto', leaf_size=3, n_jobs=-1, n_neighbors=9)), 
                cv=3,
                param_grid=parameters, 
                scoring='f1_macro')

#Create the Bagging classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
#baggingknn = BaggingClassifier(n_estimators=50, base_estimator=knn)

#Fit the training feature Xs and training label Ys
baggingknn.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = baggingknn.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:",baggingknn.best_params_)

<h1>4. AdaBoost (with Decision Tree)</h1>

Refer to the following links on for detail explanation on the implementation:
- [AdaBoost Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html)
- [DataCamp Implementation](https://www.datacamp.com/community/tutorials/adaboost-classifier-python)
- [Setting Learning Rate and N Estimators](https://stats.stackexchange.com/questions/82323/shrinkage-parameter-in-adaboost)

*Note that the default AdaBoost implementation in SKLearn is Decision Tree 


In [None]:
#Create the AdaBoost classifier. Default base classifiers is Decision Tree. 
# - n_estimator is the number of base classifiers (i.e. weak learners)
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
# - learning_rate controls the weight adjustments of each base classifiers. Default is 1
parameters = {"base_estimator__max_depth" : list(range(5, 10)),
              "base_estimator__criterion" : ["gini", "entropy"],
              "base_estimator__splitter" :   ["best", "random"],
              "n_estimators": [1, 50, 100, 200],
              "learning_rate": [1, 2]
             }

adaboostTree = GridSearchCV(AdaBoostClassifier(DecisionTreeClassifier()), cv=3, param_grid=parameters)

#Fit the training feature Xs and training label Ys
adaboostTree.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = adaboostTree.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)
print("Best Parameters:",adaboostTree.best_params_)

<h1>5. AdaBoost (with Gaussian Navie Bayes)</h1>

Refer to the following links on for detail explanation on the implementation:
- [Gaussian Naive Bayes Classifier SKLearn Documentation](https://scikit-learn.org/stable/modules/generated/sklearn.naive_bayes.GaussianNB.html)
- [Naive Bayes Classifier video](https://www.youtube.com/watch?v=CPqOCI0ahss)

In [None]:
naivebayes = GaussianNB()
#Fit the training feature Xs and training label Ys
naivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = naivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

In [None]:
nb = GaussianNB()

adaboostnaivebayes = AdaBoostClassifier(n_estimators=50,learning_rate=1, base_estimator=nb)
#model = BaggingClassifier(n_estimators=50, base_estimator=knn)

#Fit the training feature Xs and training label Ys
adaboostnaivebayes.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = adaboostnaivebayes.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

<h1>6. Random Forest</h1>

In [None]:
#Instantiate model
randomforest = RandomForestClassifier()

parameters = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6],
    'criterion' :['gini', 'entropy']
}

#Fit the training feature Xs and training label Ys
randomforest = GridSearchCV(randomforest, cv=3, param_grid=parameters, scoring='f1_macro')
randomforest.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = randomforest.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

# Best hyperparameters to use for model
print("Best Parameters:", randomforest.best_params_)

<h1>7. Logistic Regression</h1>

In [None]:
#create a new logistic regression model ‘lbfgs’, ‘sag’ and ‘newton-cg’ solvers.
log_reg = LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000)

#fit the model to the training data
log_reg.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = log_reg.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)

<h1>8. Ensemble (Stacking with all models)</h1>

As I mentioned in lecture, it is possible to ensemble different models. So how can we do that in python? Check out the following link and try it for your project!:
https://machinelearningmastery.com/ensemble-machine-learning-algorithms-python-scikit-learn/ 

In [None]:
#knn with best parameters
knn = KNeighborsClassifier(algorithm='auto', leaf_size=3, n_jobs=-1, n_neighbors=9)
knn.fit(X_train,y_train.values.ravel())

#baggingTree with best parameters
baggingTree = BaggingClassifier(DecisionTreeClassifier(max_depth=8), max_features=0.7, max_samples=0.5, n_estimators=100)
baggingTree.fit(X_train, y_train.values.ravel())

#baggingknn with best parameters
baggingknn = BaggingClassifier(knn, max_features=0.5, max_samples=0.7, n_estimators=200)
baggingknn.fit(X_train, y_train.values.ravel())

#Adaboost(DecisionTree) with best parameters
adaboostTree = AdaBoostClassifier(DecisionTreeClassifier(criterion='gini', max_depth=8, splitter='best'), learning_rate=2, n_estimators=1)
adaboostTree.fit(X_train, y_train.values.ravel())

#random forest with best parameters
randomforest = RandomForestClassifier(criterion='gini', max_depth=6, max_features='log2', n_estimators=500)
randomforest.fit(X_train, y_train.values.ravel())

#create a dictionary of our models
estimators=[('knn', knn), 
            ('baggingTree', baggingTree),
            ('baggingknn', baggingknn),
            ('adaboostTree', adaboostTree),
            #('naivebayes', naivebayes),
            #('adaboostnaivebayes', adaboostnaivebayes),
            ('randomforest', randomforest), 
            ('log_reg', log_reg)]

#create our voting classifier, inputting our models, voting hard means asking classifers to make predictions by majority vote
ensemble = VotingClassifier(estimators, voting='hard')

#fit model to training data
ensemble.fit(X_train, y_train.values.ravel())

#Use the trained model to predict the test data
y_pred = ensemble.predict(X_test)

# Find the confusion matrix, the accuracy, and F1 score of the result
printModelAccuracy(y_test, y_pred)