In [1]:
# Machine Learning Models

In [2]:
# Import Dependenceis
import pandas as pd
import numpy as np
import sys
from collections import Counter

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.ensemble import GradientBoostingClassifier
from imblearn.over_sampling import RandomOverSampler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import balanced_accuracy_score
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import ClusterCentroids
from imblearn.combine import SMOTEENN

In [3]:
# Read in Dataset
movies_df = pd.read_csv("Resources/movies_sql.csv")
movies_df.head()

Unnamed: 0,index,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,original_title,production_budget,worldwide_gross
0,7445,Avengers: Endgame,PG-13,Action,2019,"April 26, 2019 (United States)",8.4,903000,Anthony Russo,Christopher Markus,Robert Downey Jr.,United States,356000000,2797501328,Marvel Studios,181.0,Avengers: Endgame,400000000,2797800564
1,6644,Avengers: Age of Ultron,PG-13,Action,2015,"May 1, 2015 (United States)",7.3,777000,Joss Whedon,Joss Whedon,Robert Downey Jr.,United States,250000000,1402809540,Marvel Studios,141.0,Avengers: Age of Ultron,365000000,1395316979
2,7244,Avengers: Infinity War,PG-13,Action,2018,"April 27, 2018 (United States)",8.4,897000,Anthony Russo,Christopher Markus,Robert Downey Jr.,United States,321000000,2048359754,Marvel Studios,149.0,Avengers: Infinity War,300000000,2048359754
3,7066,Justice League,PG-13,Action,2017,"November 17, 2017 (United States)",6.1,418000,Zack Snyder,Jerry Siegel,Ben Affleck,United States,300000000,657926987,Warner Bros.,120.0,Justice League,300000000,655945209
4,6665,Spectre,PG-13,Action,2015,"November 6, 2015 (United States)",6.8,393000,Sam Mendes,John Logan,Daniel Craig,United Kingdom,245000000,880681519,B24,148.0,Spectre,300000000,879500760


In [4]:
# Create df for ML
ml_movies = movies_df[["name", "genre", "director", "writer", "star", "budget", "gross", "runtime"]].copy()

In [5]:
ml_movies.tail()

Unnamed: 0,name,genre,director,writer,star,budget,gross,runtime
3924,Scream,Horror,Byron Quisenberry,Byron Quisenberry,Pepper Martin,24000000,138883016,82.0
3925,Scream,Horror,Byron Quisenberry,Byron Quisenberry,Pepper Martin,15000000,173046663,82.0
3926,Wuthering Heights,Drama,Jacques Rivette,Pascal Bonitzer,Fabienne Babe,8000000,2721534,130.0
3927,The Wizard of Oz,Animation,Fumihiko Takayama,Yoshimitsu Banno,Aileen Quinn,2777000,34949482,78.0
3928,Cure,Crime,Kiyoshi Kurosawa,Kiyoshi Kurosawa,Masato Hagiwara,10000,94596,111.0


In [6]:
ml_movies.dtypes

name         object
genre        object
director     object
writer       object
star         object
budget        int64
gross         int64
runtime     float64
dtype: object

In [7]:
# create function for building target variable 
def target_creation(row):
    if row['budget'] >= row['gross']:
        return 0
    
    if row['budget'] < row['gross']:
        return 1
    else:
        return np.nan

In [8]:
ml_movies.apply(lambda row: target_creation(row), axis=1)

0       1
1       1
2       1
3       1
4       1
       ..
3924    1
3925    1
3926    0
3927    1
3928    1
Length: 3929, dtype: int64

In [9]:
ml_movies['movie_success'] = ml_movies.apply(lambda row: target_creation(row), axis=1)

In [10]:
ml_movies.movie_success.value_counts()

1    2892
0    1037
Name: movie_success, dtype: int64

In [11]:
ml_movies.head()

Unnamed: 0,name,genre,director,writer,star,budget,gross,runtime,movie_success
0,Avengers: Endgame,Action,Anthony Russo,Christopher Markus,Robert Downey Jr.,356000000,2797501328,181.0,1
1,Avengers: Age of Ultron,Action,Joss Whedon,Joss Whedon,Robert Downey Jr.,250000000,1402809540,141.0,1
2,Avengers: Infinity War,Action,Anthony Russo,Christopher Markus,Robert Downey Jr.,321000000,2048359754,149.0,1
3,Justice League,Action,Zack Snyder,Jerry Siegel,Ben Affleck,300000000,657926987,120.0,1
4,Spectre,Action,Sam Mendes,John Logan,Daniel Craig,245000000,880681519,148.0,1


In [12]:
# Check genre value counts
ml_movies.genre.value_counts()

Action       1104
Comedy        947
Drama         642
Crime         271
Biography     257
Adventure     235
Animation     216
Horror        207
Fantasy        23
Mystery         9
Family          5
Sci-Fi          5
Thriller        5
Romance         3
Name: genre, dtype: int64

In [13]:
# Check director value counts
ml_movies.director.value_counts()

Clint Eastwood       23
Ridley Scott         21
Steven Spielberg     21
Steven Soderbergh    19
Tim Burton           18
                     ..
Jessy Terrero         1
Josef Rusnak          1
Kinka Usher           1
Bruce Paltrow         1
Fumihiko Takayama     1
Name: director, Length: 1622, dtype: int64

In [14]:
# Check writer value counts
ml_movies.writer.value_counts()

Stephen King          20
Woody Allen           16
Luc Besson            15
Kevin Smith           10
M. Night Shyamalan    10
                      ..
Vanessa Taylor         1
Dick King-Smith        1
Louis Begley           1
Isaac Marion           1
Yoshimitsu Banno       1
Name: writer, Length: 2369, dtype: int64

In [15]:
ml_movies.star.value_counts()

Robert De Niro          30
Bruce Willis            29
Denzel Washington       29
Tom Cruise              29
Nicolas Cage            29
                        ..
Darci Kistler            1
Jennifer Jason Leigh     1
F. Murray Abraham        1
Brittany Snow            1
Masato Hagiwara          1
Name: star, Length: 1392, dtype: int64

In [16]:
# Create LabelEncoder object
le = LabelEncoder()

In [17]:
ml_movies['genre'] = le.fit_transform(ml_movies['genre'])
ml_movies

Unnamed: 0,name,genre,director,writer,star,budget,gross,runtime,movie_success
0,Avengers: Endgame,0,Anthony Russo,Christopher Markus,Robert Downey Jr.,356000000,2797501328,181.0,1
1,Avengers: Age of Ultron,0,Joss Whedon,Joss Whedon,Robert Downey Jr.,250000000,1402809540,141.0,1
2,Avengers: Infinity War,0,Anthony Russo,Christopher Markus,Robert Downey Jr.,321000000,2048359754,149.0,1
3,Justice League,0,Zack Snyder,Jerry Siegel,Ben Affleck,300000000,657926987,120.0,1
4,Spectre,0,Sam Mendes,John Logan,Daniel Craig,245000000,880681519,148.0,1
...,...,...,...,...,...,...,...,...,...
3924,Scream,9,Byron Quisenberry,Byron Quisenberry,Pepper Martin,24000000,138883016,82.0,1
3925,Scream,9,Byron Quisenberry,Byron Quisenberry,Pepper Martin,15000000,173046663,82.0,1
3926,Wuthering Heights,6,Jacques Rivette,Pascal Bonitzer,Fabienne Babe,8000000,2721534,130.0,0
3927,The Wizard of Oz,2,Fumihiko Takayama,Yoshimitsu Banno,Aileen Quinn,2777000,34949482,78.0,1


In [18]:
ml_movies['director'] = le.fit_transform(ml_movies['director'])
ml_movies

Unnamed: 0,name,genre,director,writer,star,budget,gross,runtime,movie_success
0,Avengers: Endgame,0,91,Christopher Markus,Robert Downey Jr.,356000000,2797501328,181.0,1
1,Avengers: Age of Ultron,0,834,Joss Whedon,Robert Downey Jr.,250000000,1402809540,141.0,1
2,Avengers: Infinity War,0,91,Christopher Markus,Robert Downey Jr.,321000000,2048359754,149.0,1
3,Justice League,0,1618,Jerry Siegel,Ben Affleck,300000000,657926987,120.0,1
4,Spectre,0,1361,John Logan,Daniel Craig,245000000,880681519,148.0,1
...,...,...,...,...,...,...,...,...,...
3924,Scream,9,197,Byron Quisenberry,Pepper Martin,24000000,138883016,82.0,1
3925,Scream,9,197,Byron Quisenberry,Pepper Martin,15000000,173046663,82.0,1
3926,Wuthering Heights,6,613,Pascal Bonitzer,Fabienne Babe,8000000,2721534,130.0,0
3927,The Wizard of Oz,2,488,Yoshimitsu Banno,Aileen Quinn,2777000,34949482,78.0,1


In [19]:
ml_movies['writer'] = le.fit_transform(ml_movies['writer'])
ml_movies

Unnamed: 0,name,genre,director,writer,star,budget,gross,runtime,movie_success
0,Avengers: Endgame,0,91,382,Robert Downey Jr.,356000000,2797501328,181.0,1
1,Avengers: Age of Ultron,0,834,1228,Robert Downey Jr.,250000000,1402809540,141.0,1
2,Avengers: Infinity War,0,91,382,Robert Downey Jr.,321000000,2048359754,149.0,1
3,Justice League,0,1618,1039,Ben Affleck,300000000,657926987,120.0,1
4,Spectre,0,1361,1135,Daniel Craig,245000000,880681519,148.0,1
...,...,...,...,...,...,...,...,...,...
3924,Scream,9,197,291,Pepper Martin,24000000,138883016,82.0,1
3925,Scream,9,197,291,Pepper Martin,15000000,173046663,82.0,1
3926,Wuthering Heights,6,613,1733,Fabienne Babe,8000000,2721534,130.0,0
3927,The Wizard of Oz,2,488,2359,Aileen Quinn,2777000,34949482,78.0,1


In [20]:
ml_movies['star'] = le.fit_transform(ml_movies['star'])
ml_movies

Unnamed: 0,name,genre,director,writer,star,budget,gross,runtime,movie_success
0,Avengers: Endgame,0,91,382,1115,356000000,2797501328,181.0,1
1,Avengers: Age of Ultron,0,834,1228,1115,250000000,1402809540,141.0,1
2,Avengers: Infinity War,0,91,382,1115,321000000,2048359754,149.0,1
3,Justice League,0,1618,1039,113,300000000,657926987,120.0,1
4,Spectre,0,1361,1135,288,245000000,880681519,148.0,1
...,...,...,...,...,...,...,...,...,...
3924,Scream,9,197,291,1055,24000000,138883016,82.0,1
3925,Scream,9,197,291,1055,15000000,173046663,82.0,1
3926,Wuthering Heights,6,613,1733,425,8000000,2721534,130.0,0
3927,The Wizard of Oz,2,488,2359,21,2777000,34949482,78.0,1


In [21]:
# Prepare dataframe for ml variables
encoded_movies = ml_movies.drop('name', axis=1)
encoded_movies

Unnamed: 0,genre,director,writer,star,budget,gross,runtime,movie_success
0,0,91,382,1115,356000000,2797501328,181.0,1
1,0,834,1228,1115,250000000,1402809540,141.0,1
2,0,91,382,1115,321000000,2048359754,149.0,1
3,0,1618,1039,113,300000000,657926987,120.0,1
4,0,1361,1135,288,245000000,880681519,148.0,1
...,...,...,...,...,...,...,...,...
3924,9,197,291,1055,24000000,138883016,82.0,1
3925,9,197,291,1055,15000000,173046663,82.0,1
3926,6,613,1733,425,8000000,2721534,130.0,0
3927,2,488,2359,21,2777000,34949482,78.0,1


In [22]:
# Define the features set
X = encoded_movies.copy()
X = X.drop("movie_success", axis=1)
X.head()


Unnamed: 0,genre,director,writer,star,budget,gross,runtime
0,0,91,382,1115,356000000,2797501328,181.0
1,0,834,1228,1115,250000000,1402809540,141.0
2,0,91,382,1115,321000000,2048359754,149.0
3,0,1618,1039,113,300000000,657926987,120.0
4,0,1361,1135,288,245000000,880681519,148.0


In [23]:
# Define the target set
y = encoded_movies["movie_success"]
y

0       1
1       1
2       1
3       1
4       1
       ..
3924    1
3925    1
3926    0
3927    1
3928    1
Name: movie_success, Length: 3929, dtype: int64

## Decision Tree Classifier Model

In [24]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [25]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2946, 7)
(983, 7)
(2946,)
(983,)


In [26]:
array_sum = np.sum(X_train)
array_sum

genre       9.657000e+03
director    2.465848e+06
writer      3.523545e+06
star        2.065131e+06
budget      1.218876e+11
gross       3.728237e+11
runtime     3.233150e+05
dtype: float64

In [27]:
# Create StandardScaler instance

In [28]:
data_scaler = StandardScaler()

In [29]:
# Fitting the Standard Scaler with the training data
X_scaler = data_scaler.fit(X_train)

In [30]:
# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [31]:
X_train_scaled

array([[-0.10311589, -1.63314363, -1.47430661, ..., -0.76770194,
        -0.19093203,  0.01114981],
       [ 0.63871416,  1.11249775,  1.63652071, ..., -0.14235424,
        -0.05122898, -0.76450845],
       [ 0.26779913, -0.34428751,  0.38604711, ..., -0.81236964,
        -0.57333207, -1.17819286],
       ...,
       [-1.21586095, -1.40014195, -0.15948114, ..., -0.36569271,
        -0.36871694, -0.66108735],
       [ 0.26779913,  0.10492293, -1.43628051, ...,  0.97433809,
         1.00942696, -0.14398184],
       [ 1.00962918, -0.02312303,  1.21092092, ..., -0.65603271,
         0.85220599,  0.83851863]])

In [32]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier()

In [33]:
array_sum = np.sum(X_train_scaled)
array_sum

nan

In [34]:
np.set_printoptions(threshold=sys.maxsize)
print(X_train_scaled)

[[-1.03115887e-01 -1.63314363e+00 -1.47430661e+00 -4.35313560e-01
  -7.67701944e-01 -1.90932030e-01  1.11498132e-02]
 [ 6.38714156e-01  1.11249775e+00  1.63652071e+00 -4.15298558e-01
  -1.42354241e-01 -5.12289769e-02 -7.64508454e-01]
 [ 2.67799135e-01 -3.44287506e-01  3.86047106e-01 -1.56115744e+00
  -8.12369637e-01 -5.73332067e-01 -1.17819286e+00]
 [-1.21586095e+00  8.87892532e-01  1.03249077e+00 -1.51362181e+00
  -5.24263016e-01 -5.64322798e-01 -1.07477176e+00]
 [-1.21586095e+00  1.03063230e+00 -9.24390729e-01  5.05391545e-01
   1.97936118e+00  1.19120921e+00  1.51075580e+00]
 [ 1.00962918e+00  1.38328348e+00  1.44200259e+00 -1.38852804e+00
  -2.09355780e-01 -3.21525474e-01  1.51075580e+00]
 [ 2.67799135e-01  1.31401272e+00 -7.34260240e-01 -3.67762928e-01
  -8.34703483e-01 -5.53219547e-01 -1.43981840e-01]
 [-1.03115887e-01  8.71099618e-01 -1.41930638e-01 -9.20677365e-01
  -2.76357320e-01 -5.69284725e-01  2.44154572e+00]
 [-1.03115887e-01 -1.10206773e+00 -9.71192081e-01 -4.22804184e-0

In [35]:
np.where(np.isnan(X_train_scaled))

(array([2510], dtype=int64), array([6], dtype=int64))

In [36]:
# Change single nan value to zero
X_train_scaled = np.nan_to_num(X_train_scaled)
print(X_train_scaled)

[[-1.03115887e-01 -1.63314363e+00 -1.47430661e+00 -4.35313560e-01
  -7.67701944e-01 -1.90932030e-01  1.11498132e-02]
 [ 6.38714156e-01  1.11249775e+00  1.63652071e+00 -4.15298558e-01
  -1.42354241e-01 -5.12289769e-02 -7.64508454e-01]
 [ 2.67799135e-01 -3.44287506e-01  3.86047106e-01 -1.56115744e+00
  -8.12369637e-01 -5.73332067e-01 -1.17819286e+00]
 [-1.21586095e+00  8.87892532e-01  1.03249077e+00 -1.51362181e+00
  -5.24263016e-01 -5.64322798e-01 -1.07477176e+00]
 [-1.21586095e+00  1.03063230e+00 -9.24390729e-01  5.05391545e-01
   1.97936118e+00  1.19120921e+00  1.51075580e+00]
 [ 1.00962918e+00  1.38328348e+00  1.44200259e+00 -1.38852804e+00
  -2.09355780e-01 -3.21525474e-01  1.51075580e+00]
 [ 2.67799135e-01  1.31401272e+00 -7.34260240e-01 -3.67762928e-01
  -8.34703483e-01 -5.53219547e-01 -1.43981840e-01]
 [-1.03115887e-01  8.71099618e-01 -1.41930638e-01 -9.20677365e-01
  -2.76357320e-01 -5.69284725e-01  2.44154572e+00]
 [-1.03115887e-01 -1.10206773e+00 -9.71192081e-01 -4.22804184e-0

In [37]:
# Checking for nan values in X_test_scaled
np.where(np.isnan(X_test_scaled))

(array([721], dtype=int64), array([6], dtype=int64))

In [38]:
# Converting nan value to 0
X_test_scaled = np.nan_to_num(X_test_scaled)

In [39]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [40]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

array([1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,

In [41]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


In [42]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)
acc_score

0.9684638860630722

In [43]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f" Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,240,11
Actual 1,20,712


 Accuracy Score : 0.9684638860630722
Classification Report
              precision    recall  f1-score   support

           0       0.92      0.96      0.94       251
           1       0.98      0.97      0.98       732

    accuracy                           0.97       983
   macro avg       0.95      0.96      0.96       983
weighted avg       0.97      0.97      0.97       983



## Random Forest Classifier Model

In [44]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [45]:
# Fitting the Standard Scaler with the training data
X_scaler = data_scaler.fit(X_train)

In [46]:
# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [47]:
# Change single identified nan value to zero
X_train_scaled = np.nan_to_num(X_train_scaled)
X_test_scaled = np.nan_to_num(X_test_scaled)

In [48]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=128, random_state=78)

In [49]:
# Fitting the model
rf_model = rf_model.fit(X_train_scaled, y_train)

In [50]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)
predictions

array([0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0,
       1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1,
       1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 1, 1,
       1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 0, 1, 0, 1, 0, 0, 1, 1, 0, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1,
       1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0,

In [51]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [52]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [53]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,233,18
Actual 1,10,722


Accuracy Score : 0.9715157680569685
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.93      0.94       251
           1       0.98      0.99      0.98       732

    accuracy                           0.97       983
   macro avg       0.97      0.96      0.96       983
weighted avg       0.97      0.97      0.97       983



## Rank Feature Importance

In [54]:
# Calculate feature importance in the Random Forest model.
importances = rf_model.feature_importances_
importances

array([0.02124481, 0.04470252, 0.04392738, 0.04228947, 0.24811745,
       0.55689866, 0.04281971])

In [55]:
# Sort the features by their importance.
sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

[(0.5568986622410456, 'gross'),
 (0.24811744858422807, 'budget'),
 (0.044702517191872286, 'director'),
 (0.04392737778920701, 'writer'),
 (0.042819714269033644, 'runtime'),
 (0.04228947104028864, 'star'),
 (0.021244808884324663, 'genre')]

## Gradient Boosting Implementation 

In [56]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [57]:
# Fitting the Standard Scaler with the training data
X_scaler = data_scaler.fit(X_train)

In [58]:
# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [59]:
# Change single identified nan value to zero
X_train_scaled = np.nan_to_num(X_train_scaled)
X_test_scaled = np.nan_to_num(X_test_scaled)

In [60]:
learning_rates = [0.05, 0.1, 0.25, 0.5, 0.75, 1]
for learning_rate in learning_rates:
    classifier = GradientBoostingClassifier(n_estimators=20,
                                            learning_rate=learning_rate,
                                            max_features=5,
                                            max_depth=3,
                                            random_state=0)
    # Fit the model
    classifier.fit(X_train_scaled, y_train)

    # Print learning rate results
    print("Learning rate: ", learning_rate)
    print("Accuracy score (training): {0:.3f}".format(
    classifier.score(
    X_train_scaled,
    y_train)))
    print("Accuracy score (validation): {0:.3f}".format(
    classifier.score(
    X_test_scaled,
    y_test)))

Learning rate:  0.05
Accuracy score (training): 0.923
Accuracy score (validation): 0.892
Learning rate:  0.1
Accuracy score (training): 0.962
Accuracy score (validation): 0.936
Learning rate:  0.25
Accuracy score (training): 0.990
Accuracy score (validation): 0.963
Learning rate:  0.5
Accuracy score (training): 0.996
Accuracy score (validation): 0.975
Learning rate:  0.75
Accuracy score (training): 0.999
Accuracy score (validation): 0.973
Learning rate:  1
Accuracy score (training): 0.998
Accuracy score (validation): 0.969


In [61]:
# Use 0.25 rate in case of overfitting
classifier = GradientBoostingClassifier(n_estimators=20,
                                       learning_rate=0.25,
                                       max_features=5,
                                       max_depth=3,
                                       random_state=78)
classifier.fit(X_train_scaled, y_train)
predictions = classifier.predict(X_test_scaled)

In [62]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [63]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [64]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,223,28
Actual 1,6,726


Accuracy Score : 0.965412004069176
Classification Report
              precision    recall  f1-score   support

           0       0.97      0.89      0.93       251
           1       0.96      0.99      0.98       732

    accuracy                           0.97       983
   macro avg       0.97      0.94      0.95       983
weighted avg       0.97      0.97      0.96       983



# Oversampling

## Naive Random Oversampling

In [65]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [66]:
# Count of Target Class
Counter(y_train)

Counter({1: 2160, 0: 786})

In [67]:
# Resample the training data with the RandomOversampler
ros = RandomOverSampler(random_state=1)
X_resampled, y_resampled = ros.fit_resample(X_train, y_train)

In [68]:
# Count of Resampled Target class
Counter(y_resampled)

Counter({1: 2160, 0: 2160})

In [69]:
# Checking for nan values
np.where(np.isnan(X_resampled))

(array([2510], dtype=int64), array([6], dtype=int64))

In [70]:
np.where(np.isnan(y_resampled))

(array([], dtype=int64),)

In [71]:
# Change single identified nan value to zero
X_resampled = np.nan_to_num(X_resampled)

In [72]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [73]:
# Checking for nan values
np.where(np.isnan(X_test))

(array([721], dtype=int64), array([6], dtype=int64))

In [74]:
np.where(np.isnan(y_test))

(array([], dtype=int64),)

In [75]:
# Change single identified nan value to zero
X_test = np.nan_to_num(X_test)

In [76]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_acc_score= balanced_accuracy_score(y_test, y_pred)

In [77]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [78]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {balanced_acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,251,0
Actual 1,0,732


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       251
           1       1.00      1.00      1.00       732

    accuracy                           1.00       983
   macro avg       1.00      1.00      1.00       983
weighted avg       1.00      1.00      1.00       983



## SMOTE Oversampling

In [79]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [80]:
# Count of Target Class
Counter(y_train)

Counter({1: 2160, 0: 786})

In [81]:
# Checking for nan values
np.where(np.isnan(X_train))

(array([2510], dtype=int64), array([6], dtype=int64))

In [82]:
# Checking for nan values
np.where(np.isnan(X_test))

(array([721], dtype=int64), array([6], dtype=int64))

In [83]:
# Change single identified nan value to zero
X_test = np.nan_to_num(X_test)
X_train = np.nan_to_num(X_train)

In [84]:
# Resample the training data with SMOTE
X_resampled, y_resampled = SMOTE(random_state=78, sampling_strategy='auto').fit_resample(X_train, y_train)

In [85]:
# Count of Resampled Target class
Counter(y_resampled)

Counter({1: 2160, 0: 2160})

In [86]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [87]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_acc_score= balanced_accuracy_score(y_test, y_pred)

In [88]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [89]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {balanced_acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,251,0
Actual 1,0,732


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       251
           1       1.00      1.00      1.00       732

    accuracy                           1.00       983
   macro avg       1.00      1.00      1.00       983
weighted avg       1.00      1.00      1.00       983



# Undersampling

## Cluster Centroids

In [90]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [91]:
# Count of Target Class
Counter(y_train)

Counter({1: 2160, 0: 786})

In [92]:
# Checking for nan values
np.where(np.isnan(X_train))

(array([2510], dtype=int64), array([6], dtype=int64))

In [93]:
# Checking for nan values
np.where(np.isnan(X_test))

(array([721], dtype=int64), array([6], dtype=int64))

In [94]:
# Change single identified nan value to zero
X_test = np.nan_to_num(X_test)
X_train = np.nan_to_num(X_train)

In [95]:
# Resample the data using the ClusterCentroids resampler
cc = ClusterCentroids(random_state=78)
X_resampled, y_resampled = cc.fit_resample(X_train, y_train)

In [96]:
# Count of Resampled Target class
Counter(y_resampled)

Counter({0: 786, 1: 786})

In [97]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=1)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=1)

In [98]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_acc_score= balanced_accuracy_score(y_test, y_pred)

In [99]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [100]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {balanced_acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,251,0
Actual 1,3,729


Accuracy Score : 0.9979508196721312
Classification Report
              precision    recall  f1-score   support

           0       0.99      1.00      0.99       251
           1       1.00      1.00      1.00       732

    accuracy                           1.00       983
   macro avg       0.99      1.00      1.00       983
weighted avg       1.00      1.00      1.00       983



# Combination (Over and Under) Sampling

## SMOTEENN 

In [101]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [102]:
# Count of Target Class
Counter(y_train)

Counter({1: 2160, 0: 786})

In [103]:
# Checking for nan values
np.where(np.isnan(X_train))

(array([2510], dtype=int64), array([6], dtype=int64))

In [104]:
# Checking for nan values
np.where(np.isnan(X_test))

(array([721], dtype=int64), array([6], dtype=int64))

In [105]:
# Change single identified nan value to zero
X_test = np.nan_to_num(X_test)
X_train = np.nan_to_num(X_train)

In [106]:
# Resample the training data with SMOTEENN
smote_enn = SMOTEENN(random_state=78)
X_resampled, y_resampled = smote_enn.fit_resample(X_train, y_train)

In [107]:
# Count of Resampled Target class
Counter(y_resampled)

Counter({0: 2140, 1: 2117})

In [108]:
# Train the Logistic Regression model using the resampled data
model = LogisticRegression(solver='lbfgs', random_state=78)
model.fit(X_resampled, y_resampled)

LogisticRegression(random_state=78)

In [109]:
# Calculated the balanced accuracy score
y_pred = model.predict(X_test)
balanced_acc_score= balanced_accuracy_score(y_test, y_pred)

In [110]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, y_pred)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])

In [111]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {balanced_acc_score}")
print("Classification Report")
print(classification_report(y_test, y_pred))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,251,0
Actual 1,0,732


Accuracy Score : 1.0
Classification Report
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       251
           1       1.00      1.00      1.00       732

    accuracy                           1.00       983
   macro avg       1.00      1.00      1.00       983
weighted avg       1.00      1.00      1.00       983

