In [1]:
# Import Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
import datetime
from sqlalchemy import create_engine
from config import sql_pw

# Loading data from Provisional Database

In [2]:
# Create Database Engine and extract Dataset from Database
connection_string = f"postgres://postgres:{sql_pw}@group-c-project-db.csna2pebfhlh.us-east-2.rds.amazonaws.com:5432/postgres"
engine = create_engine(connection_string)
#movies_df = pd.read_sql(sql="Non_Mearged_Cleaned_Movie_Data", con=engine)
movies_df = pd.read_sql(sql="Movie_Data", con=engine)
movies_df.head()

Unnamed: 0,index,budget,gross,name,released,runtime,score,votes,year,company_Castle Rock Entertainment,...,writer_Other,writer_Pedro Almodvar,writer_Quentin Tarantino,writer_Robert Rodriguez,writer_Stephen King,writer_Tyler Perry,writer_Wes Craven,writer_William Shakespeare,writer_Woody Allen,recomendation
0,0,8000000.0,52287414.0,Stand by Me,1986-08-22,89,8.1,299174,1986,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0
1,1,6000000.0,70136369.0,Ferris Bueller's Day Off,1986-06-11,103,7.8,264740,1986,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
2,2,15000000.0,179800601.0,Top Gun,1986-05-16,110,6.9,236909,1986,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3,18500000.0,85160248.0,Aliens,1986-07-18,137,8.4,540152,1986,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0
4,4,9000000.0,18564613.0,Flight of the Navigator,1986-08-01,90,6.9,36636,1986,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0


In [5]:
# Changing datetime to year and month for modeling
movies_df['release_year'] = pd.DatetimeIndex(movies_df['released']).year
movies_df['release_month'] = pd.DatetimeIndex(movies_df['released']).month
movies_df.head()

Unnamed: 0,index,budget,gross,name,released,runtime,score,votes,year,company_Castle Rock Entertainment,...,writer_Quentin Tarantino,writer_Robert Rodriguez,writer_Stephen King,writer_Tyler Perry,writer_Wes Craven,writer_William Shakespeare,writer_Woody Allen,recomendation,release_year,release_month
0,0,8000000.0,52287414.0,Stand by Me,1986-08-22,89,8.1,299174,1986,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1986,8
1,1,6000000.0,70136369.0,Ferris Bueller's Day Off,1986-06-11,103,7.8,264740,1986,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1986,6
2,2,15000000.0,179800601.0,Top Gun,1986-05-16,110,6.9,236909,1986,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1986,5
3,3,18500000.0,85160248.0,Aliens,1986-07-18,137,8.4,540152,1986,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1986,7
4,4,9000000.0,18564613.0,Flight of the Navigator,1986-08-01,90,6.9,36636,1986,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1986,8


In [6]:
# Droping Identification and datetime column
movies_df_clean = movies_df.copy()
movies_df_clean = movies_df_clean.drop(["name", "released", "year"], axis=1)
movies_df_clean.head()

Unnamed: 0,index,budget,gross,runtime,score,votes,company_Castle Rock Entertainment,company_Columbia Pictures,company_Columbia Pictures Corporation,company_Dimension Films,...,writer_Quentin Tarantino,writer_Robert Rodriguez,writer_Stephen King,writer_Tyler Perry,writer_Wes Craven,writer_William Shakespeare,writer_Woody Allen,recomendation,release_year,release_month
0,0,8000000.0,52287414.0,89,8.1,299174,0.0,0.0,1.0,0.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,2.0,1986,8
1,1,6000000.0,70136369.0,103,7.8,264740,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1986,6
2,2,15000000.0,179800601.0,110,6.9,236909,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1986,5
3,3,18500000.0,85160248.0,137,8.4,540152,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2.0,1986,7
4,4,9000000.0,18564613.0,90,6.9,36636,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1986,8


# Establishing Provisional Machine Learning Model

In [7]:
# Remove loan status target from features data
y = movies_df_clean.recomendation
X = movies_df_clean.drop(columns=["score","recomendation"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler_train = scaler.fit(X_train)
X_scaler_test = scaler.fit(X_test)

# Scale the data
X_train_scaled = X_scaler_train.transform(X_train)
X_test_scaled = X_scaler_test.transform(X_test)

In [8]:
# Create a balanced random forest classifier and fit the model
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0).fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.724


In [9]:
# Calculate the confusion matrix.
confusion_matrix(y_test, y_pred)

array([[  0, 159,   0],
       [  0, 993,  39],
       [  0, 272, 242]], dtype=int64)

In [10]:
# Print the imbalanced classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       159
         1.0       0.70      0.96      0.81      1032
         2.0       0.86      0.47      0.61       514

    accuracy                           0.72      1705
   macro avg       0.52      0.48      0.47      1705
weighted avg       0.68      0.72      0.67      1705



  _warn_prf(average, modifier, msg_start, len(result))


In [11]:
# List the features sorted in descending order by feature importance
feature_importances = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

for feature in feature_importances:
       print(f"{feature[1]}: ({feature[0]})")

votes: (0.22059683748161935)
runtime: (0.132803370083966)
gross: (0.08065583170147822)
country_USA: (0.05931905047276908)
country_Other: (0.055524449654033)
budget: (0.052784307561707454)
index: (0.04266261892099126)
release_year: (0.03424491740764572)
release_month: (0.027306399144106264)
genre_Biography: (0.02425026568603987)
genre_Action: (0.021398928972256346)
company_Other: (0.016011110235127236)
genre_Drama: (0.01426408459781126)
genre_Horror: (0.011710557551432585)
genre_Comedy: (0.010605173085759015)
rating_PG-13: (0.010321459734780571)
rating_R: (0.009952644927686738)
star_Other: (0.0075042868427256885)
country_UK: (0.007338169233734578)
country_France: (0.007226719381401059)
rating_PG: (0.006689272033346179)
genre_Crime: (0.006584647945948819)
director_Other: (0.005698422631060349)
country_Italy: (0.005623306233454662)
genre_Animation: (0.005391535861180276)
rating_NOT RATED: (0.004497882154565365)
writer_Other: (0.004292263902653954)
country_Japan: (0.0037465682161101944)
ra

In [12]:
# Another potential machine learning option
SCV_model = SVC(decision_function_shape="ovo").fit(X_train_scaled, y_train)

y_pred_SCV = SCV_model.predict(X_test_scaled)
round(SCV_model.score(X_test_scaled, y_test), 4)

0.6862

In [13]:
# Evaluate the model
print(f" SCV predictive accuracy: {accuracy_score(y_test,y_pred_SCV):.3f}")

 SCV predictive accuracy: 0.686


In [14]:
# Calculate the confusion matrix.
confusion_matrix(y_test, y_pred_SCV)

array([[  0, 153,   6],
       [  3, 934,  95],
       [  0, 278, 236]], dtype=int64)

In [15]:
# Print the imbalanced classification report
print(classification_report(y_test, y_pred_SCV))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00       159
         1.0       0.68      0.91      0.78      1032
         2.0       0.70      0.46      0.55       514

    accuracy                           0.69      1705
   macro avg       0.46      0.45      0.44      1705
weighted avg       0.63      0.69      0.64      1705

