In [1]:
# Import Dependencies
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
import pandas as pd
from sqlalchemy import create_engine

# Loading data from Provisional Database

In [2]:
# Create Database Engine and extract Dataset from Database
connection_string = "postgres://postgres:postgres@group-c-project-db.csna2pebfhlh.us-east-2.rds.amazonaws.com:5432/postgres"
engine = create_engine(connection_string)
movies_df = pd.read_sql(sql="Movie_Data", con=engine)
movies_df.head()

Unnamed: 0,index,budget,gross,name,runtime,score,votes,year,recomendation
0,0,8000000.0,52287414.0,Stand by Me,89,8.1,299174,1986,2.0
1,1,6000000.0,70136369.0,Ferris Bueller's Day Off,103,7.8,264740,1986,2.0
2,2,15000000.0,179800601.0,Top Gun,110,6.9,236909,1986,1.0
3,3,18500000.0,85160248.0,Aliens,137,8.4,540152,1986,2.0
4,4,9000000.0,18564613.0,Flight of the Navigator,90,6.9,36636,1986,1.0


In [3]:
# Droping Identification column
movies_df_clean = movies_df.copy()
movies_df_clean = movies_df_clean.drop(["name"], axis=1)
movies_df_clean.head()

Unnamed: 0,index,budget,gross,runtime,score,votes,year,recomendation
0,0,8000000.0,52287414.0,89,8.1,299174,1986,2.0
1,1,6000000.0,70136369.0,103,7.8,264740,1986,2.0
2,2,15000000.0,179800601.0,110,6.9,236909,1986,1.0
3,3,18500000.0,85160248.0,137,8.4,540152,1986,2.0
4,4,9000000.0,18564613.0,90,6.9,36636,1986,1.0


# Establishing Provisional Machine Learning Model

In [4]:
# Remove loan status target from features data
y = movies_df_clean.recomendation
X = movies_df_clean.drop(columns=["score","recomendation"])

# Split training/test datasets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0, stratify=y)

# Create a StandardScaler instance
scaler = StandardScaler()

# Fit the StandardScaler
X_scaler_train = scaler.fit(X_train)
X_scaler_test = scaler.fit(X_test)

# Scale the data
X_train_scaled = X_scaler_train.transform(X_train)
X_test_scaled = X_scaler_test.transform(X_test)

In [5]:
# Create a balanced random forest classifier and fit the model
rf_model = RandomForestClassifier(n_estimators=1000, max_depth=10, random_state=0).fit(X_train_scaled, y_train)

# Evaluate the model
y_pred = rf_model.predict(X_test_scaled)
print(f" Random forest predictive accuracy: {accuracy_score(y_test,y_pred):.3f}")

 Random forest predictive accuracy: 0.840


In [6]:
# Calculate the confusion matrix.
confusion_matrix(y_test, y_pred)

array([[ 0,  1,  0],
       [ 0, 14,  2],
       [ 0,  1,  7]], dtype=int64)

In [7]:
# Print the imbalanced classification report
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.88      0.88      0.88        16
         2.0       0.78      0.88      0.82         8

    accuracy                           0.84        25
   macro avg       0.55      0.58      0.57        25
weighted avg       0.81      0.84      0.82        25



  _warn_prf(average, modifier, msg_start, len(result))


In [8]:
# List the features sorted in descending order by feature importance
feature_importances = sorted(zip(rf_model.feature_importances_, X.columns), reverse=True)

for feature in feature_importances:
       print(f"{feature[1]}: ({feature[0]})")

gross: (0.2526905658981335)
votes: (0.2485655399764816)
runtime: (0.22047870552861756)
index: (0.15664229083253534)
budget: (0.12162289776423216)
year: (0.0)


In [9]:
# Another potential machine learning option
SCV_model = SVC(decision_function_shape="ovo").fit(X_train_scaled, y_train)

y_pred_SCV = SCV_model.predict(X_test_scaled)
round(SCV_model.score(X_test_scaled, y_test), 4)

0.84

In [10]:
# Evaluate the model
print(f" SCV predictive accuracy: {accuracy_score(y_test,y_pred_SCV):.3f}")

 SCV predictive accuracy: 0.840


In [11]:
# Calculate the confusion matrix.
confusion_matrix(y_test, y_pred_SCV)

array([[ 0,  1,  0],
       [ 0, 15,  1],
       [ 0,  2,  6]], dtype=int64)

In [12]:
# Print the imbalanced classification report
print(classification_report(y_test, y_pred_SCV))

              precision    recall  f1-score   support

         0.0       0.00      0.00      0.00         1
         1.0       0.83      0.94      0.88        16
         2.0       0.86      0.75      0.80         8

    accuracy                           0.84        25
   macro avg       0.56      0.56      0.56        25
weighted avg       0.81      0.84      0.82        25



  _warn_prf(average, modifier, msg_start, len(result))
