In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler


In [4]:
# Load the movie data from its relative path
path = Path("../Cleaned Data/Cleaned_movie_data.csv")

# Read the file and cast the 'adult' column to integers in one chained step
# ('adult' is later used as a model feature)
df = pd.read_csv(path).assign(adult=lambda frame: frame['adult'].astype(int))

In [25]:
# Split the frame into the target (y) and the model inputs (X)

# Target: the 'profitable' column
y = df['profitable']

# Inputs: the column names are listed in groups and concatenated in a fixed order,
# so the columns of X (and anything fitted on them) keep that order
season_columns = ['Winter_release', 'Spring_release', 'Summer_release', 'Autumn_release']
month_columns = [
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
]
genre_columns = [
    'Drama', 'Comedy', 'Romance', 'Action', 'Crime',
    'Thriller', 'Horror', 'Adventure', 'Science Fiction',
]
feature_columns = (
    ['budget', 'adult']
    + season_columns
    + month_columns
    + genre_columns
    + ['In_English']
)
X = df[feature_columns]

In [26]:
# Hold out a test set with train_test_split (already imported in the notebook's first cell).
# No test_size is passed, so scikit-learn's default split size applies.
# stratify=y keeps the class proportions of y the same in both partitions, and
# random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1, stratify=y)

In [27]:
# Standardize the features; the scaling statistics are learned from the training split only
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)

# The fitted scaler stays reachable under its second name as well
X_scaler = scaler

# Apply the training-set scaling to the held-out split
X_test_scaled = X_scaler.transform(X_test)


In [28]:
# Build the logistic-regression classifier (LogisticRegression is already imported
# in the notebook's first cell).
# class_weight='balanced' weights each class inversely to its frequency in y_train;
# random_state=1 is set for reproducibility.
classifier = LogisticRegression(
    solver='lbfgs',
    random_state=1,
    max_iter=1000,
    class_weight='balanced',
    C=1,
)

# Fit the model on the standardized (scaled) training data
classifier.fit(X_train_scaled, y_train)

In [29]:
# Inspect the fitted coefficients: one value per column of X, in the same column
# order. The model was fit on the standardized features, so the values are on that scale.
classifier.coef_

array([[ 0.30379565, -0.02493097,  0.15695281,  0.13832326,  0.19479648,
         0.11860995,  0.04772808,  0.06743255,  0.09302994,  0.06409971,
         0.06089106,  0.12556895,  0.13378596,  0.05038529,  0.01532339,
         0.08584244,  0.08596052,  0.12492059, -0.07163121,  0.11704415,
         0.12775272,  0.05097641,  0.02775547,  0.00084362,  0.08926265,
         0.0171438 , -0.0250583 ,  0.02567914]])

In [30]:
# Validate the model: mean accuracy on the training and testing splits
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.6403406891211769
Testing Data Score: 0.6482198142414861


In [31]:
# Predict labels for the held-out (scaled) test features
predictions = classifier.predict(X_test_scaled)

# Show the predictions next to the true labels, keeping y_test's row index
comparison = pd.DataFrame({
    "Prediction": pd.Series(predictions, index=y_test.index),
    "Actual": y_test,
})
comparison

Unnamed: 0,Prediction,Actual
8832,1,0
9402,1,1
8392,1,1
4777,1,0
4281,1,0
...,...,...
1894,1,1
1010,1,1
3704,1,0
9704,1,0


In [32]:
# Generate a confusion matrix for the model.
# labels= pins the row/column order to the classes the classifier was fitted on.
cm = confusion_matrix(y_test, predictions, labels=classifier.classes_)

# Display the matrix (previously it was computed but never shown).
# Rows are the true classes, columns are the predicted classes.
pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in classifier.classes_],
    columns=[f"Predicted {label}" for label in classifier.classes_],
)

In [33]:
# Build the text classification report for the test-set predictions, then print it
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.60      0.06      0.11       929
           1       0.65      0.98      0.78      1655

    accuracy                           0.65      2584
   macro avg       0.63      0.52      0.45      2584
weighted avg       0.63      0.65      0.54      2584

