In [1]:
# Import required libraries
import numpy as np
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.preprocessing import StandardScaler


In [2]:
# Load the cleaned movie data from disk.
path = Path("../Cleaned Data/Cleaned_movie_data.csv")

# Cast the `adult` column to int as part of loading, so the frame
# is already in its model-ready dtype when it is first bound to `df`.
df = pd.read_csv(path).astype({'adult': int})

In [3]:
# Split the frame into the label (y) and the feature matrix (X).

# Label: the `profitable` column
y = df['profitable']

# Features: listed explicitly so the column order fed to the model is fixed.
feature_columns = [
    'budget', 'adult',
    # month-named columns
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November', 'December',
    # genre-named columns
    'Drama', 'Comedy', 'Romance', 'Action', 'Crime',
    'Thriller', 'Horror', 'Adventure', 'Science Fiction',
    'In_English',
]
X = df[feature_columns]

In [4]:
# Split the data into training and testing sets using train_test_split
# (already imported in the first cell, so it is not re-imported here).
# No test_size is given, so scikit-learn's default split proportion applies.
# stratify=y keeps the class proportions of the label the same in both
# splits; random_state=1 makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=1,
    stratify=y,
)

In [5]:
# Create the StandardScaler and fit it on the training features only,
# so the test split does not influence the learned scaling.
scaler = StandardScaler()
X_scaler = scaler.fit(X_train)

# Apply the training-fitted scaling to both splits
X_train_scaled, X_test_scaled = (
    X_scaler.transform(split) for split in (X_train, X_test)
)


In [6]:
# Instantiate the Logistic Regression model (LogisticRegression is already
# imported in the first cell, so it is not re-imported here).
# random_state is pinned to 1; class_weight='balanced' asks scikit-learn to
# weight classes inversely to their frequency in y_train.
classifier = LogisticRegression(
    solver='lbfgs',
    random_state=1,
    max_iter=1000,
    class_weight='balanced',
    C=1,
)

# Fit the model using the scaled training data
classifier.fit(X_train_scaled, y_train)

In [7]:
# Show the learned weights: one per feature, in the same order as the columns of X.
# The model was fit on standardized features, so the weights share a common scale.
classifier.coef_

array([[ 0.31170997, -0.02425917,  0.13174455,  0.14908819,  0.17163733,
         0.13693857,  0.1368162 ,  0.23810661,  0.24199002,  0.16276924,
         0.07890564,  0.15111134,  0.14355357,  0.21927725, -0.07608201,
         0.11235595,  0.12770244,  0.04878155,  0.02626238, -0.0003995 ,
         0.0859258 ,  0.01723558, -0.02597379,  0.02435552]])

In [8]:
# Validate the model: mean accuracy on the scaled training and testing splits
train_score = classifier.score(X_train_scaled, y_train)
test_score = classifier.score(X_test_scaled, y_test)

print(f"Training Data Score: {train_score}")
print(f"Testing Data Score: {test_score}")

Training Data Score: 0.555813653374629
Testing Data Score: 0.5623065015479877


In [9]:
# Predict labels for the scaled testing features
predictions = classifier.predict(X_test_scaled)

# Show predictions next to the true labels; y_test is a Series, so the
# resulting frame is indexed by the original row labels of the test split.
prediction_vs_actual = {"Prediction": predictions, "Actual": y_test}
pd.DataFrame(prediction_vs_actual)

Unnamed: 0,Prediction,Actual
8832,0,0
9402,1,1
8392,0,1
4777,1,0
4281,0,0
...,...,...
1894,1,1
1010,1,1
3704,1,0
9704,0,0


In [10]:
# Generate a confusion matrix for the model.
# Passing labels=classifier.classes_ pins the row/column order to the
# classes the model was trained on, so the table below is labelled correctly.
cm = confusion_matrix(y_test, predictions, labels=classifier.classes_)

# Display it as a labelled table (rows = true class, columns = predicted
# class); previously the matrix was computed but never shown.
cm_df = pd.DataFrame(
    cm,
    index=[f"Actual {label}" for label in classifier.classes_],
    columns=[f"Predicted {label}" for label in classifier.classes_],
)
cm_df

In [11]:
# Build the per-class precision / recall / f1 report for the test split, then print it
report = classification_report(y_test, predictions)
print(report)

              precision    recall  f1-score   support

           0       0.43      0.63      0.51       929
           1       0.72      0.52      0.60      1655

    accuracy                           0.56      2584
   macro avg       0.57      0.58      0.56      2584
weighted avg       0.61      0.56      0.57      2584

