In [1]:
# Import Dependencies
import pandas as pd
import numpy as np

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn import tree
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report

In [2]:
movies_df = pd.read_csv("https://classfinalproject.s3.amazonaws.com/movies_final_final.csv")
movies_df

Unnamed: 0,index,original_title,production_budget,worldwide_gross,imdb_id,budget,revenue,genres,director,primary_actor
0,15,Tangled,260000000,584899819,tt0398286,260000000.0,591794936.0,Animation,"Nathan Greno, Byron Howard",Mandy Moore
1,37,Men in Black 3,215000000,654213485,tt1409024,225000000.0,624026776.0,Action,Barry Sonnenfeld,Will Smith
2,759,Sphere,73000000,50168294,tt0120184,75000000.0,13100000.0,Science Fiction,Barry Levinson,Dustin Hoffman
3,186,Transformers,151000000,708272592,tt0418279,150000000.0,709709780.0,Adventure,Michael Bay,Shia LaBeouf
4,38,Transformers: Revenge of the Fallen,210000000,836519699,tt1055369,150000000.0,836297228.0,Science Fiction,Michael Bay,Shia LaBeouf
...,...,...,...,...,...,...,...,...,...,...
3210,124,Suicide Squad,175000000,745744980,tt1386697,175000000.0,745600054.0,Action,David Ayer,Will Smith
3211,134,Waterworld,175000000,264246220,tt0114898,175000000.0,264218220.0,Adventure,Kevin Reynolds,Kevin Costner
3212,28,The Avengers,225000000,1515100211,tt0118661,60000000.0,48585416.0,Thriller,Jeremiah S. Chechik,Robert Downey Jr.
3213,28,The Avengers,225000000,1515100211,tt0118661,60000000.0,48585416.0,Thriller,Jeremiah S. Chechik,Ralph Fiennes


In [3]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3215 entries, 0 to 3214
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   index              3215 non-null   int64  
 1   original_title     3215 non-null   object 
 2   production_budget  3215 non-null   int64  
 3   worldwide_gross    3215 non-null   int64  
 4   imdb_id            3215 non-null   object 
 5   budget             3215 non-null   float64
 6   revenue            3215 non-null   float64
 7   genres             3215 non-null   object 
 8   director           3215 non-null   object 
 9   primary_actor      3215 non-null   object 
dtypes: float64(2), int64(3), object(5)
memory usage: 251.3+ KB


In [4]:
# Changing multiple object type columns to string data type
movies_df[["original_title", "genres", "imdb_id", "director", "primary_actor"]] = movies_df[["original_title", "genres", "imdb_id", "director", "primary_actor"]].astype("string")

In [5]:
movies_df.dtypes

index                  int64
original_title        string
production_budget      int64
worldwide_gross        int64
imdb_id               string
budget               float64
revenue              float64
genres                string
director              string
primary_actor         string
dtype: object

In [6]:
# Create df for ML
ml_movies = movies_df[["original_title", "genres", "budget", "revenue", "director", "primary_actor"]].copy()

In [7]:
# create function for building target variable 
def target_creation(row):
    if row['budget'] >= row['revenue']:
        return 0
    
    if row['budget'] < row['revenue']:
        return 1
    else:
        return np.nan

In [8]:
ml_movies.apply(lambda row: target_creation(row), axis=1)

0       1
1       1
2       0
3       1
4       1
       ..
3210    1
3211    1
3212    0
3213    0
3214    1
Length: 3215, dtype: int64

In [9]:
ml_movies['movie_success'] = ml_movies.apply(lambda row: target_creation(row), axis=1)

In [10]:
ml_movies.movie_success.value_counts()

1    2296
0     919
Name: movie_success, dtype: int64

In [11]:
ml_movies.head()

Unnamed: 0,original_title,genres,budget,revenue,director,primary_actor,movie_success
0,Tangled,Animation,260000000.0,591794936.0,"Nathan Greno, Byron Howard",Mandy Moore,1
1,Men in Black 3,Action,225000000.0,624026776.0,Barry Sonnenfeld,Will Smith,1
2,Sphere,Science Fiction,75000000.0,13100000.0,Barry Levinson,Dustin Hoffman,0
3,Transformers,Adventure,150000000.0,709709780.0,Michael Bay,Shia LaBeouf,1
4,Transformers: Revenge of the Fallen,Science Fiction,150000000.0,836297228.0,Michael Bay,Shia LaBeouf,1


In [12]:
# Create LabelEncoder object
le = LabelEncoder()

In [13]:
# Encode genres string column
ml_movies['genres'] = le.fit_transform(ml_movies['genres'])
ml_movies

Unnamed: 0,original_title,genres,budget,revenue,director,primary_actor,movie_success
0,Tangled,2,260000000.0,591794936.0,"Nathan Greno, Byron Howard",Mandy Moore,1
1,Men in Black 3,0,225000000.0,624026776.0,Barry Sonnenfeld,Will Smith,1
2,Sphere,14,75000000.0,13100000.0,Barry Levinson,Dustin Hoffman,0
3,Transformers,1,150000000.0,709709780.0,Michael Bay,Shia LaBeouf,1
4,Transformers: Revenge of the Fallen,14,150000000.0,836297228.0,Michael Bay,Shia LaBeouf,1
...,...,...,...,...,...,...,...
3210,Suicide Squad,0,175000000.0,745600054.0,David Ayer,Will Smith,1
3211,Waterworld,1,175000000.0,264218220.0,Kevin Reynolds,Kevin Costner,1
3212,The Avengers,15,60000000.0,48585416.0,Jeremiah S. Chechik,Robert Downey Jr.,0
3213,The Avengers,15,60000000.0,48585416.0,Jeremiah S. Chechik,Ralph Fiennes,0


In [14]:
# Encode director string column
ml_movies['director'] = le.fit_transform(ml_movies['director'])
ml_movies

Unnamed: 0,original_title,genres,budget,revenue,director,primary_actor,movie_success
0,Tangled,2,260000000.0,591794936.0,945,Mandy Moore,1
1,Men in Black 3,0,225000000.0,624026776.0,95,Will Smith,1
2,Sphere,14,75000000.0,13100000.0,94,Dustin Hoffman,0
3,Transformers,1,150000000.0,709709780.0,882,Shia LaBeouf,1
4,Transformers: Revenge of the Fallen,14,150000000.0,836297228.0,882,Shia LaBeouf,1
...,...,...,...,...,...,...,...
3210,Suicide Squad,0,175000000.0,745600054.0,282,Will Smith,1
3211,Waterworld,1,175000000.0,264218220.0,758,Kevin Costner,1
3212,The Avengers,15,60000000.0,48585416.0,578,Robert Downey Jr.,0
3213,The Avengers,15,60000000.0,48585416.0,578,Ralph Fiennes,0


In [15]:
# Encode starring string column
ml_movies['primary_actor'] = le.fit_transform(ml_movies['primary_actor'])
ml_movies

Unnamed: 0,original_title,genres,budget,revenue,director,primary_actor,movie_success
0,Tangled,2,260000000.0,591794936.0,945,725,1
1,Men in Black 3,0,225000000.0,624026776.0,95,1120,1
2,Sphere,14,75000000.0,13100000.0,94,289,0
3,Transformers,1,150000000.0,709709780.0,882,1010,1
4,Transformers: Revenge of the Fallen,14,150000000.0,836297228.0,882,1010,1
...,...,...,...,...,...,...,...
3210,Suicide Squad,0,175000000.0,745600054.0,282,1120,1
3211,Waterworld,1,175000000.0,264218220.0,758,639,1
3212,The Avengers,15,60000000.0,48585416.0,578,925,0
3213,The Avengers,15,60000000.0,48585416.0,578,897,0


In [16]:
# Prepare dataframe for ml variables
encoded_movies = ml_movies.drop('original_title', axis=1)
encoded_movies

Unnamed: 0,genres,budget,revenue,director,primary_actor,movie_success
0,2,260000000.0,591794936.0,945,725,1
1,0,225000000.0,624026776.0,95,1120,1
2,14,75000000.0,13100000.0,94,289,0
3,1,150000000.0,709709780.0,882,1010,1
4,14,150000000.0,836297228.0,882,1010,1
...,...,...,...,...,...,...
3210,0,175000000.0,745600054.0,282,1120,1
3211,1,175000000.0,264218220.0,758,639,1
3212,15,60000000.0,48585416.0,578,925,0
3213,15,60000000.0,48585416.0,578,897,0


In [17]:
# Define the features set
X = encoded_movies.copy()
X = X.drop("movie_success", axis=1)
X.head()

Unnamed: 0,genres,budget,revenue,director,primary_actor
0,2,260000000.0,591794936.0,945,725
1,0,225000000.0,624026776.0,95,1120
2,14,75000000.0,13100000.0,94,289
3,1,150000000.0,709709780.0,882,1010
4,14,150000000.0,836297228.0,882,1010


In [18]:
# Define the target set
y = encoded_movies["movie_success"]
y

0       1
1       1
2       0
3       1
4       1
       ..
3210    1
3211    1
3212    0
3213    0
3214    1
Name: movie_success, Length: 3215, dtype: int64

In [19]:
# Split the features and target sets into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [20]:
# Determine the shape of our training and testing sets.
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(2411, 5)
(804, 5)
(2411,)
(804,)


In [21]:
# Create StandardScaler instance
data_scaler = StandardScaler()

In [22]:
# Fitting the Standard Scaler with the training data
X_scaler = data_scaler.fit(X_train)

In [23]:
# Scaling the data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

In [24]:
# Creating the decision tree classifier instance
model = tree.DecisionTreeClassifier(random_state=78)

In [25]:
# Fitting the model
model = model.fit(X_train_scaled, y_train)

In [26]:
# Making predictions using the testing data
predictions = model.predict(X_test_scaled)
predictions

array([1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0,
       1, 1, 1, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1,
       0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1,
       1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 1,
       0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1,
       0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 0, 1, 1, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 1, 0,
       1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 1, 0, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1,
       1, 0, 1, 1, 1, 0, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 0,
       1, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 1, 1,
       1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 1, 0, 1, 1, 1,

In [27]:
# Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)

cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"])


In [28]:
# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [29]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f" Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,216,11
Actual 1,9,568


 Accuracy Score : 0.9751243781094527
Classification Report
              precision    recall  f1-score   support

           0       0.96      0.95      0.96       227
           1       0.98      0.98      0.98       577

    accuracy                           0.98       804
   macro avg       0.97      0.97      0.97       804
weighted avg       0.98      0.98      0.98       804

