In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

Data Cleaning/Manipulation

In [3]:
#Create DF
df = pd.read_csv('All Time Worldwide Box Office for Action Movies - Sheet1.csv')
df

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,"$2,923,706,026","$785,221,649","$2,138,484,377"
1,2,2019,Avengers: Endgame,"$2,794,731,755","$858,373,000","$1,936,358,755"
2,3,2022,Avatar: The Way of Water,"$2,318,552,513","$683,875,614","$1,634,676,899"
3,4,2018,Avengers: Infinity War,"$2,048,359,754","$678,815,482","$1,369,544,272"
4,5,2021,Spider-Man: No Way Home,"$1,910,048,245","$814,115,070","$1,095,933,175"
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,"$62,673",,"$62,673"
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,"$62,027","$62,027",
1997,1998,2018,Tian Xia Di Yi Biao Ju,"$62,007",,"$62,007"
1998,1999,2014,Tiger & Bunny the Movie: The Rising,"$61,562","$61,562",


In [4]:
#Clean Data
df_action = df
feature = ['Worldwide\r\nBox Office','Domestic\r\nBox Office', 'International\r\nBox Office']
df_action[feature] = df_action[feature].replace(',','', regex = True)
df_action['Worldwide\r\nBox Office'] = df_action['Worldwide\r\nBox Office'].str.replace('$','')
df_action['Domestic\r\nBox Office'] = df_action['Domestic\r\nBox Office'].str.replace('$','')
df_action['International\r\nBox Office'] = df_action['International\r\nBox Office'].str.replace('$','')
df_action.fillna(0,inplace = True)
df_action 

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,2923706026,785221649,2138484377
1,2,2019,Avengers: Endgame,2794731755,858373000,1936358755
2,3,2022,Avatar: The Way of Water,2318552513,683875614,1634676899
3,4,2018,Avengers: Infinity War,2048359754,678815482,1369544272
4,5,2021,Spider-Man: No Way Home,1910048245,814115070,1095933175
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,62673,0,62673
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,62027,62027,0
1997,1998,2018,Tian Xia Di Yi Biao Ju,62007,0,62007
1998,1999,2014,Tiger & Bunny the Movie: The Rising,61562,61562,0


In [5]:
#Further Cleaning, rename cols and correct data types
df_action['Worldwide\r\nBox Office'] = df_action['Worldwide\r\nBox Office'].astype('Int64')
df_action['Domestic\r\nBox Office'] = df_action['Domestic\r\nBox Office'].astype(int)
df_action['International\r\nBox Office'] = df_action['International\r\nBox Office'].astype(int)
df_action.rename(columns = {'Worldwide\r\nBox Office': 'Worldwide','Domestic\r\nBox Office':'Domestic','International\r\nBox Office':'International'}, inplace=True )


Creating the extra column for regression/classification report

In [6]:
import re 
def is_sequel(movie):
    sequel_patterns = [
        r'\b2\b', r'\b3\b', r'\b4\b', r'\b5\b', # Numbers
        r'\bII\b', r'\bIII\b', r'\bIV\b', # Roman numerals
        r'\bPart\b', r'Return', r'Revenge', r'Reloaded', r'Revolution',r':' # Words commonly associated with sequels
    ]
    
    for pattern in sequel_patterns:
        # if the movie title matches with the sequel patterns, then it should label as 1 for squeal
        if re.search(pattern, movie, re.IGNORECASE):
            return 1
    # if there is no match, then it should label as 0 for original movie
    return 0
df_action['sequel'] = df_action['Movie'].apply(is_sequel)
df_action


Unnamed: 0,Rank,Released,Movie,Worldwide,Domestic,International,sequel
0,1,2009,Avatar,2923706026,785221649,2138484377,0
1,2,2019,Avengers: Endgame,2794731755,858373000,1936358755,1
2,3,2022,Avatar: The Way of Water,2318552513,683875614,1634676899,1
3,4,2018,Avengers: Infinity War,2048359754,678815482,1369544272,1
4,5,2021,Spider-Man: No Way Home,1910048245,814115070,1095933175,1
...,...,...,...,...,...,...,...
1995,1996,2021,Vanquish,62673,0,62673,0
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,62027,62027,0,1
1997,1998,2018,Tian Xia Di Yi Biao Ju,62007,0,62007,0
1998,1999,2014,Tiger & Bunny the Movie: The Rising,61562,61562,0,1


Regression/Classification report
Focusing on various independent variables such as the released year date, worldwide sales, Domestic sales, and international sales

In [8]:
# find whether a movie is a sequel or not based on domestic, international, and worldwide income

# set the input and response values(sequel)
x = df_action[["Released","Worldwide","Domestic","International"]]
y = df_action.sequel

In [9]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [10]:
LogRegMovie = LogisticRegression()


scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

LogRegMovie.fit(X_train,y_train)



y_pred=LogRegMovie.predict(X_test)

In [11]:
#log regression report 
print('Classes', LogRegMovie.classes_)
print('Intercept',LogRegMovie.intercept_)
print("Coefficients", LogRegMovie.coef_ )

Classes [0 1]
Intercept [-1.46981263]
Coefficients [[0.31268777 0.07956419 0.15081677 0.03705436]]


In [12]:
#Classification report
print("Accuracy", LogRegMovie.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.7975
              precision    recall  f1-score   support

           0       0.80      0.99      0.89       316
           1       0.67      0.07      0.13        84

    accuracy                           0.80       400
   macro avg       0.73      0.53      0.51       400
weighted avg       0.77      0.80      0.73       400



Insight report

The goal of this analysis is to determine the influence of box office sales and released years to predict the likeihood of the binary outcome of sequels or original movies. Using Sciki-learn library to utilized the Regression/Classification reports, the accuracy of the regression is 79.75% which is relatively high to indicate a strong relationship with the variables and the classification report shows original movies to have higher probability than sequels movies. 
