In [74]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [144]:
#Create DF
df = pd.read_csv('All Time Worldwide Box Office for Action Movies - Sheet1.csv')
df

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,"$2,923,706,026","$785,221,649","$2,138,484,377"
1,2,2019,Avengers: Endgame,"$2,794,731,755","$858,373,000","$1,936,358,755"
2,3,2022,Avatar: The Way of Water,"$2,318,552,513","$683,875,614","$1,634,676,899"
3,4,2018,Avengers: Infinity War,"$2,048,359,754","$678,815,482","$1,369,544,272"
4,5,2021,Spider-Man: No Way Home,"$1,910,048,245","$814,115,070","$1,095,933,175"
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,"$62,673",,"$62,673"
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,"$62,027","$62,027",
1997,1998,2018,Tian Xia Di Yi Biao Ju,"$62,007",,"$62,007"
1998,1999,2014,Tiger & Bunny the Movie: The Rising,"$61,562","$61,562",


In [145]:
#Clean Data
df_action = df
feature = ['Worldwide\r\nBox Office','Domestic\r\nBox Office', 'International\r\nBox Office']
df_action[feature] = df_action[feature].replace(',','', regex = True)
df_action['Worldwide\r\nBox Office'] = df_action['Worldwide\r\nBox Office'].str.replace('$','')
df_action['Domestic\r\nBox Office'] = df_action['Domestic\r\nBox Office'].str.replace('$','')
df_action['International\r\nBox Office'] = df_action['International\r\nBox Office'].str.replace('$','')
df_action.fillna(0,inplace = True)
df_action 

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,2923706026,785221649,2138484377
1,2,2019,Avengers: Endgame,2794731755,858373000,1936358755
2,3,2022,Avatar: The Way of Water,2318552513,683875614,1634676899
3,4,2018,Avengers: Infinity War,2048359754,678815482,1369544272
4,5,2021,Spider-Man: No Way Home,1910048245,814115070,1095933175
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,62673,0,62673
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,62027,62027,0
1997,1998,2018,Tian Xia Di Yi Biao Ju,62007,0,62007
1998,1999,2014,Tiger & Bunny the Movie: The Rising,61562,61562,0


In [146]:
#Further Cleaning, rename cols and correct data types
df_action['Worldwide\r\nBox Office'] = df_action['Worldwide\r\nBox Office'].astype('Int64')
df_action['Domestic\r\nBox Office'] = df_action['Domestic\r\nBox Office'].astype(int)
df_action['International\r\nBox Office'] = df_action['International\r\nBox Office'].astype(int)
df_action.rename(columns = {'Worldwide\r\nBox Office': 'Worldwide','Domestic\r\nBox Office':'Domestic','International\r\nBox Office':'International'}, inplace=True )


In [147]:
import re 
def is_sequel(movie):
    sequel_patterns = [
        r'\b2\b', r'\b3\b', r'\b4\b', r'\b5\b', # Numbers
        r'\bII\b', r'\bIII\b', r'\bIV\b', # Roman numerals
        r'\bPart\b', r'Return', r'Revenge', r'Reloaded', r'Revolution',r':' # Words commonly associated with sequels
    ]
    
    for pattern in sequel_patterns:
        if re.search(pattern, movie, re.IGNORECASE):
            return 1
    return 0
df_action['sequel'] = df_action['Movie'].apply(is_sequel)
df_action


Unnamed: 0,Rank,Released,Movie,Worldwide,Domestic,International,sequel
0,1,2009,Avatar,2923706026,785221649,2138484377,0
1,2,2019,Avengers: Endgame,2794731755,858373000,1936358755,1
2,3,2022,Avatar: The Way of Water,2318552513,683875614,1634676899,1
3,4,2018,Avengers: Infinity War,2048359754,678815482,1369544272,1
4,5,2021,Spider-Man: No Way Home,1910048245,814115070,1095933175,1
...,...,...,...,...,...,...,...
1995,1996,2021,Vanquish,62673,0,62673,0
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,62027,62027,0,1
1997,1998,2018,Tian Xia Di Yi Biao Ju,62007,0,62007,0
1998,1999,2014,Tiger & Bunny the Movie: The Rising,61562,61562,0,1


In [170]:
x = df_action.iloc[:,[False,True,False,True,False,False,False]]
y = df_action.sequel

In [171]:
#Question 3

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.20,random_state=0)



In [173]:

LogRegMovie = LogisticRegression()


#scaler = preprocessing.StandardScaler()
#X_train = scaler.fit_transform(X_train)
#X_test = scaler.transform(X_test)

LogRegMovie.fit(X_train,y_train)



y_pred=LogRegMovie.predict(X_test)

In [175]:

print('Classes', LogRegMovie.classes_)
print('Intercept',LogRegMovie.intercept_)
print("Coefficients", LogRegMovie.coef_ )

Classes [0 1]
Intercept [-3.99712365e-07]
Coefficients [[-8.01730608e-04  1.67812186e-09]]


In [176]:

print("Accuracy", LogRegMovie.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.78
              precision    recall  f1-score   support

           0       0.79      0.99      0.88       312
           1       0.50      0.05      0.08        88

    accuracy                           0.78       400
   macro avg       0.64      0.52      0.48       400
weighted avg       0.72      0.78      0.70       400



In [177]:

x2 = df_action.iloc[:,[False,True,False,False,False,True,False]]
y2 = df_action.sequel
x2

Unnamed: 0,Released,International
0,2009,2138484377
1,2019,1936358755
2,2022,1634676899
3,2018,1369544272
4,2021,1095933175
...,...,...
1995,2021,62673
1996,2011,0
1997,2018,62007
1998,2014,0


In [184]:

X2_train,X2_test,y2_train,y2_test=train_test_split(x2,y2,test_size=0.20,random_state=0)



In [185]:

LogRegMovie = LogisticRegression()


#scaler = preprocessing.StandardScaler()
#X2_train = scaler.fit_transform(X2_train)
#X2_test = scaler.transform(X2_test)

LogRegMovie.fit(X2_train,y2_train)



y2_pred=LogRegMovie.predict(X2_test)

In [186]:

print('Classes', LogRegMovie.classes_)
print('Intercept',LogRegMovie.intercept_)
print("Coefficients", LogRegMovie.coef_ )

Classes [0 1]
Intercept [-3.98217141e-07]
Coefficients [[-7.98703397e-04  2.51613212e-09]]


In [187]:

print("Accuracy", LogRegMovie.score(X2_test, y2_test))
print(classification_report(y2_test, y2_pred))

Accuracy 0.7775
              precision    recall  f1-score   support

           0       0.79      0.98      0.87       312
           1       0.45      0.06      0.10        88

    accuracy                           0.78       400
   macro avg       0.62      0.52      0.49       400
weighted avg       0.71      0.78      0.70       400



In [182]:
# find whether a movie is a sequel or not based on domestic, international, and worldwide income

# set the input and response values(sequel)
x = df_action[["Worldwide","Domestic","International"]]
y = df_action.sequel

In [190]:
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [191]:
LogRegMovie = LogisticRegression()


scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(x_train)
X_test = scaler.transform(x_test)

LogRegMovie.fit(X_train,y_train)



y_pred=LogRegMovie.predict(X_test)

In [192]:
#log regression report 
print('Classes', LogRegMovie.classes_)
print('Intercept',LogRegMovie.intercept_)
print("Coefficients", LogRegMovie.coef_ )

Classes [0 1]
Intercept [-1.48775744]
Coefficients [[0.13439452 0.11288561 0.13837828]]


In [194]:
#Classification report
print("Accuracy", LogRegMovie.score(X_test, y_test))
print(classification_report(y_test, y_pred))

Accuracy 0.775
              precision    recall  f1-score   support

           0       0.78      0.99      0.87       308
           1       0.67      0.04      0.08        92

    accuracy                           0.78       400
   macro avg       0.72      0.52      0.48       400
weighted avg       0.75      0.78      0.69       400

