In [2]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.metrics import roc_auc_score, roc_curve, accuracy_score
from sklearn.metrics import precision_score, recall_score
from sklearn import preprocessing
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
#Create DF
df = pd.read_csv('All Time Worldwide Box Office for Action Movies - Sheet1.csv')
df

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,"$2,923,706,026","$785,221,649","$2,138,484,377"
1,2,2019,Avengers: Endgame,"$2,794,731,755","$858,373,000","$1,936,358,755"
2,3,2022,Avatar: The Way of Water,"$2,318,552,513","$683,875,614","$1,634,676,899"
3,4,2018,Avengers: Infinity War,"$2,048,359,754","$678,815,482","$1,369,544,272"
4,5,2021,Spider-Man: No Way Home,"$1,910,048,245","$814,115,070","$1,095,933,175"
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,"$62,673",,"$62,673"
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,"$62,027","$62,027",
1997,1998,2018,Tian Xia Di Yi Biao Ju,"$62,007",,"$62,007"
1998,1999,2014,Tiger & Bunny the Movie: The Rising,"$61,562","$61,562",


In [4]:
df

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,"$2,923,706,026","$785,221,649","$2,138,484,377"
1,2,2019,Avengers: Endgame,"$2,794,731,755","$858,373,000","$1,936,358,755"
2,3,2022,Avatar: The Way of Water,"$2,318,552,513","$683,875,614","$1,634,676,899"
3,4,2018,Avengers: Infinity War,"$2,048,359,754","$678,815,482","$1,369,544,272"
4,5,2021,Spider-Man: No Way Home,"$1,910,048,245","$814,115,070","$1,095,933,175"
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,"$62,673",,"$62,673"
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,"$62,027","$62,027",
1997,1998,2018,Tian Xia Di Yi Biao Ju,"$62,007",,"$62,007"
1998,1999,2014,Tiger & Bunny the Movie: The Rising,"$61,562","$61,562",


In [5]:
#Clean Data
df_action = df
feature = ['Worldwide\r\nBox Office','Domestic\r\nBox Office', 'International\r\nBox Office']
df_action[feature] = df_action[feature].replace(',','', regex = True)
df_action['Worldwide\r\nBox Office'] = df_action['Worldwide\r\nBox Office'].str.replace('$','')
df_action['Domestic\r\nBox Office'] = df_action['Domestic\r\nBox Office'].str.replace('$','')
df_action['International\r\nBox Office'] = df_action['International\r\nBox Office'].str.replace('$','')
df_action.fillna(0,inplace = True)
df_action 

Unnamed: 0,Rank,Released,Movie,Worldwide\r\nBox Office,Domestic\r\nBox Office,International\r\nBox Office
0,1,2009,Avatar,2923706026,785221649,2138484377
1,2,2019,Avengers: Endgame,2794731755,858373000,1936358755
2,3,2022,Avatar: The Way of Water,2318552513,683875614,1634676899
3,4,2018,Avengers: Infinity War,2048359754,678815482,1369544272
4,5,2021,Spider-Man: No Way Home,1910048245,814115070,1095933175
...,...,...,...,...,...,...
1995,1996,2021,Vanquish,62673,0,62673
1996,1997,2011,Gekijouban Trigun: Badlands Rumble,62027,62027,0
1997,1998,2018,Tian Xia Di Yi Biao Ju,62007,0,62007
1998,1999,2014,Tiger & Bunny the Movie: The Rising,61562,61562,0


In [6]:
#Further Cleaning, rename cols and correct data types
df_action['Worldwide\r\nBox Office'] = df_action['Worldwide\r\nBox Office'].astype(int)
df_action['Domestic\r\nBox Office'] = df_action['Domestic\r\nBox Office'].astype(int)
df_action['International\r\nBox Office'] = df_action['International\r\nBox Office'].astype(int)
df_action.rename(columns = {'Worldwide\r\nBox Office': 'Worldwide','Domestic\r\nBox Office':'Domestic','International\r\nBox Office':'International'}, inplace=True )


In [7]:



originals = pd.DataFrame(columns=df_action.columns)

# Preprocess the movie titles
df_action['Movie'] = df_action['Movie'].str.lower()

# Define stop words
stop_words = set(['a', 'an', 'the', 'of', 'in'])  

# Initialize 'Sequel' column with default value of 0
df_action['Sequel'] = 0

# Function to check if a movie title contains words from earlier movies or has a number after it
#examples that are sequels:
#Avengers: Endgame (contains avengers, an earlier movie)
#John Wick Chapter 3: (contains 3, as well as 'John Wick')
#Imperfect, probably some false positives and negatives
#'Goldfinger' James Bond movie but no realistic way to detect that
#'Fast' is a relatively common word and might return false positives but no efficient way to filter
def is_sequel(df, originals):
    #Sort values by release date to make sure function moves forwards in time
    df = df.sort_values('Released')
    #Iterate through rows in df
    for index, row in df.iterrows():
        #Get title and release year
        title = row['Movie']
        year = row['Released']
        #Title words = set of non stop words in title
        title_words = set([word for word in title.split() if word not in stop_words]) 
        #If reasonable sequel number found in title, sequel col for row = 1 (read as true)
        if any(word.isdigit() and (2 <= int(word) < 10) for word in title_words):
            df.loc[index, 'Sequel'] = 1
        #If title made of unique non stop words, adds it to originals, which future movies will be compared against
        #Otherwise, make it a sequel
        else:
            found_in_originals = any(title_words.intersection(original_words) for original_words in originals['Movie'].str.lower().str.split())
            if not found_in_originals:
                originals = pd.concat([originals, row.to_frame().T], ignore_index=True)  # Concatenate the row to the 'originals' DataFrame
            else:
                df.loc[index, 'Sequel'] = 1
                
    return originals

originals = is_sequel(df_action, originals)

# Filter the DataFrame based on the "Sequel" column
sequel_movies = df[df['Sequel'] == 1].reset_index().drop(columns='index')
print(sequel_movies.head())
# Print the columns of the filtered DataFrame
df_action


Empty DataFrame
Columns: [Rank, Released, Movie, Worldwide, Domestic, International, Sequel]
Index: []


Unnamed: 0,Rank,Released,Movie,Worldwide,Domestic,International,Sequel
0,1,2009,avatar,2923706026,785221649,2138484377,0
1,2,2019,avengers: endgame,2794731755,858373000,1936358755,0
2,3,2022,avatar: the way of water,2318552513,683875614,1634676899,0
3,4,2018,avengers: infinity war,2048359754,678815482,1369544272,0
4,5,2021,spider-man: no way home,1910048245,814115070,1095933175,0
...,...,...,...,...,...,...,...
1995,1996,2021,vanquish,62673,0,62673,0
1996,1997,2011,gekijouban trigun: badlands rumble,62027,62027,0,0
1997,1998,2018,tian xia di yi biao ju,62007,0,62007,0
1998,1999,2014,tiger & bunny the movie: the rising,61562,61562,0,0


In [72]:
#Question 3

x = df_action.iloc[:,[False,True,False,False,False,False,True]]
y = df_action.International


Unnamed: 0,Released,Sequel
0,2009,0
1,2019,0
2,2022,0
3,2018,0
4,2021,0
...,...,...
1995,2021,0
1996,2011,0
1997,2018,0
1998,2014,0


In [73]:
#Question 3

X_train,X_test,y_train,y_test=train_test_split(x,y,test_size=0.25,random_state=0)



In [75]:
#question 3
LogRegMovie = LogisticRegression()


scaler = preprocessing.StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

LogRegMovie.fit(X_train,y_train)



y_pred=LogRegMovie.predict(X_test)

In [77]:
#question 4
print('Classes', LogRegMovie.classes_)
print('Intercept',LogRegMovie.intercept_)
print("Coefficients", LogRegMovie.coef_ )

Classes [         0         72         88 ... 1369544272 1634676899 2138484377]
Intercept [ 5.01526768 -0.44184864  0.17820821 ... -0.06488044 -0.18001704
  0.11534876]
Coefficients [[-2.12171801  0.        ]
 [-1.90597621  0.        ]
 [-0.80422453  0.        ]
 ...
 [ 0.27591035  0.        ]
 [ 0.50760318  0.        ]
 [-0.20479189  0.        ]]


In [79]:
#question 5
print("Accuracy", LogRegMovie.score(X_test, y_test))
print(classification_report(y_test, LogRegMovie.predict(X_test)))

Accuracy 0.16
              precision    recall  f1-score   support

           0       0.16      1.00      0.28        80
         436       0.00      0.00      0.00         1
         524       0.00      0.00      0.00         1
        2003       0.00      0.00      0.00         1
        4464       0.00      0.00      0.00         1
       24539       0.00      0.00      0.00         1
       26741       0.00      0.00      0.00         1
       28854       0.00      0.00      0.00         1
       30589       0.00      0.00      0.00         1
       34070       0.00      0.00      0.00         1
       39928       0.00      0.00      0.00         1
       42065       0.00      0.00      0.00         1
       61410       0.00      0.00      0.00         1
       63791       0.00      0.00      0.00         1
       64432       0.00      0.00      0.00         1
       67760       0.00      0.00      0.00         1
       68364       0.00      0.00      0.00         1
       68672 

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [90]:
#Question 7(4)
x2 = df_action.iloc[:,[False,True,False,False,False,False,True]]
y2 = df_action.Worldwide

In [88]:
 #question7(4)
 X2_train,X2_test,y2_train,y2_test=train_test_split(x2,y2,test_size=0.25,random_state=0)



In [89]:
#question7 (4)
LogRegMovie = LogisticRegression()


scaler = preprocessing.StandardScaler()
X2_train = scaler.fit_transform(X2_train)
X2_test = scaler.transform(X2_test)

LogRegMovie.fit(X2_train,y2_train)



y_pred=LogRegMovie.predict(X2_test)

In [91]:
#question 7 (4)
print('Classes', LogRegMovie.classes_)
print('Intercept',LogRegMovie.intercept_)
print("Coefficients", LogRegMovie.coef_ )

Classes [     61562      62007      62027 ... 2048359754 2318552513 2923706026]
Intercept [ 0.07762297  0.0223683   0.10323661 ...  0.0223683  -0.05737846
  0.11304168]
Coefficients [[ 0.20584166  0.        ]
 [ 0.3965859   0.        ]
 [ 0.07164976  0.        ]
 ...
 [ 0.3965859   0.        ]
 [ 0.60158731  0.        ]
 [-0.01395377  0.        ]]


In [92]:
#question7(5)
print("Accuracy", LogRegMovie.score(X2_test, y2_test))
print(classification_report(y_test, LogRegMovie.predict(X2_test)))

Accuracy 0.002
              precision    recall  f1-score   support

           0       0.00      0.00      0.00      80.0
         436       0.00      0.00      0.00       1.0
         524       0.00      0.00      0.00       1.0
        2003       0.00      0.00      0.00       1.0
        4464       0.00      0.00      0.00       1.0
       24539       0.00      0.00      0.00       1.0
       26741       0.00      0.00      0.00       1.0
       28854       0.00      0.00      0.00       1.0
       30589       0.00      0.00      0.00       1.0
       34070       0.00      0.00      0.00       1.0
       39928       0.00      0.00      0.00       1.0
       42065       0.00      0.00      0.00       1.0
       61410       0.00      0.00      0.00       1.0
       63791       0.00      0.00      0.00       1.0
       64432       0.00      0.00      0.00       1.0
       67760       0.00      0.00      0.00       1.0
       68364       0.00      0.00      0.00       1.0
       68672

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
