In [1]:
# Initial imports
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
%matplotlib inline

In [2]:
# Loading data
data_df = pd.read_csv('C:/Users/shand/group3_final_project/Archive/movies.csv')

# Review the DataFrame
data_df.head()


Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [3]:
# Trimming the dataframe
trimmed_votes_df = data_df.drop(data_df[data_df.votes <1000].index)
trimmed_df = trimmed_votes_df.dropna()
trimmed_df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [4]:
# Sort the dataframe by votes
vote_sorted_df = trimmed_df.sort_values("votes", ignore_index=True)
vote_sorted_df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,Brenda Starr,PG,Adventure,1989,"April 15, 1992 (United States)",4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0
1,Twice in a Lifetime,R,Drama,1985,"November 8, 1985 (United States)",6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0
2,Five Days One Summer,PG,Drama,1982,1982 (Japan),6.1,1000.0,Fred Zinnemann,Michael Austin,Sean Connery,United States,15000000.0,199078.0,Cable and Wireless Finance,108.0
3,There Goes My Baby,R,Comedy,1994,"September 2, 1994 (United States)",6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0
4,The Taking of Beverly Hills,R,Action,1991,"October 11, 1991 (United States)",5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0


In [5]:
# Splitting the released date column and extracting the date
def extract_month(date_str):
    date_only_str = date_str.split('(')[0].strip()  # Extracting date part
    date_obj = pd.to_datetime(date_only_str, format="%B %d, %Y", errors='coerce')
    return date_obj

# Apply the function to the 'released' column
vote_sorted_df['release_date'] = vote_sorted_df['released'].apply(extract_month)
# Extracting month in words from the 'release_date' column
vote_sorted_df['month'] = vote_sorted_df['release_date'].dt.strftime('%B')
clean_df = vote_sorted_df.drop(columns=['release_date', 'released'])
clean_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,month
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,April
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,November
2,Five Days One Summer,PG,Drama,1982,6.1,1000.0,Fred Zinnemann,Michael Austin,Sean Connery,United States,15000000.0,199078.0,Cable and Wireless Finance,108.0,
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,September
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,October


In [6]:
# Determining the success/failure of the movie
clean_df['gross_by_budget']= clean_df['gross']/clean_df['budget']
clean_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,month,gross_by_budget
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,April,0.004242
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,November,1.050303
2,Five Days One Summer,PG,Drama,1982,6.1,1000.0,Fred Zinnemann,Michael Austin,Sean Connery,United States,15000000.0,199078.0,Cable and Wireless Finance,108.0,,0.013272
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,September,0.011763
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,October,0.049436


In [8]:
# Creating a new column and categorizing it (success or failure) based on the condition specified
clean_df['success_status'] = clean_df['gross_by_budget'].map(lambda x: x>3).astype(int)
clean_df = clean_df.dropna()
clean_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,month,gross_by_budget,success_status
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,April,0.004242,0
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,November,1.050303,0
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,September,0.011763,0
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,October,0.049436,0
5,Eddie Macon's Run,PG,Action,1983,5.7,1100.0,Jeff Kanew,James McLendon,Kirk Douglas,United States,5000000.0,1262691.0,Universal Pictures,95.0,March,0.252538,0


# RANDOM FOREST

In [9]:
# Define features set
X = clean_df.copy()
X.drop("success_status", axis=1, inplace=True)
X.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,month,gross_by_budget
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,April,0.004242
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,November,1.050303
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,September,0.011763
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,October,0.049436
5,Eddie Macon's Run,PG,Action,1983,5.7,1100.0,Jeff Kanew,James McLendon,Kirk Douglas,United States,5000000.0,1262691.0,Universal Pictures,95.0,March,0.252538


In [10]:
# Define target vector
y = clean_df["success_status"].values.reshape(-1, 1)
y[:5]


array([[0],
       [0],
       [0],
       [0],
       [0]])

In [11]:
# Insert dummies for categorical values
X = pd.get_dummies(X)

In [12]:
X

Unnamed: 0,year,score,votes,budget,gross,runtime,gross_by_budget,name_*batteries not included,name_10 Cloverfield Lane,name_10 Things I Hate About You,...,month_December,month_February,month_January,month_July,month_June,month_March,month_May,month_November,month_October,month_September
0,1989,4.8,1000.0,16000000.0,6.787800e+04,93.0,0.004242,False,False,False,...,False,False,False,False,False,False,False,False,False,False
1,1985,6.5,1000.0,8000000.0,8.402424e+06,111.0,1.050303,False,False,False,...,False,False,False,False,False,False,False,True,False,False
3,1994,6.4,1000.0,10500000.0,1.235090e+05,99.0,0.011763,False,False,False,...,False,False,False,False,False,False,False,False,False,True
4,1991,5.1,1100.0,19000000.0,9.392770e+05,96.0,0.049436,False,False,False,...,False,False,False,False,False,False,False,False,True,False
5,1983,5.7,1100.0,5000000.0,1.262691e+06,95.0,0.252538,False,False,False,...,False,False,False,False,False,True,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5379,1994,8.9,1900000.0,8000000.0,2.139288e+08,154.0,26.741095,False,False,False,...,False,False,False,False,False,False,False,False,True,False
5380,1994,8.8,1900000.0,55000000.0,6.782261e+08,142.0,12.331384,False,False,False,...,False,False,False,True,False,False,False,False,False,False
5381,2010,8.8,2100000.0,160000000.0,8.368370e+08,148.0,5.230231,False,False,False,...,False,False,False,True,False,False,False,False,False,False
5382,2008,9.0,2400000.0,185000000.0,1.005974e+09,152.0,5.437695,False,False,False,...,False,False,False,True,False,False,False,False,False,False


In [13]:
# Splitting into Train and Test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=78)

In [14]:
# Creating StandardScaler instance
scaler = StandardScaler()
# Fitting Standard Scaller
X_scaler = scaler.fit(X_train)
# Scaling data
X_train_scaled = X_scaler.transform(X_train)
X_test_scaled = X_scaler.transform(X_test)

 # Fitting the Random Forest Model

In [15]:
# Create a random forest classifier
rf_model = RandomForestClassifier(n_estimators=500,max_depth=30, random_state=78)
# rf_model = RandomForestClassifier(n_estimators=500,random_state=78)

In [16]:
# Fitting the model
# rf_model = rf_model.fit(X_train_scaled, y_train)
rf_model = rf_model.fit(X_train_scaled, y_train.ravel())

#  Making Predictions Using the Random Forest Model

In [17]:
# Making predictions using the testing data
predictions = rf_model.predict(X_test_scaled)

#  Model Evaluation

In [18]:
 # Calculating the confusion matrix
cm = confusion_matrix(y_test, predictions)
cm_df = pd.DataFrame(
    cm, index=["Actual 0", "Actual 1"], columns=["Predicted 0", "Predicted 1"]
)

# Calculating the accuracy score
acc_score = accuracy_score(y_test, predictions)

In [19]:
# Displaying results
print("Confusion Matrix")
display(cm_df)
print(f"Accuracy Score : {acc_score}")
print("Classification Report")
print(classification_report(y_test, predictions))

Confusion Matrix


Unnamed: 0,Predicted 0,Predicted 1
Actual 0,889,0
Actual 1,115,339


Accuracy Score : 0.9143708116157856
Classification Report
              precision    recall  f1-score   support

           0       0.89      1.00      0.94       889
           1       1.00      0.75      0.85       454

    accuracy                           0.91      1343
   macro avg       0.94      0.87      0.90      1343
weighted avg       0.92      0.91      0.91      1343

