In [17]:
# Initial imports
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)
import pandas as pd
from pathlib import Path
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.preprocessing import StandardScaler

In [18]:
# Read the movies dataset from the CSV file (path is relative to the working directory)
movies_csv = Path('movies.csv')
data_df = pd.read_csv(movies_csv)
# Preview the first rows to confirm the load
data_df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [23]:
# Remove low-vote titles first, then discard incomplete rows
# Rows with a missing vote count are not matched by this comparison; the dropna below removes them
low_vote_mask = data_df['votes'] < 1000
low_vote_labels = data_df.loc[low_vote_mask].index
trimmed_votes_df = data_df.drop(index=low_vote_labels)
# Keep only the rows that have a value in every column
trimmed_df = trimmed_votes_df.dropna(axis=0, how='any')
trimmed_df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0


In [24]:
# Order the rows by ascending vote count and renumber the index from 0
vote_sorted_df = (
    trimmed_df
    .sort_values(by="votes")
    .reset_index(drop=True)
)
vote_sorted_df.head()

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
0,Brenda Starr,PG,Adventure,1989,"April 15, 1992 (United States)",4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0
1,Twice in a Lifetime,R,Drama,1985,"November 8, 1985 (United States)",6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0
2,Five Days One Summer,PG,Drama,1982,1982 (Japan),6.1,1000.0,Fred Zinnemann,Michael Austin,Sean Connery,United States,15000000.0,199078.0,Cable and Wireless Finance,108.0
3,There Goes My Baby,R,Comedy,1994,"September 2, 1994 (United States)",6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0
4,The Taking of Beverly Hills,R,Action,1991,"October 11, 1991 (United States)",5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0


In [25]:
# Splitting the released date column and extracting the date
def extract_date(date_str):
    """Parse the calendar date out of a ``released`` string.

    The expected input looks like ``"June 13, 1980 (United States)"``: a
    ``Month day, year`` date optionally followed by a parenthesised suffix.

    Parameters
    ----------
    date_str : str
        Raw value from the ``released`` column. Non-string values
        (``None``, float NaN) are treated as missing.

    Returns
    -------
    pandas.Timestamp or pandas.NaT
        The parsed date, or ``NaT`` when the input is missing or the text
        before the first ``(`` is not a full ``Month day, year`` date
        (for example ``"1982 (Japan)"``).
    """
    # A missing cell is not a string; calling .split on it would raise
    # AttributeError, so report it as NaT like any other unparseable value.
    if not isinstance(date_str, str):
        return pd.NaT

    # Keep only the text before the first '(' and trim surrounding whitespace.
    date_only_str = date_str.split('(', 1)[0].strip()

    # errors='coerce' turns text that does not match the format into NaT
    # instead of raising.
    return pd.to_datetime(date_only_str, format="%B %d, %Y", errors='coerce')

# Parse every 'released' string into a timestamp (NaT where parsing fails)
parsed_release_dates = vote_sorted_df['released'].apply(extract_date)
vote_sorted_df['release_date'] = parsed_release_dates

# Render the parsed dates as MM-DD-YYYY text
vote_sorted_df['formatted_date'] = parsed_release_dates.dt.strftime("%m-%d-%Y")

# Keep only the formatted text column; drop the raw string and the intermediate timestamp
clean_df = vote_sorted_df.drop(['release_date', 'released'], axis=1)

# Preview the frame with the extracted date
clean_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,formatted_date
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,04-15-1992
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,11-08-1985
2,Five Days One Summer,PG,Drama,1982,6.1,1000.0,Fred Zinnemann,Michael Austin,Sean Connery,United States,15000000.0,199078.0,Cable and Wireless Finance,108.0,
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,09-02-1994
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,10-11-1991


In [28]:
# Basis for the success/failure label: ratio of the gross column to the budget column
gross_to_budget_ratio = clean_df['gross'].div(clean_df['budget'])
clean_df['gross_by_budget'] = gross_to_budget_ratio
clean_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,formatted_date,gross_by_budget
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,04-15-1992,0.004242
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,11-08-1985,1.050303
2,Five Days One Summer,PG,Drama,1982,6.1,1000.0,Fred Zinnemann,Michael Austin,Sean Connery,United States,15000000.0,199078.0,Cable and Wireless Finance,108.0,,0.013272
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,09-02-1994,0.011763
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,10-11-1991,0.049436


In [30]:
# Label each movie: 1 when gross_by_budget is greater than 3, otherwise 0
exceeds_threshold = clean_df['gross_by_budget'] > 3
clean_df['success_failure'] = exceeds_threshold.astype(int)
# Discard rows that still contain a missing value in any column
clean_df = clean_df.dropna()
clean_df.head()

Unnamed: 0,name,rating,genre,year,score,votes,director,writer,star,country,budget,gross,company,runtime,formatted_date,gross_by_budget,success_failure
0,Brenda Starr,PG,Adventure,1989,4.8,1000.0,Robert Ellis Miller,Noreen Stone,Brooke Shields,United States,16000000.0,67878.0,AM/PM Entertainment,93.0,04-15-1992,0.004242,0
1,Twice in a Lifetime,R,Drama,1985,6.5,1000.0,Bud Yorkin,Colin Welland,Gene Hackman,United States,8000000.0,8402424.0,Bud Yorkin Productions,111.0,11-08-1985,1.050303,0
3,There Goes My Baby,R,Comedy,1994,6.4,1000.0,Floyd Mutrux,Floyd Mutrux,Dermot Mulroney,United States,10500000.0,123509.0,Nelson Entertainment,99.0,09-02-1994,0.011763,0
4,The Taking of Beverly Hills,R,Action,1991,5.1,1100.0,Sidney J. Furie,Sidney J. Furie,Ken Wahl,United States,19000000.0,939277.0,Nelson Entertainment,96.0,10-11-1991,0.049436,0
5,Eddie Macon's Run,PG,Action,1983,5.7,1100.0,Jeff Kanew,James McLendon,Kirk Douglas,United States,5000000.0,1262691.0,Universal Pictures,95.0,03-25-1983,0.252538,0
