# Importing necessary modules

In [None]:
import numpy as np
import pandas as pd

# Dataset

In [None]:
import kagglehub

# Download the latest version of the "danielgrijalvas/movies" dataset from Kaggle.
# The call returns the local directory that holds the downloaded files.
path = kagglehub.dataset_download("danielgrijalvas/movies")

print("Path to dataset files:", path)

Path to dataset files: /root/.cache/kagglehub/datasets/danielgrijalvas/movies/versions/2


In [None]:
# Read the dataset from the directory returned by kagglehub (`path`) rather than
# a hard-coded cache location, so the cell keeps working on other machines and
# when a newer dataset version is downloaded.
movies = pd.read_csv(f"{path}/movies.csv")

# Exploration

In [None]:
movies.sample(4)  # peek at four randomly chosen rows (no random_state is set, so the selection changes on every run)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
1148,Suspect,R,Crime,1987,"October 23, 1987 (United States)",6.6,9600.0,Peter Yates,Eric Roth,Cher,United States,14500000.0,18782400.0,TriStar Pictures,121.0
4597,Johnson Family Vacation,PG-13,Comedy,2004,"April 7, 2004 (United States)",4.5,7400.0,Christopher Erskin,Todd R. Jones,Cedric the Entertainer,United States,12000000.0,31326183.0,Fox Searchlight Pictures,97.0
2824,Rumpelstiltskin,R,Comedy,1995,"November 24, 1995 (United States)",4.5,2200.0,Mark Jones,Mark Jones,Max Grodénchik,United States,3000000.0,,Republic Pictures (II),87.0
4345,A Man Apart,R,Action,2003,"April 4, 2003 (United States)",6.1,46000.0,F. Gary Gray,Christian Gudegast,Vin Diesel,United States,36000000.0,44350926.0,"""DIA"" Productions GmbH & Co. KG",109.0


In [None]:
# Summarize each column's dtype and non-null count to spot missing data
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


***The non-null counts above show that several columns contain missing values — most notably `budget`, where only 5,497 of the 7,668 entries are present.***



---





# Cleaning the Dataset

In [None]:
movies.isna().sum()  # count the missing (NaN) values in each column

Unnamed: 0,0
name,0
rating,77
genre,0
year,0
released,2
score,3
votes,3
director,0
writer,3
star,1


In [None]:
# Count the movies whose budget and gross revenue are both recorded as 0.
# The vectorized Series.sum() replaces the builtin sum(), which walks the
# boolean Series element by element in Python.
((movies['budget'] == 0) & (movies['gross'] == 0)).sum()

0

In [None]:
# Count the movies whose budget is recorded as 0 (vectorized Series.sum()
# instead of the slower builtin sum() over the boolean Series).
(movies['budget'] == 0).sum()

0

In [None]:
# Count the movies whose gross revenue is recorded as 0 (vectorized Series.sum()
# instead of the slower builtin sum() over the boolean Series).
(movies['gross'] == 0).sum()

0

In [None]:
# Drop every row that has a missing value in any of the columns below.
# 'score' is listed explicitly (it has 3 nulls in the raw data) so that a
# null-free result does not rely on those rows also lacking another field.
# The result is reassigned instead of using inplace=True, which avoids mutating
# the frame in place while keeping the `movies` name for the cells that follow.
movies = movies.dropna(
    subset=['runtime', 'released', 'star', 'country', 'votes', 'writer',
            'score', 'budget', 'gross', 'company', 'rating']
)

In [None]:
# Re-count the missing values per column to confirm the rows with gaps are gone
movies.isna().sum()

Unnamed: 0,0
name,0
rating,0
genre,0
year,0
released,0
score,0
votes,0
director,0
writer,0
star,0


# Feature Engineering

In [None]:
# New feature — ROI (%): profit (gross - budget) as a percentage of the budget
movies['ROI (in %)'] = (
    movies['gross']
    .sub(movies['budget'])
    .div(movies['budget'])
    .mul(100)
)

In [None]:
# New feature — profit margin (%): profit (gross - budget) as a percentage of the gross revenue
movies['Profit Margin (in %)'] = (
    movies['gross']
    .sub(movies['budget'])
    .div(movies['gross'])
    .mul(100)
)

In [None]:
# Show the first two rows to check the newly added ROI and profit-margin columns
movies.head(2)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,ROI (in %),Profit Margin (in %)
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0,147.361958,59.573412
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,1207.8468,92.353845


In [None]:
# Re-inspect dtypes and non-null counts after cleaning and feature engineering
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5421 entries, 0 to 7652
Data columns (total 17 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   name                  5421 non-null   object 
 1   rating                5421 non-null   object 
 2   genre                 5421 non-null   object 
 3   year                  5421 non-null   int64  
 4   released              5421 non-null   object 
 5   score                 5421 non-null   float64
 6   votes                 5421 non-null   float64
 7   director              5421 non-null   object 
 8   writer                5421 non-null   object 
 9   star                  5421 non-null   object 
 10  country               5421 non-null   object 
 11  budget                5421 non-null   float64
 12  gross                 5421 non-null   float64
 13  company               5421 non-null   object 
 14  runtime               5421 non-null   float64
 15  ROI (in %)            5421

In [None]:
movies.to_csv("movies_cleaned.csv", index=False)  # save the cleaned dataset with the new features; index=False leaves the row index out of the file