# Importing necessary modules

In [4]:
import numpy as np
import pandas as pd

# Dataset

In [5]:
import kagglehub

# Kaggle handle of the movies dataset.
dataset_handle = "danielgrijalvas/movies"

# Download the latest version; the call returns the directory holding the files.
path = kagglehub.dataset_download(dataset_handle)

print("Path to dataset files:", path)

Path to dataset files: /kaggle/input/movies


In [6]:
# Read the dataset from the directory returned by kagglehub (`path`) instead of a
# hard-coded Kaggle-only location, so the notebook also runs outside Kaggle.
movies = pd.read_csv(f"{path}/movies.csv")

# Exploration

In [7]:
# Look at 10 random rows; the fixed seed makes the same rows appear on every re-run.
movies.sample(10, random_state=42)

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime
1543,The Return of Swamp Thing,PG-13,Action,1989,"May 12, 1989 (United States)",4.7,4400.0,Jim Wynorski,Neil Cuthbert,Dick Durock,United States,4000000.0,192816.0,Lightyear Entertainment,88.0
731,Trouble in Mind,R,Comedy,1985,March 1986 (United States),6.5,1800.0,Alan Rudolph,Alan Rudolph,Kris Kristofferson,United States,,19632.0,Pfeiffer/Blocker Production,111.0
4450,Eternal Sunshine of the Spotless Mind,R,Drama,2004,"March 19, 2004 (United States)",8.3,935000.0,Michel Gondry,Charlie Kaufman,Jim Carrey,United States,20000000.0,74036715.0,Focus Features,108.0
5601,Love Happens,PG-13,Drama,2009,"September 18, 2009 (United States)",5.7,32000.0,Brandon Camp,Brandon Camp,Jennifer Aniston,United States,18000000.0,36088028.0,Universal Pictures,109.0
3736,Drowning Mona,PG-13,Comedy,2000,"March 3, 2000 (United States)",5.7,14000.0,Nick Gomez,Peter Steinfeld,Danny DeVito,United States,37000000.0,15910104.0,Code Entertainment,96.0
2392,Kika,Unrated,Comedy,1993,"May 6, 1994 (United States)",6.5,13000.0,Pedro Almodóvar,Pedro Almodóvar,Peter Coyote,Spain,,2019581.0,El Deseo,114.0
3269,The Mask of Zorro,PG-13,Action,1998,"July 17, 1998 (United States)",6.8,173000.0,Martin Campbell,Johnston McCulley,Antonio Banderas,United States,95000000.0,250288523.0,TriStar Pictures,136.0
3526,Runaway Bride,PG,Comedy,1999,"July 30, 1999 (United States)",5.6,95000.0,Garry Marshall,Josann McGibbon,Julia Roberts,United States,70000000.0,309460292.0,Paramount Pictures,116.0
786,Taipei Story,Not Rated,Drama,1985,1985 (Taiwan),7.7,2500.0,Edward Yang,T'ien-wen Chu,Chin Tsai,Taiwan,,35336.0,Evergreen Film Company,119.0
6559,A Little Chaos,R,Drama,2014,"June 26, 2015 (United States)",6.5,22000.0,Alan Rickman,Jeremy Brock,Kate Winslet,United Kingdom,,10084623.0,BBC Films,112.0


In [8]:
movies.info() # column dtypes and non-null counts, to spot columns with missing data

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7668 entries, 0 to 7667
Data columns (total 15 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   name      7668 non-null   object 
 1   rating    7591 non-null   object 
 2   genre     7668 non-null   object 
 3   year      7668 non-null   int64  
 4   released  7666 non-null   object 
 5   score     7665 non-null   float64
 6   votes     7665 non-null   float64
 7   director  7668 non-null   object 
 8   writer    7665 non-null   object 
 9   star      7667 non-null   object 
 10  country   7665 non-null   object 
 11  budget    5497 non-null   float64
 12  gross     7479 non-null   float64
 13  company   7651 non-null   object 
 14  runtime   7664 non-null   float64
dtypes: float64(5), int64(1), object(9)
memory usage: 898.7+ KB


***Several columns have fewer non-null values than the 7,668 total rows (most notably `budget`, with only 5,497), so the dataset contains missing values.***



---





# Cleaning the Dataset

In [9]:
movies.isna().sum() # number of missing (NaN) values in each column

Unnamed: 0,0
name,0
rating,77
genre,0
year,0
released,2
score,3
votes,3
director,0
writer,3
star,1


In [10]:
# Number of movies where budget and revenue are both recorded as 0
# (vectorised Series.sum() instead of iterating with the builtin sum()).
((movies['budget'] == 0) & (movies['gross'] == 0)).sum()

0

In [11]:
# Number of movies whose budget is recorded as 0
# (vectorised Series.sum() instead of iterating with the builtin sum()).
(movies['budget'] == 0).sum()

0

In [12]:
# Number of movies whose revenue is recorded as 0
# (vectorised Series.sum() instead of iterating with the builtin sum()).
(movies['gross'] == 0).sum()

0

In [13]:
# Columns in which a missing value causes the whole row to be discarded.
required_columns = [
    'runtime', 'released', 'star', 'country', 'votes',
    'writer', 'budget', 'gross', 'company', 'rating',
]

# Remove, in place, every row that has a missing value in any of these columns.
movies.dropna(subset=required_columns, inplace=True)

In [14]:
movies.isna().sum() # re-count missing values per column after dropping the incomplete rows

Unnamed: 0,0
name,0
rating,0
genre,0
year,0
released,0
score,0
votes,0
director,0
writer,0
star,0


# Feature Engineering

In [15]:
# ROI: net return (gross minus budget) as a percentage of the budget.
net_return = movies['gross'] - movies['budget']
movies['ROI (in %)'] = (net_return / movies['budget']) * 100

In [16]:
# Profit margin: profit (gross minus budget) as a percentage of the gross revenue.
profit = movies['gross'] - movies['budget']
movies['Profit Margin (in %)'] = (profit / movies['gross']) * 100

In [17]:
movies.head() # first five rows, now including the ROI and profit-margin columns

Unnamed: 0,name,rating,genre,year,released,score,votes,director,writer,star,country,budget,gross,company,runtime,ROI (in %),Profit Margin (in %)
0,The Shining,R,Drama,1980,"June 13, 1980 (United States)",8.4,927000.0,Stanley Kubrick,Stephen King,Jack Nicholson,United Kingdom,19000000.0,46998772.0,Warner Bros.,146.0,147.361958,59.573412
1,The Blue Lagoon,R,Adventure,1980,"July 2, 1980 (United States)",5.8,65000.0,Randal Kleiser,Henry De Vere Stacpoole,Brooke Shields,United States,4500000.0,58853106.0,Columbia Pictures,104.0,1207.8468,92.353845
2,Star Wars: Episode V - The Empire Strikes Back,PG,Action,1980,"June 20, 1980 (United States)",8.7,1200000.0,Irvin Kershner,Leigh Brackett,Mark Hamill,United States,18000000.0,538375067.0,Lucasfilm,124.0,2890.972594,96.656606
3,Airplane!,PG,Comedy,1980,"July 2, 1980 (United States)",7.7,221000.0,Jim Abrahams,Jim Abrahams,Robert Hays,United States,3500000.0,83453539.0,Paramount Pictures,88.0,2284.386829,95.80605
4,Caddyshack,R,Comedy,1980,"July 25, 1980 (United States)",7.3,108000.0,Harold Ramis,Brian Doyle-Murray,Chevy Chase,United States,6000000.0,39846344.0,Orion Pictures,98.0,564.105733,84.942157


In [18]:
# movies.to_csv("movies_cleaned.csv", index=False) #to save the new updated dataset