# Importing Libraries

In [1]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, r2_score

# Loading the dataset 

In [2]:
df = pd.read_csv('IMDb Movies India.csv')

In [3]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


# Understanding of data

In [4]:
df.sample(5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
11978,Room No. 203,(1980),,Mystery,,,Parikshit Sahni,Bindu,Shyamalee,Jagdeep
14413,Toote Tare,(1948),,Drama,,,Harish,Shamim,Mridula Rani,Harish
4267,Durga Maiya,(2002),116 min,Action,,,Devayani,Rami Reddy,Roja,
2840,Chandi Ki Deewar,(1964),,Romance,,,Dilip Bose,Bharat Bhushan,Nutan,Asit Kumar Sen
2247,Bhole Bhale,(1950),,,,,Master Bhagwan,Master Bhagwan,,


In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [6]:
df.describe(include ='O')

Unnamed: 0,Name,Year,Duration,Genre,Votes,Director,Actor 1,Actor 2,Actor 3
count,15509,14981,7240,13632,7920,14984,13892,13125,12365
unique,13838,102,182,485,2034,5938,4718,4891,4820
top,Anjaam,(2019),120 min,Drama,8,Jayant Desai,Ashok Kumar,Rekha,Pran
freq,7,410,240,2780,227,58,158,83,91


In [7]:
df.shape

(15509, 10)

In [8]:
# duplicated rows in data
df.duplicated().sum()

6

In [9]:
# null values in data
df.isna().sum()

Name           0
Year         528
Duration    8269
Genre       1877
Rating      7590
Votes       7589
Director     525
Actor 1     1617
Actor 2     2384
Actor 3     3144
dtype: int64

# Data Cleaning 

In [10]:
# Locating rows with missing values in columns from 1 to 9
df[df.iloc[:, 1:9].isna().all(axis=1)]

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1836,Bang Bang Reloaded,,,,,,,,,
1920,Battle of bittora,,,,,,,,,
2653,Campus,,,,,,,,,
3403,Dancing Dad,,,,,,,,,
3807,Dial 100,,,,,,,,,
4271,Durga Rani Singh,,,,,,,,,
8669,Main Hoon Kaun,,,,,,,,,
9600,Mughal Road,,,,,,,,,


In [12]:
# Unique values present in each column
df.nunique()

Name        13838
Year          102
Duration      182
Genre         485
Rating         84
Votes        2034
Director     5938
Actor 1      4718
Actor 2      4891
Actor 3      4820
dtype: int64

In [13]:
# Handling the null values
df.dropna(subset=['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)

In [22]:
#Extracting only the text part from the Name column
df['Name'] = df['Name'].str.extract('([A-Za-z\s\'\-]+)')

In [25]:
# Replacing the brackets from year column as observed above
df['Year'] = df['Year'].str.replace(r'[()]', '', regex=True).astype(int)

In [28]:
# Convert 'Duration' to numeric and replacing the min, while keeping only numerical part
df['Duration'] = pd.to_numeric(df['Duration'].str.replace(r' min', '', regex=True), errors='coerce')

In [35]:
# Splitting the genre by , to keep only unique genres and replacing the null values with mode
df['Genre'] = df['Genre'].str.split(', ')
df = df.explode('Genre')
df['Genre'].fillna(df['Genre'].mode()[0], inplace=True)

In [41]:
# Convert 'Votes' to numeric and replace the , to keep only numerical part
df['Votes'] = pd.to_numeric(df['Votes'].str.replace(',', ''), errors='coerce')

In [43]:
#checking duplicate values by Name and Year

duplicate = df.groupby(['Name', 'Year']).filter(lambda x: len(x) > 1)
duplicate.head(5)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
3,Yaaram,2019,110,Comedy,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
3,Yaaram,2019,110,Romance,4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,1997,147,Comedy,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,1997,147,Drama,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
5,Aur Pyaar Ho Gaya,1997,147,Musical,4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor


In [46]:
# Dropping the duplicated values by Name
df = df.drop_duplicates(subset=['Name'], keep=False)

# Exploratory Data Analysis 

#### Descriptive Analysis

In [49]:
df.describe()

Unnamed: 0,Year,Duration,Rating,Votes
count,1528.0,1528.0,1528.0,1528.0
mean,1997.972513,123.823953,5.976243,552.479712
std,21.181921,25.108144,1.412547,4311.631841
min,1931.0,45.0,1.6,5.0
25%,1985.0,107.0,5.1,14.0
50%,2004.0,126.0,6.1,34.0
75%,2016.0,140.0,7.0,127.25
max,2021.0,300.0,9.4,101014.0


In [50]:
df.describe(include='O')

Unnamed: 0,Name,Genre,Director,Actor 1,Actor 2,Actor 3
count,1528,1528,1528,1528,1528,1528
unique,1528,20,1114,1010,1131,1154
top,Silence Please,Drama,Kanti Shah,Mithun Chakraborty,Mithun Chakraborty,Pran
freq,1,789,13,22,12,16


In [53]:
# Find the row with the highest number of votes
max_votes_row = df[df['Votes'] == df['Votes'].max()]

# Get the name of the movie with the highest votes
movie_highest_votes = max_votes_row['Name'].values[0]

# Find the number of votes for the movie with the highest votes
votes_highest_votes = max_votes_row['Votes'].values[0]

print("Movie with the highest votes:", movie_highest_votes)
print("Number of votes for the movie with the highest votes:", votes_highest_votes)
print('\n', '='*100, '\n')


# Find the row with the lowest number of votes
min_votes_row = df[df['Votes'] == df['Votes'].min()]

# Get the name of the movie with the lowest votes
movie_lowest_votes = min_votes_row['Name'].values[0]

# Find the number of votes for the movie with the lowest votes
votes_lowest_votes = min_votes_row['Votes'].values[0]

print("Movie with the Lowest votes:", movie_lowest_votes)
print("Number of votes for the movie with the Lowest votes:", votes_lowest_votes)

Movie with the highest votes: My Name Is Khan
Number of votes for the movie with the highest votes: 101014


Movie with the Lowest votes: Anmol Sitaare
Number of votes for the movie with the Lowest votes: 5


In [54]:
# Find the row with the highest rating
max_rating_row = df[df['Rating'] == df['Rating'].max()]
movie_highest_rating = max_rating_row['Name'].values[0]
votes_highest_rating = max_rating_row['Votes'].values[0]

print("Movie with the highest rating:", movie_highest_rating)
print("Number of votes for the movie with the highest rating:", votes_highest_rating)
print('\n', '='*100, '\n')


# Find the row with the lowest rating
min_rating_row = df[df['Rating'] == df['Rating'].min()]
movie_lowest_rating = min_rating_row['Name'].values[0]
votes_lowest_rating = min_rating_row['Votes'].values[0]

print("Movie with the Lowest rating:", movie_lowest_rating)
print("Number of votes for the movie with the Lowest rating:", votes_lowest_rating)

Movie with the highest rating: June
Number of votes for the movie with the highest rating: 18


Movie with the Lowest rating: Mumbai Can Dance Saalaa
Number of votes for the movie with the Lowest rating: 43


In [55]:
# Group the dataset by the 'Director' column and count the number of movies each director has directed
director_counts = df['Director'].value_counts()

# Find the director with the highest number of movies directed
most_prolific_director = director_counts.idxmax()
num_movies_directed = director_counts.max()

print("Director with the highest number of movies directed:", most_prolific_director)
print("Number of movies directed by", most_prolific_director, ":", num_movies_directed)
print('\n', '='*100, '\n')


# Group the dataset by the 'Director' column and count the number of movies each director has directed
director_counts = df['Director'].value_counts()

# Find the director with the lowest number of movies directed
least_prolific_director = director_counts.idxmin()
num_movies_directed = director_counts.min()

print("Director with the lowest number of movies directed:", least_prolific_director)
print("Number of movies directed by", most_prolific_director, ":", num_movies_directed)

Director with the highest number of movies directed: Kanti Shah
Number of movies directed by Kanti Shah : 13


Director with the lowest number of movies directed: Rahul Dahiya
Number of movies directed by Kanti Shah : 1


### Univariate Anaysis

In [79]:
fig_year = px.histogram(df, x = 'Year', histnorm='probability density', nbins = 30)
fig_year.update_layout(title='Distribution of Year',  xaxis_title='Year', yaxis_title='Probability Density')
fig_year.show()

In [78]:
fig_duration = px.histogram(df, x = 'Duration', histnorm='probability density', nbins = 40)
fig_duration.update_layout(title='Distribution of Duration')
fig_duration.show()

In [77]:
fig_rating = px.histogram(df, x = 'Rating', histnorm='probability density', nbins = 40)
fig_rating.update_layout(title='Distribution of Rating', title_x=0.5, title_pad=dict(t=20), title_font=dict(size=20), xaxis_title='Rating', yaxis_title='Probability Density')
fig_rating.show()

In [76]:
fig_votes = px.box(df, x = 'Votes')
fig_votes.update_layout(title='Distribution of Votes',xaxis_title='Votes', yaxis_title='Probability Density')
fig_votes.show()

### Bivariate Analysis