In [413]:
import pandas as pd 
import numpy as np 
import matplotlib.pyplot as plt 
import seaborn as sbr
import plotly.express as px 
from sklearn.model_selection import train_test_split


**DATA CLEANING**

In [414]:
df=pd.read_csv("IMDb Movies India.csv",encoding='latin') 

In [415]:
df.head()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
0,,,,Drama,,,J.S. Randhawa,Manmauji,Birbal,Rajendra Bhatia
1,#Gadhvi (He thought he was Gandhi),(2019),109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
2,#Homecoming,(2021),90 min,"Drama, Musical",,,Soumyajit Majumdar,Sayani Gupta,Plabita Borthakur,Roy Angana
3,#Yaaram,(2019),110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
4,...And Once Again,(2010),105 min,Drama,,,Amol Palekar,Rajat Kapoor,Rituparna Sengupta,Antara Mali


In [416]:
df.tail()

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
15504,Zulm Ko Jala Doonga,(1988),,Action,4.6,11.0,Mahendra Shah,Naseeruddin Shah,Sumeet Saigal,Suparna Anand
15505,Zulmi,(1999),129 min,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani
15506,Zulmi Raj,(2005),,Action,,,Kiran Thej,Sangeeta Tiwari,,
15507,Zulmi Shikari,(1988),,Action,,,,,,
15508,Zulm-O-Sitam,(1998),130 min,"Action, Drama",6.2,20.0,K.C. Bokadia,Dharmendra,Jaya Prada,Arjun Sarja


In [417]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 15509 entries, 0 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      15509 non-null  object 
 1   Year      14981 non-null  object 
 2   Duration  7240 non-null   object 
 3   Genre     13632 non-null  object 
 4   Rating    7919 non-null   float64
 5   Votes     7920 non-null   object 
 6   Director  14984 non-null  object 
 7   Actor 1   13892 non-null  object 
 8   Actor 2   13125 non-null  object 
 9   Actor 3   12365 non-null  object 
dtypes: float64(1), object(9)
memory usage: 1.2+ MB


In [418]:
df.isna().sum()/df.shape[0]*100


Name         0.000000
Year         3.404475
Duration    53.317429
Genre       12.102650
Rating      48.939326
Votes       48.932878
Director     3.385131
Actor 1     10.426204
Actor 2     15.371720
Actor 3     20.272100
dtype: float64

In [419]:
df.dtypes

Name         object
Year         object
Duration     object
Genre        object
Rating      float64
Votes        object
Director     object
Actor 1      object
Actor 2      object
Actor 3      object
dtype: object

**YEAR COLUMN**

In [420]:
df['Year']=df['Year'].str.replace(r'\(|\)','',regex=True) #replace the() wiht ''
df.dropna(subset='Year',inplace=True)
df['Year']=df['Year'].astype(int)
print(df['Year'].dtype)

int32


**Name Column**

In [421]:
df['Name'] = df['Name'].str.extract('([A-Za-z\s\'\-]+)')


**Votes column**

In [422]:
df['Votes']=pd.to_numeric(df['Votes'].str.replace(',',''),errors='coerce')

df['Votes'].dtype


dtype('float64')

**Dealing with null values**

In [423]:
df.dropna(subset=['Name', 'Year', 'Duration', 'Rating', 'Votes', 'Director', 'Actor 1', 'Actor 2', 'Actor 3'], inplace=True)


**replacing nullvalues with the mode in Genre**

In [424]:
df['Genre'].fillna(df['Genre'].mode()[0],inplace=True)


In [425]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 5681 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   Name      5681 non-null   object 
 1   Year      5681 non-null   int32  
 2   Duration  5681 non-null   object 
 3   Genre     5681 non-null   object 
 4   Rating    5681 non-null   float64
 5   Votes     5681 non-null   float64
 6   Director  5681 non-null   object 
 7   Actor 1   5681 non-null   object 
 8   Actor 2   5681 non-null   object 
 9   Actor 3   5681 non-null   object 
dtypes: float64(2), int32(1), object(7)
memory usage: 466.0+ KB


In [426]:
df.isna().sum()/df.shape[0]*100


Name        0.0
Year        0.0
Duration    0.0
Genre       0.0
Rating      0.0
Votes       0.0
Director    0.0
Actor 1     0.0
Actor 2     0.0
Actor 3     0.0
dtype: float64

In [427]:
df.head(2)

Unnamed: 0,Name,Year,Duration,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,2019,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,Yaaram,2019,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor


In [428]:
df.drop_duplicates(inplace=True)

In [429]:
df = df.rename(columns={'Duration': 'Duration m'})


In [430]:
df

Unnamed: 0,Name,Year,Duration m,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,2019,109 min,Drama,7.0,8.0,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,Yaaram,2019,110 min,"Comedy, Romance",4.4,35.0,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,1997,147 min,"Comedy, Drama, Musical",4.7,827.0,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,Yahaan,2005,142 min,"Drama, Romance, War",7.4,1086.0,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,A Question Mark,2012,82 min,"Horror, Mystery, Thriller",5.6,326.0,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,2015,115 min,Drama,6.1,408.0,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana
15494,Zubeidaa,2001,153 min,"Biography, Drama, History",6.2,1496.0,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee
15503,Zulm Ki Zanjeer,1989,125 min,"Action, Crime, Drama",5.8,44.0,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15505,Zulmi,1999,129 min,"Action, Drama",4.5,655.0,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [431]:
df['Votes']=df['Votes'].astype(int)

In [432]:
df['Duration m']=pd.to_numeric(df['Duration m'].str.replace('min',''),errors='coerce')


In [433]:
df.isnull().sum()

Name          0
Year          0
Duration m    0
Genre         0
Rating        0
Votes         0
Director      0
Actor 1       0
Actor 2       0
Actor 3       0
dtype: int64

In [434]:
df.head()

Unnamed: 0,Name,Year,Duration m,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,Yaaram,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,Yahaan,2005,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia


**EDA PROCESS**

In [435]:
duplicate = df.groupby(['Name', 'Year']).filter(lambda x: len(x) > 1)
duplicate.head(5)

Unnamed: 0,Name,Year,Duration m,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
6062,India's Daughter,2015,63,"Documentary, Biography, Crime",8.2,1998,Leslee Udwin,Asha Devi,Badri Singh,Satendra
6063,India's Daughter,2015,120,"Action, Crime, Drama",6.9,21,Rupesh Paul,Omkar Das Manikpuri,Amann Grewal,Vishal Om Prakash
6217,ISIS,2017,92,Action,6.4,18,Yuvraj Kumar,Amrityan,Moshin Bhat,Snighdadeep Chatterji
6219,ISIS,2017,135,Action,7.5,8,Yuvraj Kumar,Yuvraj Kumar,Manon Faure,Rasheed Naz
8401,M,2014,118,"Drama, Romance",3.9,14,Suresh Jain,Seema Dogra,Paras Sharma,Sushma Sharma


In [436]:
df.drop_duplicates(subset=['Name'], keep=False,inplace=True)


In [437]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 4990 entries, 1 to 15508
Data columns (total 10 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   Name        4990 non-null   object 
 1   Year        4990 non-null   int32  
 2   Duration m  4990 non-null   int64  
 3   Genre       4990 non-null   object 
 4   Rating      4990 non-null   float64
 5   Votes       4990 non-null   int32  
 6   Director    4990 non-null   object 
 7   Actor 1     4990 non-null   object 
 8   Actor 2     4990 non-null   object 
 9   Actor 3     4990 non-null   object 
dtypes: float64(1), int32(2), int64(1), object(6)
memory usage: 389.8+ KB


In [438]:
df.head(2)

Unnamed: 0,Name,Year,Duration m,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,Yaaram,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor


**The most productive year**
 

In [439]:
df.describe() #numerical values

Unnamed: 0,Year,Duration m,Rating,Votes
count,4990.0,4990.0,4990.0,4990.0
mean,1997.270942,132.425251,5.900561,2697.899399
std,19.460295,25.452874,1.382698,13991.69518
min,1931.0,21.0,1.1,5.0
25%,1984.0,117.0,5.0,29.0
50%,2003.0,134.0,6.1,124.0
75%,2014.0,149.75,6.9,908.75
max,2021.0,321.0,10.0,591417.0


In [440]:
df.describe(include='O') #string values 

Unnamed: 0,Name,Genre,Director,Actor 1,Actor 2,Actor 3
count,4990,4990,4990,4990,4990,4990
unique,4990,353,2331,1876,2225,2428
top,Gadhvi,Drama,David Dhawan,Jeetendra,Rekha,Pran
freq,1,789,32,82,44,46


In [441]:
colors = ['#6990AD']

In [442]:
df['Year'].max()

2021

In [443]:
df['Year']=df['Year'].astype(int)

**Year**

In [444]:
color = ["#FF6633", "#FFB399", "#FF33FF", "#FFFF99", "#00B3E6"]
colors = ["#FF33FF"]
fig=px.histogram(df,x='Year',histnorm='probability',color_discrete_sequence=colors)
fig.update_layout(title=' IMDB year', title_x=0.2, title_font=dict(size=20), 
                  xaxis_title='Year', yaxis_title='Probability ', 
                  xaxis=dict(showgrid=False), yaxis=dict(showgrid=False),
                    bargap=0.03, 
                    plot_bgcolor = 'slategrey')
fig


**Duration**

In [445]:

fig2 = px.histogram(df, x='Duration m', color='Genre', nbins=60, histnorm='probability')
fig2.update_layout(
    title='IMDB Duration', title_x=0.2, title_font=dict(size=20),
    xaxis_title='Duration', yaxis_title='Probability',
    xaxis=dict(showgrid=False), yaxis=dict(showgrid=False),
    bargap=0.02,
    plot_bgcolor='slategrey'
)

In [446]:
yearavg=df.groupby(['Year','Name'])['Rating'].mean().reset_index()

In [447]:
yearavg2=df.groupby(['Year'])['Rating'].mean().reset_index()

In [448]:
figx=px.line(yearavg2,x='Year',y='Rating', color_discrete_sequence=color)
figx.update_layout(title='Rating trend', title_x=0.5,
                    title_pad=dict(t=20),
                      title_font=dict(size=20), 
                      xaxis_title='Year', yaxis_title='Rating',
                        xaxis=dict(showgrid=False), 
                        yaxis=dict(showgrid=False), 
                        plot_bgcolor = 'white')


In [449]:
yearavg.max()

Year           2021
Name      th August
Rating         10.0
dtype: object

In [450]:
top10=yearavg.nlargest(10,'Rating')
top10

Unnamed: 0,Year,Name,Rating
4869,2020,Love Qubool Hai,10.0
4942,2021,Half Songs,9.7
4657,2019,Gho Gho Rani,9.4
4915,2020,The Reluctant Crime,9.4
4945,2021,June,9.4
4404,2018,Ashok Vatika,9.3
4658,2019,God of gods,9.3
4760,2019,Sindhustan,9.3
4934,2021,Baikunth,9.3
4951,2021,Love Sorries,9.3


In [451]:
top10['Year']=top10['Year'].astype(str)

In [452]:
top10['Year'].dtype

dtype('O')

In [453]:
fig=px.bar(top10,x='Year',y='Rating',color='Name',pattern_shape='Name')
fig.show()

In [454]:
df

Unnamed: 0,Name,Year,Duration m,Genre,Rating,Votes,Director,Actor 1,Actor 2,Actor 3
1,Gadhvi,2019,109,Drama,7.0,8,Gaurav Bakshi,Rasika Dugal,Vivek Ghamande,Arvind Jangid
3,Yaaram,2019,110,"Comedy, Romance",4.4,35,Ovais Khan,Prateik,Ishita Raj,Siddhant Kapoor
5,Aur Pyaar Ho Gaya,1997,147,"Comedy, Drama, Musical",4.7,827,Rahul Rawail,Bobby Deol,Aishwarya Rai Bachchan,Shammi Kapoor
6,Yahaan,2005,142,"Drama, Romance, War",7.4,1086,Shoojit Sircar,Jimmy Sheirgill,Minissha Lamba,Yashpal Sharma
8,A Question Mark,2012,82,"Horror, Mystery, Thriller",5.6,326,Allyson Patel,Yash Dave,Muntazir Ahmad,Kiran Bhatia
...,...,...,...,...,...,...,...,...,...,...
15493,Zubaan,2015,115,Drama,6.1,408,Mozez Singh,Vicky Kaushal,Sarah Jane Dias,Raaghavv Chanana
15494,Zubeidaa,2001,153,"Biography, Drama, History",6.2,1496,Shyam Benegal,Karisma Kapoor,Rekha,Manoj Bajpayee
15503,Zulm Ki Zanjeer,1989,125,"Action, Crime, Drama",5.8,44,S.P. Muthuraman,Chiranjeevi,Jayamalini,Rajinikanth
15505,Zulmi,1999,129,"Action, Drama",4.5,655,Kuku Kohli,Akshay Kumar,Twinkle Khanna,Aruna Irani


In [455]:
#encoding the data using the mean method for each  string char!
genret = df.groupby('Genre')['Rating'].transform('mean')


In [456]:
directort= df.groupby('Director')['Rating'].transform('mean')

actor1t= df.groupby('Actor 1')['Rating'].transform('mean')

actor2t= df.groupby('Actor 2')['Rating'].transform('mean')

actor3t= df.groupby('Actor 3')['Rating'].transform('mean')


In [457]:
df['directort']=directort
df['actor1t']=actor1t
df['actor2t']=actor2t
df['actor3t']=actor3t
df['genret']=genret

In [458]:
from sklearn.metrics import mean_squared_error,accuracy_score,r2_score
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor

In [459]:
X = df[[ 'Year', 'Votes', 'Duration m', 'genret','directort','actor1t', 'actor2t', 'actor3t']]
y=df['Rating']


In [460]:
x_train,x_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=22)
print(f'x_train shape : {x_train.shape}')
print(f'y_train shape : {y_train.shape}')

x_train shape : (3992, 8)
y_train shape : (3992,)


In [461]:
lg=LinearRegression()
lg.fit(x_train,y_train)
predicted=lg.predict(x_test)

In [462]:
predicted

array([8.12411418, 6.61034396, 9.44686319, 7.64261303, 4.82752051,
       8.94259497, 5.65912188, 6.40235763, 6.10990288, 6.36045654,
       6.0503712 , 3.96193853, 6.68577399, 5.97813532, 7.08493201,
       8.41106653, 6.9095181 , 5.61305333, 5.58125175, 4.47899074,
       5.43037597, 4.79426151, 5.20032323, 5.30626841, 5.89162922,
       3.31661776, 5.57980404, 6.78368581, 8.01960965, 7.51362527,
       5.46219955, 6.69845947, 4.99856062, 7.21174662, 6.05895686,
       7.33136777, 6.75514705, 6.80850368, 6.88204684, 7.09424691,
       6.33128931, 6.99079825, 7.4781985 , 6.68767735, 5.74044847,
       6.52955332, 4.97198576, 6.87466595, 3.53285137, 4.53190341,
       5.83994356, 3.67944192, 7.10930313, 6.91303624, 7.50956931,
       5.98377833, 5.55588016, 5.27597206, 8.70703231, 6.40687371,
       6.65024899, 5.14909426, 6.51974141, 7.2220945 , 2.0624695 ,
       5.35512529, 5.08673456, 6.68092158, 4.0998348 , 3.46229305,
       5.87231143, 7.82462544, 5.64264519, 7.04230819, 6.83057

In [463]:
error=mean_squared_error(y_test,predicted)
acc=r2_score(y_test,predicted)

In [464]:
error,acc

(0.4186162851087406, 0.7950685340277919)

**DE TREE CLASSIFIER**

In [465]:
x_train,x_test,y_train,y_test=train_test_split(X,y,train_size=0.8,random_state=42)
print(f'x_train shape : {x_train.shape}')
print(f'y_train shape : {y_train.shape}')

x_train shape : (3992, 8)
y_train shape : (3992,)


In [466]:
dt = DecisionTreeRegressor(random_state=42)
dt.fit(x_train,y_train)
predicted=dt.predict(x_test)

In [467]:
error=mean_squared_error(y_test,predicted)
acc=r2_score(y_test,predicted)

In [469]:
error,acc

(0.6443486973947895, 0.6716886081270588)

**MULTIPLE REGRESSION SUITABLE THAN DECISION TREE CLASSIFIER** 