In [32]:
import numpy as np
import pandas as pd
import altair as alt
import seaborn as sns
import re
import matplotlib.pyplot as plt
%matplotlib inline
from wordcloud import WordCloud, STOPWORDS
stopwords = set(STOPWORDS)
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

In [33]:
df = pd.read_csv('bollywood_data.csv')
df

Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date,budget,box office
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicles...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA),25.00,342.06
1,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,War,4.1,73,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India),0.01,2.70
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Accidental_P...,The Accidental Prime Minister,The Accidental Prime Minister,0,2019,112,Biography|Drama,6.1,5549,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,11 January 2019 (USA),18.00,22.65
3,Why Cheat India,tt8108208,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Why_Cheat_India,Why Cheat India,Why Cheat India,0,2019,121,Crime|Drama,6.0,1891,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,18 January 2019 (USA),20.00,10.54
4,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,Drama,7.3,280,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays ...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India),2.75,2.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4324,Samadhi (1950 film),tt0268614,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Samadhi_(1950_film),Samadhi,Samadhi,0,1950,165,Drama,6.1,21,The story is based on the true incident at INA...,The story is based on the true incident at INA...,,Ashok Kumar|Nalini Jaywant|Kuldip Kaur|Shyam|M...,,,,
4325,Sangram (1950 film),tt0244182,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Sangram_(1950_film),Sangram,Sangram,0,1950,139,Drama,6.2,20,After the death of his wife a policeman fails ...,After the death of his wife a policeman fails ...,,Ashok Kumar|Nalini Jaywant|Nawab|Sajjan|Tiwari...,,,,
4326,Sargam (1950 film),tt0269826,,https://en.wikipedia.org/wiki/Sargam_(1950_film),Melody,Sargam,0,1950,135,Drama|Family,6.8,21,,Add a Plot »,,Raj Kapoor|Rehana|Om Prakash|David Abraham|Rad...,,4 February 1957 (Iran),,
4327,Sheesh Mahal (1950 film),tt0243555,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Sheesh_Mahal_(19...,Sheesh Mahal,Sheesh Mahal,0,1950,144,Drama,7.0,13,Thakur Jaspal Singh lives in the prestigious a...,Thakur Jaspal Singh lives in the prestigious a...,,Sohrab Modi|Naseem Banu|Pushpa Hans|Nigar Sult...,,,,


# Droping columns which are not useful

In [34]:
df = df.drop(['title_x','title_y', 'imdb_id', 'poster_path', 'wiki_link', 'is_adult', 'tagline', 'release_date', 'story', 'summary'], axis=1)

# Renaming columns for ease of references

In [35]:
df = df.rename(columns={'original_title':'title', 'year_of_release':'year', 'imdb_rating':'rating', 'imdb_rating':'rating', 'imdb_votes':'votes', 'wins_nominations':'awards'})

# Data cleaning

In [36]:
df['awards'] = df['awards'].apply(lambda x : re.sub(r'[A-Za-z]', '', str(x)))

df[['wins', 'nominations']] = df.awards.str.split('&', expand= True)

df['wins'] = df['wins'].str.strip()
df['wins'] = df['wins'].apply(lambda x: 0 if x=='' else x)
df['wins'] = df['wins'].replace(np.nan, 0)
df['wins'] = df['wins'].astype(int)

df['runtime'] = df['runtime'].replace('\\N', np.nan)
df['runtime'] = df['runtime'].ffill()
df['runtime'] = df['runtime'].astype(int)
df = df[df['runtime'] >= 60]

actors = df.actors.str.split('|', expand=True)
df['lead_actor'] = actors[0]

df[['genre', 'genre2', 'genre3']] = df.genres.str.split("|", expand=True)
df[['lead_actor1', 'lead_actor2', 'lead_actor3', 'lead_actor4']] = df.actors.str.split("|", expand=True)[[0,1,2,3]]
df.drop(['genres', 'awards', 'actors', 'genre2', 'genre3', 'nominations'], axis=1)
minor_genre_list = ['Animation', 'Documentary', 'History', 'Music', 'War', 'Sci-Fi', 'Sport']
df = df[~df['genre'].isin(minor_genre_list)]

df = df.drop(['awards', 'genres','lead_actor','actors'], axis=1)
df = df[df['year'] >= 2009]
df = df.drop_duplicates()

In [37]:
df['nominations'] = df['nominations'].fillna(0)
df['genre'] = df['genre'].fillna(0)
df['genre2'] = df['genre2'].fillna(0)
df['genre3'] = df['genre3'].fillna(0)


In [38]:
d = {df.lead_actor1.unique()[i] : i for i in range(len(df.lead_actor1.unique()))}
actor_list = list(d.items())
actor_list1 = np.array(actor_list).reshape(-1,2)
df["NumLeadActor"] = [d[i] for i in df['lead_actor1']]
df

Unnamed: 0,title,year,runtime,rating,votes,budget,box office,wins,nominations,genre,genre2,genre3,lead_actor1,lead_actor2,lead_actor3,lead_actor4,NumLeadActor
0,Uri: The Surgical Strike,2019,138,8.4,35112,25.00,342.06,4,0,Action,Drama,War,Vicky Kaushal,Paresh Rawal,Mohit Raina,Yami Gautam,0
2,The Accidental Prime Minister,2019,112,6.1,5549,18.00,22.65,0,0,Biography,Drama,0,Anupam Kher,Akshaye Khanna,Aahana Kumra,Atul Sharma,1
3,Why Cheat India,2019,121,6.0,1891,20.00,10.54,0,0,Crime,Drama,0,Emraan Hashmi,Shreya Dhanwanthary,Snighdadeep Chatterji,Navneet Srivastava,2
4,Evening Shadows,2018,102,7.3,280,2.75,2.50,17,1,Drama,0,0,Mona Ambegaonkar,Ananth Narayan Mahadevan,Devansh Doshi,Arpit Chaudhary,3
5,Soni,2018,97,7.2,1595,0.50,0.10,3,5,Drama,0,0,Geetika Vidya Ohlyan,Saloni Batra,Vikas Shukla,Mohit Chauhan,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,Chandni Chowk to China,2009,154,4.0,7266,80.00,120.00,1,3,Action,Comedy,0,Mithun Chakraborty,Akshay Kumar,Deepika Padukone,Ranvir Shorey,345
970,Familywala,2014,180,5.8,57,6.00,3.00,0,0,Comedy,Drama,Romance,Arjun Rampal,Dia Mirza,Ashok Saraf,Shoma Anand,143
971,Sunglass,2013,100,6.5,24,8.00,5.00,0,0,Comedy,Drama,Thriller,Jaya Bachchan,Madhavan,Tota Roy Chowdhury,Raima Sen,446
972,Sabse Bada Sukh,2018,100,6.1,13,10.00,7.00,0,0,Comedy,Drama,0,Vijay Arora,Asrani,Rajni Bala,Kumud Damle,447


In [39]:
pd.to_datetime(df.year, format='%Y')
df['NumLeadActor'] = df['NumLeadActor'].astype(str)

In [40]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 961 entries, 0 to 973
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         961 non-null    object 
 1   year          961 non-null    int64  
 2   runtime       961 non-null    int32  
 3   rating        961 non-null    float64
 4   votes         961 non-null    int64  
 5   budget        961 non-null    float64
 6   box office    961 non-null    float64
 7   wins          961 non-null    int32  
 8   nominations   961 non-null    object 
 9   genre         961 non-null    object 
 10  genre2        961 non-null    object 
 11  genre3        961 non-null    object 
 12  lead_actor1   960 non-null    object 
 13  lead_actor2   960 non-null    object 
 14  lead_actor3   943 non-null    object 
 15  lead_actor4   906 non-null    object 
 16  NumLeadActor  961 non-null    object 
dtypes: float64(3), int32(2), int64(2), object(10)
memory usage: 127.6+ KB


In [41]:
df.describe()

Unnamed: 0,year,runtime,rating,votes,budget,box office,wins
count,961.0,961.0,961.0,961.0,961.0,961.0,961.0
mean,2014.082206,128.284079,5.657128,6234.078044,24.970583,49.955169,2.423517
std,2.991743,21.707992,1.587672,16068.517719,33.602508,116.521245,6.109814
min,2009.0,79.0,0.0,0.0,0.07,0.0008,0.0
25%,2012.0,114.0,4.6,270.0,5.25,1.0,0.0
50%,2014.0,128.0,5.7,1373.0,14.0,9.99,0.0
75%,2017.0,140.0,6.9,5524.0,30.0,49.0,2.0
max,2019.0,321.0,9.4,310481.0,350.0,2100.0,57.0


In [42]:
df.corr()

Unnamed: 0,year,runtime,rating,votes,budget,box office,wins
year,1.0,-0.023825,0.094429,-0.023959,0.143226,0.120566,-0.067442
runtime,-0.023825,1.0,0.031593,0.322358,0.42252,0.320297,0.235139
rating,0.094429,0.031593,1.0,0.307923,0.004134,0.18398,0.309701
votes,-0.023959,0.322358,0.307923,1.0,0.337082,0.573959,0.631573
budget,0.143226,0.42252,0.004134,0.337082,1.0,0.588879,0.284765
box office,0.120566,0.320297,0.18398,0.573959,0.588879,1.0,0.454319
wins,-0.067442,0.235139,0.309701,0.631573,0.284765,0.454319,1.0


In [43]:
df

Unnamed: 0,title,year,runtime,rating,votes,budget,box office,wins,nominations,genre,genre2,genre3,lead_actor1,lead_actor2,lead_actor3,lead_actor4,NumLeadActor
0,Uri: The Surgical Strike,2019,138,8.4,35112,25.00,342.06,4,0,Action,Drama,War,Vicky Kaushal,Paresh Rawal,Mohit Raina,Yami Gautam,0
2,The Accidental Prime Minister,2019,112,6.1,5549,18.00,22.65,0,0,Biography,Drama,0,Anupam Kher,Akshaye Khanna,Aahana Kumra,Atul Sharma,1
3,Why Cheat India,2019,121,6.0,1891,20.00,10.54,0,0,Crime,Drama,0,Emraan Hashmi,Shreya Dhanwanthary,Snighdadeep Chatterji,Navneet Srivastava,2
4,Evening Shadows,2018,102,7.3,280,2.75,2.50,17,1,Drama,0,0,Mona Ambegaonkar,Ananth Narayan Mahadevan,Devansh Doshi,Arpit Chaudhary,3
5,Soni,2018,97,7.2,1595,0.50,0.10,3,5,Drama,0,0,Geetika Vidya Ohlyan,Saloni Batra,Vikas Shukla,Mohit Chauhan,4
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
969,Chandni Chowk to China,2009,154,4.0,7266,80.00,120.00,1,3,Action,Comedy,0,Mithun Chakraborty,Akshay Kumar,Deepika Padukone,Ranvir Shorey,345
970,Familywala,2014,180,5.8,57,6.00,3.00,0,0,Comedy,Drama,Romance,Arjun Rampal,Dia Mirza,Ashok Saraf,Shoma Anand,143
971,Sunglass,2013,100,6.5,24,8.00,5.00,0,0,Comedy,Drama,Thriller,Jaya Bachchan,Madhavan,Tota Roy Chowdhury,Raima Sen,446
972,Sabse Bada Sukh,2018,100,6.1,13,10.00,7.00,0,0,Comedy,Drama,0,Vijay Arora,Asrani,Rajni Bala,Kumud Damle,447


In [44]:
df1 = df.loc[df['box office'].notnull()]

x = np.array(df1['NumLeadActor']).reshape(-1, 1)
y = df1['box office']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size = 1/3, random_state=0)


x_train.shape, y_train.shape, x_test.shape, y_test.shape

((640, 1), (640,), (321, 1), (321,))

In [45]:
regressor = LinearRegression()
regressor.fit(x_train, y_train)

LinearRegression()

In [46]:
y_pred = regressor.predict(x_test)
print('Accuracy of linear regression classifier on test set : {:.9f} '.format(regressor.score(x_test, y_test)))

Accuracy of linear regression classifier on test set : 0.023880443 


# Handling Missing Values

In [47]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 961 entries, 0 to 973
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   title         961 non-null    object 
 1   year          961 non-null    int64  
 2   runtime       961 non-null    int32  
 3   rating        961 non-null    float64
 4   votes         961 non-null    int64  
 5   budget        961 non-null    float64
 6   box office    961 non-null    float64
 7   wins          961 non-null    int32  
 8   nominations   961 non-null    object 
 9   genre         961 non-null    object 
 10  genre2        961 non-null    object 
 11  genre3        961 non-null    object 
 12  lead_actor1   960 non-null    object 
 13  lead_actor2   960 non-null    object 
 14  lead_actor3   943 non-null    object 
 15  lead_actor4   906 non-null    object 
 16  NumLeadActor  961 non-null    object 
dtypes: float64(3), int32(2), int64(2), object(10)
memory usage: 127.6+ KB


In [48]:
df.shape

(961, 17)

In [49]:
df.isnull().sum()

title            0
year             0
runtime          0
rating           0
votes            0
budget           0
box office       0
wins             0
nominations      0
genre            0
genre2           0
genre3           0
lead_actor1      1
lead_actor2      1
lead_actor3     18
lead_actor4     55
NumLeadActor     0
dtype: int64

# Genre splitting

In [89]:
genres_df = pd.read_csv('bollywood_data.csv')
genres_df


Unnamed: 0,title_x,imdb_id,poster_path,wiki_link,title_y,original_title,is_adult,year_of_release,runtime,genres,imdb_rating,imdb_votes,story,summary,tagline,actors,wins_nominations,release_date,budget,box office
0,Uri: The Surgical Strike,tt8291224,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Uri:_The_Surgica...,Uri: The Surgical Strike,Uri: The Surgical Strike,0,2019,138,Action|Drama|War,8.4,35112,Divided over five chapters the film chronicles...,Indian army special forces execute a covert op...,,Vicky Kaushal|Paresh Rawal|Mohit Raina|Yami Ga...,4 wins,11 January 2019 (USA),25.00,342.06
1,Battalion 609,tt9472208,,https://en.wikipedia.org/wiki/Battalion_609,Battalion 609,Battalion 609,0,2019,131,War,4.1,73,The story revolves around a cricket match betw...,The story of Battalion 609 revolves around a c...,,Vicky Ahuja|Shoaib Ibrahim|Shrikant Kamat|Elen...,,11 January 2019 (India),0.01,2.70
2,The Accidental Prime Minister (film),tt6986710,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/The_Accidental_P...,The Accidental Prime Minister,The Accidental Prime Minister,0,2019,112,Biography|Drama,6.1,5549,Based on the memoir by Indian policy analyst S...,Explores Manmohan Singh's tenure as the Prime ...,,Anupam Kher|Akshaye Khanna|Aahana Kumra|Atul S...,,11 January 2019 (USA),18.00,22.65
3,Why Cheat India,tt8108208,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Why_Cheat_India,Why Cheat India,Why Cheat India,0,2019,121,Crime|Drama,6.0,1891,The movie focuses on existing malpractices in ...,The movie focuses on existing malpractices in ...,,Emraan Hashmi|Shreya Dhanwanthary|Snighdadeep ...,,18 January 2019 (USA),20.00,10.54
4,Evening Shadows,tt6028796,,https://en.wikipedia.org/wiki/Evening_Shadows,Evening Shadows,Evening Shadows,0,2018,102,Drama,7.3,280,While gay rights and marriage equality has bee...,Under the 'Evening Shadows' truth often plays ...,,Mona Ambegaonkar|Ananth Narayan Mahadevan|Deva...,17 wins & 1 nomination,11 January 2019 (India),2.75,2.50
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4324,Samadhi (1950 film),tt0268614,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Samadhi_(1950_film),Samadhi,Samadhi,0,1950,165,Drama,6.1,21,The story is based on the true incident at INA...,The story is based on the true incident at INA...,,Ashok Kumar|Nalini Jaywant|Kuldip Kaur|Shyam|M...,,,,
4325,Sangram (1950 film),tt0244182,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Sangram_(1950_film),Sangram,Sangram,0,1950,139,Drama,6.2,20,After the death of his wife a policeman fails ...,After the death of his wife a policeman fails ...,,Ashok Kumar|Nalini Jaywant|Nawab|Sajjan|Tiwari...,,,,
4326,Sargam (1950 film),tt0269826,,https://en.wikipedia.org/wiki/Sargam_(1950_film),Melody,Sargam,0,1950,135,Drama|Family,6.8,21,,Add a Plot »,,Raj Kapoor|Rehana|Om Prakash|David Abraham|Rad...,,4 February 1957 (Iran),,
4327,Sheesh Mahal (1950 film),tt0243555,https://upload.wikimedia.org/wikipedia/en/thum...,https://en.wikipedia.org/wiki/Sheesh_Mahal_(19...,Sheesh Mahal,Sheesh Mahal,0,1950,144,Drama,7.0,13,Thakur Jaspal Singh lives in the prestigious a...,Thakur Jaspal Singh lives in the prestigious a...,,Sohrab Modi|Naseem Banu|Pushpa Hans|Nigar Sult...,,,,


In [90]:
#Droping columns which are not useful
genres_df = genres_df.drop(['title_x','title_y', 'imdb_id', 'poster_path', 'wiki_link', 'is_adult', 'tagline', 'release_date', 'story', 'summary'], axis=1)

In [91]:
#Renaming columns for ease of references
genres_df = genres_df.rename(columns={'original_title':'title', 'year_of_release':'year', 'imdb_rating':'rating', 'imdb_rating':'rating', 'imdb_votes':'votes', 'wins_nominations':'awards'})


In [96]:
#Data cleaning
genres_df['awards'] = genres_df['awards'].apply(lambda x : re.sub(r'[A-Za-z]', '', str(x)))

genres_df[['wins', 'nominations']] = genres_df.awards.str.split('&', expand= True)

genres_df['wins'] = genres_df['wins'].str.strip()
genres_df['wins'] = genres_df['wins'].apply(lambda x: 0 if x=='' else x)
genres_df['wins'] = genres_df['wins'].replace(np.nan, 0)
genres_df['wins'] =genres_df['wins'].astype(int)

genres_df['runtime'] = genres_df['runtime'].replace('\\N', np.nan)
genres_df['runtime'] = genres_df['runtime'].ffill()
genres_df['runtime'] = genres_df['runtime'].astype(int)
genres_df = genres_df[genres_df['runtime'] >= 60]

actors = genres_df.actors.str.split('|', expand=True)
genres_df['lead_actor'] = actors[0]

genres_df[['genre', 'genre2', 'genre3']] = genres_df.genres.str.split("|", expand=True)


minor_genre_list = ['Animation', 'Documentary', 'History', 'Music', 'War', 'Sci-Fi', 'Sport']
genres_df = genres_df[~genres_df['genre'].isin(minor_genre_list)]

genres_df = genres_df[genres_df['year'] >= 2009]
genres_df = genres_df.drop_duplicates()

In [97]:
gens = set()
for i in genres_df['genres']:
    for j in i.split('|'):
        gens.add(j)
gens

{'Action',
 'Adventure',
 'Animation',
 'Biography',
 'Comedy',
 'Crime',
 'Drama',
 'Family',
 'Fantasy',
 'History',
 'Horror',
 'Music',
 'Musical',
 'Mystery',
 'Romance',
 'Sci-Fi',
 'Short',
 'Sport',
 'Thriller',
 'War'}

In [98]:
genres_df = genres_df.drop(['genres', 'awards', 'actors', 'genre2', 'genre3', 'nominations'], axis=1)

In [87]:
genres_df

Unnamed: 0,title,year,runtime,rating,votes,budget,box office,wins,lead_actor,genre
0,Uri: The Surgical Strike,2019,138,8.4,35112,25.00,342.06,4,Vicky Kaushal,Action
2,The Accidental Prime Minister,2019,112,6.1,5549,18.00,22.65,0,Anupam Kher,Biography
3,Why Cheat India,2019,121,6.0,1891,20.00,10.54,0,Emraan Hashmi,Crime
4,Evening Shadows,2018,102,7.3,280,2.75,2.50,17,Mona Ambegaonkar,Drama
5,Soni,2018,97,7.2,1595,0.50,0.10,3,Geetika Vidya Ohlyan,Drama
...,...,...,...,...,...,...,...,...,...,...
969,Chandni Chowk to China,2009,154,4.0,7266,80.00,120.00,1,Mithun Chakraborty,Action
970,Familywala,2014,180,5.8,57,6.00,3.00,0,Arjun Rampal,Comedy
971,Sunglass,2013,100,6.5,24,8.00,5.00,0,Jaya Bachchan,Comedy
972,Sabse Bada Sukh,2018,100,6.1,13,10.00,7.00,0,Vijay Arora,Comedy
