In [438]:
import warnings
# Install a global filter that ignores every FutureWarning raised from here on.
# NOTE(review): this also hides deprecation notices that may matter when the
# libraries are upgraded — confirm the blanket suppression is wanted.
warnings.simplefilter(action='ignore', category=FutureWarning)

In [439]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

# loading the data

In [440]:
# Read Movies.csv (relative path, resolved against the current working
# directory) into the working DataFrame `df`.
df=pd.read_csv("Movies.csv")

In [441]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16744 entries, 0 to 16743
Data columns (total 17 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Unnamed: 0       16744 non-null  int64  
 1   ID               16744 non-null  int64  
 2   Title            16744 non-null  object 
 3   Year             16744 non-null  int64  
 4   Age              7354 non-null   object 
 5   IMDb             16173 non-null  float64
 6   Rotten Tomatoes  5158 non-null   object 
 7   Netflix          16744 non-null  int64  
 8   Hulu             16744 non-null  int64  
 9   Prime Video      16744 non-null  int64  
 10  Disney+          16744 non-null  int64  
 11  Type             16744 non-null  int64  
 12  Directors        16018 non-null  object 
 13  Genres           16469 non-null  object 
 14  Country          16309 non-null  object 
 15  Language         16130 non-null  object 
 16  Runtime          16152 non-null  float64
dtypes: float64(2), int64(8), object(7)

# Data cleaning

In [442]:
# True if at least one row is an exact duplicate (all columns equal) of an
# earlier row; False means no fully duplicated rows exist.
df.duplicated().any()

np.False_

In [443]:
# Replace missing values in the free-text columns with the placeholder
# "Unknown"; all other columns are left as they are.
df = df.fillna({
    "Directors": "Unknown",
    "Genres": "Unknown",
    "Language": "Unknown",
    "Country": "Unknown",
})

In [444]:
# Impute missing runtimes with the column median (computed over the
# non-missing values of the frame as it stands at this point).
median_runtime = df["Runtime"].median()
df = df.assign(Runtime=df["Runtime"].fillna(median_runtime))

In [445]:
# Remove columns that are not used in the rest of the notebook.
# 'Rotten Tomatoes' is populated for only 5158 of 16744 rows (see df.info() above).
# NOTE(review): 'Unnamed: 0' is presumably a saved index from the CSV export — confirm.
df=df.drop(columns=["Unnamed: 0",'Type','ID','Rotten Tomatoes'])

In [446]:
# Discard rows lacking an age rating or an IMDb score, then renumber the
# remaining rows 0..n-1 without keeping the old index as a column.
df = (
    df
    .dropna(subset=['Age', 'IMDb'])
    .reset_index(drop=True)
)

In [447]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7316 entries, 0 to 7315
Data columns (total 13 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   Title        7316 non-null   object 
 1   Year         7316 non-null   int64  
 2   Age          7316 non-null   object 
 3   IMDb         7316 non-null   float64
 4   Netflix      7316 non-null   int64  
 5   Hulu         7316 non-null   int64  
 6   Prime Video  7316 non-null   int64  
 7   Disney+      7316 non-null   int64  
 8   Directors    7316 non-null   object 
 9   Genres       7316 non-null   object 
 10  Country      7316 non-null   object 
 11  Language     7316 non-null   object 
 12  Runtime      7316 non-null   float64
dtypes: float64(2), int64(5), object(6)
memory usage: 743.2+ KB


# Data Transformation / Feature Engineering

In [448]:
def run_time(x, short_max=90, long_max=200):
    """Classify a record by the value of its ``'Runtime'`` entry.

    Parameters
    ----------
    x : mapping or pandas.Series
        Record exposing a numeric ``'Runtime'`` entry, e.g. a DataFrame row
        when called through ``df.apply(run_time, axis=1)``.
    short_max : float, default 90
        Largest runtime (inclusive) labelled ``"Short Movie"``.
    long_max : float, default 200
        Largest runtime (inclusive) labelled ``"Long Movie"``.

    Returns
    -------
    str
        ``"Short Movie"`` if ``Runtime <= short_max``, ``"Long Movie"`` if
        ``Runtime <= long_max``, otherwise ``"Series"``.

    Notes
    -----
    A NaN runtime fails both comparisons and is therefore labelled
    ``"Series"``; fill or drop missing runtimes before calling this.
    """
    runtime = x['Runtime']
    if runtime <= short_max:
        return "Short Movie"
    if runtime <= long_max:
        return "Long Movie"
    # Everything above long_max (and NaN, see Notes) lands here.
    return "Series"

# Label every row with its runtime category by calling run_time once per row
# (axis=1 passes each row to the function as a Series).
df["Runtime_type"] = df.apply(run_time,axis=1)


In [449]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7316 entries, 0 to 7315
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   Title         7316 non-null   object 
 1   Year          7316 non-null   int64  
 2   Age           7316 non-null   object 
 3   IMDb          7316 non-null   float64
 4   Netflix       7316 non-null   int64  
 5   Hulu          7316 non-null   int64  
 6   Prime Video   7316 non-null   int64  
 7   Disney+       7316 non-null   int64  
 8   Directors     7316 non-null   object 
 9   Genres        7316 non-null   object 
 10  Country       7316 non-null   object 
 11  Language      7316 non-null   object 
 12  Runtime       7316 non-null   float64
 13  Runtime_type  7316 non-null   object 
dtypes: float64(2), int64(5), object(7)
memory usage: 800.3+ KB


In [450]:
df.columns[[4,5,6,7]]

Index(['Netflix', 'Hulu', 'Prime Video', 'Disney+'], dtype='object')

In [451]:
# Reshape from one row per title (wide) to one row per (title, platform) pair
# (long): 'OTT' holds the platform name and 'Availability' its flag value.
# The platform columns are selected by name rather than by position so the
# reshape stays correct if columns are added, dropped or reordered upstream.
plt_df = pd.melt(
    df,
    id_vars=['Title', 'Age', 'IMDb'],
    value_vars=['Netflix', 'Hulu', 'Prime Video', 'Disney+'],
    var_name='OTT',
    value_name='Availability',
)
plt_df

Unnamed: 0,Title,Age,IMDb,OTT,Availability
0,Inception,13+,8.8,Netflix,1
1,The Matrix,18+,8.7,Netflix,1
2,Avengers: Infinity War,13+,8.5,Netflix,1
3,Back to the Future,7+,8.5,Netflix,1
4,"The Good, the Bad and the Ugly",18+,8.8,Netflix,1
...,...,...,...,...,...
29259,Richie Rich's Christmas Wish,all,4.1,Disney+,1
29260,The Jungle Book: Mowgli's Story,all,4.2,Disney+,1
29261,The Bears and I,all,6.2,Disney+,1
29262,Whispers: An Elephant's Tale,all,5.0,Disney+,1


In [452]:
# Keep only the (title, platform) pairs flagged as available, then discard the
# flag column, which is constant after the filter.
is_available = plt_df['Availability'] == 1
plt_df = plt_df[is_available].drop(columns='Availability')

In [453]:
# Order rows by Title and rebuild a 0..n-1 index (old index discarded).
# The hard-coded row labels in the title fix-up cell further down refer to
# this exact ordering.
plt_df=plt_df.sort_values(by=['Title']).reset_index(drop=True)

In [454]:
plt_df

Unnamed: 0,Title,Age,IMDb,OTT
0,#IMomSoHard Live,18+,5.2,Prime Video
1,#LoveSwag,13+,5.4,Prime Video
2,#cats_the_mewvie,16+,5.2,Netflix
3,$elfie Shootout,16+,3.4,Prime Video
4,...And Your Name Is Jonah,7+,7.1,Prime Video
...,...,...,...,...
7703,Æon Flux,13+,5.4,Hulu
7704,İncir Reçeli 2,13+,6.1,Netflix
7705,审死官,7+,6.9,Netflix
7706,审死官,7+,6.9,Prime Video


In [455]:
# Manual title fix-ups, addressed by row label in the sorted frame.
# Rows 7705 and 7706 show the title '审死官' in the preceding output; both are
# renamed to 'Justice, My Foot'.
# NOTE(review): these labels are only valid for this exact data and sort
# order — selecting rows by their original title would be more robust.
plt_df.loc[[7705,7706],'Title']='Justice, My Foot'
# NOTE(review): row 7707 is not visible in the preceding output, so its
# original title (and whether the lowercase 'w' is intended) can't be
# verified from here — confirm.
plt_df.loc[[7707],'Title']="water Dance"

In [456]:
# Keep only ASCII letters, digits, spaces and commas in each title; every
# other character is deleted.
# NOTE(review): non-ASCII letters and symbols are dropped, not transliterated
# (e.g. 'Æon Flux' -> 'on Flux', '$elfie Shootout' -> 'elfie Shootout' in the
# output below) — confirm that this loss is acceptable.
plt_df['Title']=plt_df['Title'].str.replace('[^A-Za-z0-9 ,]','',regex=True)

In [457]:
plt_df

Unnamed: 0,Title,Age,IMDb,OTT
0,IMomSoHard Live,18+,5.2,Prime Video
1,LoveSwag,13+,5.4,Prime Video
2,catsthemewvie,16+,5.2,Netflix
3,elfie Shootout,16+,3.4,Prime Video
4,And Your Name Is Jonah,7+,7.1,Prime Video
...,...,...,...,...
7703,on Flux,13+,5.4,Hulu
7704,ncir Reeli 2,13+,6.1,Netflix
7705,"Justice, My Foot",7+,6.9,Netflix
7706,"Justice, My Foot",7+,6.9,Prime Video


In [458]:
df

Unnamed: 0,Title,Year,Age,IMDb,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Language,Runtime,Runtime_type
0,Inception,2010,13+,8.8,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0,Long Movie
1,The Matrix,1999,18+,8.7,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0,Long Movie
2,Avengers: Infinity War,2018,13+,8.5,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0,Long Movie
3,Back to the Future,1985,7+,8.5,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0,Long Movie
4,"The Good, the Bad and the Ugly",1966,18+,8.8,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0,Long Movie
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7311,Richie Rich's Christmas Wish,1998,all,4.1,0,0,0,1,John Murlowski,"Comedy,Family",United States,English,84.0,Short Movie
7312,The Jungle Book: Mowgli's Story,1998,all,4.2,0,0,0,1,Nick Marck,"Adventure,Family",United States,English,77.0,Short Movie
7313,The Bears and I,1974,all,6.2,0,0,0,1,Bernard McEveety,"Drama,Family",United States,English,89.0,Short Movie
7314,Whispers: An Elephant's Tale,2000,all,5.0,0,0,0,1,Dereck Joubert,"Adventure,Family",United States,English,72.0,Short Movie


In [459]:
# Number of platforms each title is available on: row-wise sum of the four
# 0/1 availability columns. The columns are selected by name so the result
# does not depend on the current column order of `df`.
df['tot_plts'] = df[['Netflix', 'Hulu', 'Prime Video', 'Disney+']].sum(axis=1)

In [460]:
def len_lst(val):
    """Return ``len(val)``.

    Used below as an element-wise callback on the lists produced by
    ``Series.str.split(',')``; any object accepted by ``len`` works.
    """
    return len(val)

In [461]:
# For each comma-separated text column, count how many entries it holds by
# splitting on ',' and taking the length of the resulting list.
df = df.assign(
    No_of_genres=df['Genres'].str.split(',').map(len_lst),
    No_of_countries=df['Country'].str.split(',').map(len_lst),
    No_of_Languages=df['Language'].str.split(',').map(len_lst),
)

In [462]:
df

Unnamed: 0,Title,Year,Age,IMDb,Netflix,Hulu,Prime Video,Disney+,Directors,Genres,Country,Language,Runtime,Runtime_type,tot_plts,No_of_genres,No_of_countries,No_of_Languages
0,Inception,2010,13+,8.8,1,0,0,0,Christopher Nolan,"Action,Adventure,Sci-Fi,Thriller","United States,United Kingdom","English,Japanese,French",148.0,Long Movie,1,4,2,3
1,The Matrix,1999,18+,8.7,1,0,0,0,"Lana Wachowski,Lilly Wachowski","Action,Sci-Fi",United States,English,136.0,Long Movie,1,2,1,1
2,Avengers: Infinity War,2018,13+,8.5,1,0,0,0,"Anthony Russo,Joe Russo","Action,Adventure,Sci-Fi",United States,English,149.0,Long Movie,1,3,1,1
3,Back to the Future,1985,7+,8.5,1,0,0,0,Robert Zemeckis,"Adventure,Comedy,Sci-Fi",United States,English,116.0,Long Movie,1,3,1,1
4,"The Good, the Bad and the Ugly",1966,18+,8.8,1,0,1,0,Sergio Leone,Western,"Italy,Spain,West Germany",Italian,161.0,Long Movie,2,1,3,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7311,Richie Rich's Christmas Wish,1998,all,4.1,0,0,0,1,John Murlowski,"Comedy,Family",United States,English,84.0,Short Movie,1,2,1,1
7312,The Jungle Book: Mowgli's Story,1998,all,4.2,0,0,0,1,Nick Marck,"Adventure,Family",United States,English,77.0,Short Movie,1,2,1,1
7313,The Bears and I,1974,all,6.2,0,0,0,1,Bernard McEveety,"Drama,Family",United States,English,89.0,Short Movie,1,2,1,1
7314,Whispers: An Elephant's Tale,2000,all,5.0,0,0,0,1,Dereck Joubert,"Adventure,Family",United States,English,72.0,Short Movie,1,2,1,1


# Data Analysis

### 1. Find the number of movies available on each platform

### 2. Find the number of comedy movies available on each platform

### 3. Find the movies that are available on more than one platform, sorted in descending order of `tot_plts`

### 4. Find the average IMDb rating for each platform

### 5. Find the number of movies per age rating for each platform

### 6. Find the number of movies whose only genre is Horror

### 7. Find the number of movies that have Horror as one of several genres (i.e. not Horror only)

### 8. Find the number of movies that include Horror among their genres, with or without other genres

### 9. Which genre is most focused on the 18+ age rating?

### 10. Which platform focuses most on the 'Horror' genre?

### 11. Find the platform and age rating with the highest 'Horror' movie count

### 12. Find the top 10 directors by number of movies

### 13. Find the 10 release years with the fewest movies released

### 14. Which movie has the highest number of genres?