In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [2]:
df = pd.read_csv("mymoviedb.csv", lineterminator = '\n')

In [3]:
df.head()

Unnamed: 0,Release_Date,Title,Overview,Popularity,Vote_Count,Vote_Average,Original_Language,Genre,Poster_Url
0,2021-12-15,Spider-Man: No Way Home,Peter Parker is unmasked and no longer able to...,5083.954,8940,8.3,en,"Action, Adventure, Science Fiction",https://image.tmdb.org/t/p/original/1g0dhYtq4i...
1,2022-03-01,The Batman,"In his second year of fighting crime, Batman u...",3827.658,1151,8.1,en,"Crime, Mystery, Thriller",https://image.tmdb.org/t/p/original/74xTEgt7R3...
2,2022-02-25,No Exit,Stranded at a rest stop in the mountains durin...,2618.087,122,6.3,en,Thriller,https://image.tmdb.org/t/p/original/vDHsLnOWKl...
3,2021-11-24,Encanto,"The tale of an extraordinary family, the Madri...",2402.201,5076,7.7,en,"Animation, Comedy, Family, Fantasy",https://image.tmdb.org/t/p/original/4j0PNHkMr5...
4,2021-12-22,The King's Man,As a collection of history's worst tyrants and...,1895.511,1793,7.0,en,"Action, Adventure, Thriller, War",https://image.tmdb.org/t/p/original/aq4Pwv5Xeu...


In [4]:
df.columns

Index(['Release_Date', 'Title', 'Overview', 'Popularity', 'Vote_Count',
       'Vote_Average', 'Original_Language', 'Genre', 'Poster_Url'],
      dtype='object')

In [5]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 9827 entries, 0 to 9826
Data columns (total 9 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   Release_Date       9827 non-null   object 
 1   Title              9827 non-null   object 
 2   Overview           9827 non-null   object 
 3   Popularity         9827 non-null   float64
 4   Vote_Count         9827 non-null   int64  
 5   Vote_Average       9827 non-null   float64
 6   Original_Language  9827 non-null   object 
 7   Genre              9827 non-null   object 
 8   Poster_Url         9827 non-null   object 
dtypes: float64(2), int64(1), object(6)
memory usage: 691.1+ KB


In [6]:
df.describe()

Unnamed: 0,Popularity,Vote_Count,Vote_Average
count,9827.0,9827.0,9827.0
mean,40.326088,1392.805536,6.439534
std,108.873998,2611.206907,1.129759
min,13.354,0.0,0.0
25%,16.1285,146.0,5.9
50%,21.199,444.0,6.5
75%,35.1915,1376.0,7.1
max,5083.954,31077.0,10.0


# convert Release_date to DATETIME Format

In [7]:
df['Release_Date'] = pd.to_datetime(df['Release_Date'])
print(df['Release_Date'])

0      2021-12-15
1      2022-03-01
2      2022-02-25
3      2021-11-24
4      2021-12-22
          ...    
9822   1973-10-15
9823   2020-10-01
9824   2016-05-06
9825   2021-03-31
9826   1984-09-23
Name: Release_Date, Length: 9827, dtype: datetime64[ns]


In [8]:
df['Genre'].head()

0    Action, Adventure, Science Fiction
1              Crime, Mystery, Thriller
2                              Thriller
3    Animation, Comedy, Family, Fantasy
4      Action, Adventure, Thriller, War
Name: Genre, dtype: object

In [9]:
df.duplicated()

0       False
1       False
2       False
3       False
4       False
        ...  
9822    False
9823    False
9824    False
9825    False
9826    False
Length: 9827, dtype: bool

In [10]:
df.duplicated().sum()

np.int64(0)

In [11]:
df.drop(['Overview','Original_Language','Poster_Url'] , axis = 1, inplace = True)

In [12]:
df

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre
0,2021-12-15,Spider-Man: No Way Home,5083.954,8940,8.3,"Action, Adventure, Science Fiction"
1,2022-03-01,The Batman,3827.658,1151,8.1,"Crime, Mystery, Thriller"
2,2022-02-25,No Exit,2618.087,122,6.3,Thriller
3,2021-11-24,Encanto,2402.201,5076,7.7,"Animation, Comedy, Family, Fantasy"
4,2021-12-22,The King's Man,1895.511,1793,7.0,"Action, Adventure, Thriller, War"
...,...,...,...,...,...,...
9822,1973-10-15,Badlands,13.357,896,7.6,"Drama, Crime"
9823,2020-10-01,Violent Delights,13.356,8,3.5,Horror
9824,2016-05-06,The Offering,13.355,94,5.0,"Mystery, Thriller, Horror"
9825,2021-03-31,The United States vs. Billie Holiday,13.354,152,6.7,"Music, Drama, History"


Exploration Summary

we have a dataframe consisting of 9827 rows and 9 columns.

our dataset looks a bit tidy with no NaNs nor duplicated values.

Release_Date column needs to be casted into date time and to extract only the year value.

Overview, Original_Language and Poster-Url wouldn't be so useful during analysis, so we'll drop them.

there is noticable outliers in Popularity column

Vote_Average bettter be categorised for proper analysis.

Genre column has comma separated values and white spaces that need to be handled and casted into category.Exploration Summary

In [13]:
df['Year'] = df['Release_Date'].dt.year

In [14]:
df['Year'].dtype

dtype('int32')

Categorizing columns in Vote_Average

In [15]:
def categorize_col(df, col, labels):
    edges = [df[col].describe()['min'],
             df[col].describe()['25%'],
             df[col].describe()['50%'],
             df[col].describe()['75%'],
             df[col].describe()['max']]
    
    df[col] = pd.cut(df[col], edges, labels = labels, duplicates = 'drop')
    return df

In [16]:
labels = ['Flop', 'Below_average', 'Average', 'Popular']

In [17]:
categorize_col(df, 'Vote_Average', labels)

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre,Year
0,2021-12-15,Spider-Man: No Way Home,5083.954,8940,Popular,"Action, Adventure, Science Fiction",2021
1,2022-03-01,The Batman,3827.658,1151,Popular,"Crime, Mystery, Thriller",2022
2,2022-02-25,No Exit,2618.087,122,Below_average,Thriller,2022
3,2021-11-24,Encanto,2402.201,5076,Popular,"Animation, Comedy, Family, Fantasy",2021
4,2021-12-22,The King's Man,1895.511,1793,Average,"Action, Adventure, Thriller, War",2021
...,...,...,...,...,...,...,...
9822,1973-10-15,Badlands,13.357,896,Popular,"Drama, Crime",1973
9823,2020-10-01,Violent Delights,13.356,8,Flop,Horror,2020
9824,2016-05-06,The Offering,13.355,94,Flop,"Mystery, Thriller, Horror",2016
9825,2021-03-31,The United States vs. Billie Holiday,13.354,152,Average,"Music, Drama, History",2021


In [19]:
df['Vote_Average'].unique()

['Popular', 'Below_average', 'Average', 'Flop', NaN]
Categories (4, object): ['Flop' < 'Below_average' < 'Average' < 'Popular']

In [20]:
df.head()

Unnamed: 0,Release_Date,Title,Popularity,Vote_Count,Vote_Average,Genre,Year
0,2021-12-15,Spider-Man: No Way Home,5083.954,8940,Popular,"Action, Adventure, Science Fiction",2021
1,2022-03-01,The Batman,3827.658,1151,Popular,"Crime, Mystery, Thriller",2022
2,2022-02-25,No Exit,2618.087,122,Below_average,Thriller,2022
3,2021-11-24,Encanto,2402.201,5076,Popular,"Animation, Comedy, Family, Fantasy",2021
4,2021-12-22,The King's Man,1895.511,1793,Average,"Action, Adventure, Thriller, War",2021


In [21]:
df['Vote_Average'].value_counts()

Vote_Average
Flop             2467
Popular          2450
Average          2412
Below_average    2398
Name: count, dtype: int64