# Importing the libraries 

In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns 

# About the dataset
### Netflix is a popular streaming service that offers a vast catalog of movies, TV shows, and original content. This dataset is a cleaned version of the original dataset, which can be found here. The data consists of content added to Netflix from 2008 to 2021; the oldest title was released in 1925 and the newest in 2021. This dataset will be cleaned with PostgreSQL and visualized with Tableau. The purpose of this dataset is to test my data cleaning and visualization skills. The cleaned data can be found below and the Tableau dashboard can be found here.

In [3]:
# Load the Netflix titles CSV (path relative to the working directory) into `df`.
df=pd.read_csv('netflix1.csv')

In [25]:
# Total number of cells in the frame (rows x columns).
# NOTE(review): this cell carries execution count 25, i.e. it was run out of
# order. The recorded 79110 equals 8790 x 9, which matches the frame *after*
# 'rating' was dropped; on a fresh top-to-bottom run it is evaluated here on
# the 10-column frame and will report a different number.
df.size

79110

In [4]:
# Preview the first rows to sanity-check the load.
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,PG-13,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,TV-MA,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,TV-PG,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,TV-MA,125 min,"Dramas, Independent Movies, International Movies"


In [6]:
# Inspect the last rows as well, to check the end of the file parsed cleanly.
df.tail()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
8785,s8797,TV Show,Yunus Emre,Not Given,Turkey,1/17/2017,2016,TV-PG,2 Seasons,"International TV Shows, TV Dramas"
8786,s8798,TV Show,Zak Storm,Not Given,United States,9/13/2018,2016,TV-Y7,3 Seasons,Kids' TV
8787,s8801,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,12/15/2016,2012,TV-PG,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
8788,s8784,TV Show,Yoko,Not Given,Pakistan,6/23/2018,2016,TV-Y,1 Season,Kids' TV
8789,s8786,TV Show,YOM,Not Given,Pakistan,6/7/2018,2016,TV-Y7,1 Season,Kids' TV


In [7]:
# Look at five randomly chosen rows. A fixed random_state makes the same rows
# come back on every run, so the displayed sample is reproducible.
df.sample(5, random_state=42)

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,rating,duration,listed_in
7157,s2191,TV Show,The Umbrella Academy,Not Given,United States,7/31/2020,2020,TV-14,2 Seasons,"TV Action & Adventure, TV Mysteries, TV Sci-Fi..."
1937,s2372,Movie,Lola Igna,Eduardo Roy Jr.,Philippines,6/18/2020,2019,TV-14,114 min,"Comedies, International Movies"
5862,s7989,Movie,Sex Doll,Sylvie Verheyde,United Kingdom,7/10/2017,2016,UR,103 min,"Dramas, International Movies, Romantic Movies"
4198,s5890,Movie,Results,Andrew Bujalski,United States,10/22/2015,2015,R,105 min,"Comedies, Independent Movies"
5391,s7385,Movie,Mahjong Heroes,Li Pei-Chuan,Hong Kong,8/16/2018,1981,TV-PG,92 min,"Comedies, International Movies"


### Dataset description


In [8]:
# Column dtypes, non-null counts and memory footprint of the raw frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 10 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8790 non-null   object
 1   type          8790 non-null   object
 2   title         8790 non-null   object
 3   director      8790 non-null   object
 4   country       8790 non-null   object
 5   date_added    8790 non-null   object
 6   release_year  8790 non-null   int64 
 7   rating        8790 non-null   object
 8   duration      8790 non-null   object
 9   listed_in     8790 non-null   object
dtypes: int64(1), object(9)
memory usage: 686.8+ KB


## Tasks in the data cleaning process
### 1. Treat the nulls
### 2. Treat the duplicates
### 3. Populate the missing values
### 4. Drop unneeded columns

In [10]:
# List the column labels before dropping / renaming anything.
df.columns

Index(['show_id', 'type', 'title', 'director', 'country', 'date_added',
       'release_year', 'rating', 'duration', 'listed_in'],
      dtype='object')

In [11]:
# Drop the 'rating' column (the "drop unneeded columns" cleaning step).
# Rebinding `df` instead of using inplace=True, together with
# errors='ignore', keeps this cell safe to re-run: a second execution is a
# no-op rather than a KeyError for the already-missing column.
df = df.drop(columns='rating', errors='ignore')

In [12]:
# Confirm that 'rating' is no longer among the columns.
df.head()

Unnamed: 0,show_id,type,title,director,country,date_added,release_year,duration,listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,125 min,"Dramas, Independent Movies, International Movies"


## Renaming of the columns

In [13]:
# Capitalized version of every column label, in the original column order
# (str.capitalize upper-cases the first character and lower-cases the rest).
new_columns = [label.capitalize() for label in df.columns]

In [15]:
# Show the proposed labels before applying them to the frame.
print(new_columns)

['Show_id', 'Type', 'Title', 'Director', 'Country', 'Date_added', 'Release_year', 'Duration', 'Listed_in']


In [16]:
# Replace the column labels positionally: one new label per existing column.
df.columns=new_columns

In [17]:
# Check that the headers now use the capitalized labels.
df.head()

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,125 min,"Dramas, Independent Movies, International Movies"


### Handling duplicates

In [18]:
# Number of rows that are exact duplicates (across all columns) of an earlier row.
df.duplicated().sum()

np.int64(0)

In [19]:
# Remove exact duplicate rows, keeping the first occurrence of each.
df = df.drop_duplicates(keep='first')

### Handling NA values

In [20]:
# Number of missing values in each column.
df.isna().sum()

Show_id         0
Type            0
Title           0
Director        0
Country         0
Date_added      0
Release_year    0
Duration        0
Listed_in       0
dtype: int64

In [21]:
# Drop every row that contains a missing value. The index is rebuilt
# afterwards so it stays a contiguous 0..n-1 range; otherwise removed rows
# leave gaps and later lookups by row label (e.g. label 0) can fail.
df = df.dropna().reset_index(drop=True)

In [22]:
# Preview the frame after the duplicate / NA handling steps.
df.head()

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,s3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,s6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,s14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,91 min,"Children & Family Movies, Comedies"
4,s8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,125 min,"Dramas, Independent Movies, International Movies"


In [23]:
# Inspect the identifiers; the displayed values are strings of the form 's<number>'.
df['Show_id']

0          s1
1          s3
2          s6
3         s14
4          s8
        ...  
8785    s8797
8786    s8798
8787    s8801
8788    s8784
8789    s8786
Name: Show_id, Length: 8790, dtype: object

In [27]:
# Strip the leading 's' from every identifier ('s14' -> '14').
# The vectorized .str accessor replaces the per-row lambda, and the
# astype(str) guard makes the cell re-runnable: values whose prefix is already
# gone (or that were already cast to integers) pass through unchanged instead
# of raising IndexError / AttributeError as x.split('s')[1] would.
df['Show_id'] = df['Show_id'].astype(str).str.removeprefix('s')

In [28]:
# Check that the 's' prefix is gone from Show_id.
df.head()

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,91 min,"Children & Family Movies, Comedies"
4,8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,125 min,"Dramas, Independent Movies, International Movies"


In [29]:
# Python type of the first Show_id value. Positional access (.iloc[0]) is used
# so the check does not depend on a row labelled 0 being present in the index.
type(df['Show_id'].iloc[0])

str

In [30]:
# Cast the digit-only identifier strings to integers.
df['Show_id']=df['Show_id'].astype(int)

In [31]:
# Verify the cast: the Series dtype should now be an integer type.
df['Show_id']

0          1
1          3
2          6
3         14
4          8
        ... 
8785    8797
8786    8798
8787    8801
8788    8784
8789    8786
Name: Show_id, Length: 8790, dtype: int64

In [33]:
# Preview the frame with the numeric Show_id column.
df.head()

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,9/25/2021,2020,90 min,Documentaries
1,3,TV Show,Ganglands,Julien Leclercq,France,9/24/2021,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,6,TV Show,Midnight Mass,Mike Flanagan,United States,9/24/2021,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,9/22/2021,2021,91 min,"Children & Family Movies, Comedies"
4,8,Movie,Sankofa,Haile Gerima,United States,9/24/2021,1993,125 min,"Dramas, Independent Movies, International Movies"


In [34]:
# Re-check dtypes: Show_id should be integer, Date_added is still an object (string) column.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   Show_id       8790 non-null   int64 
 1   Type          8790 non-null   object
 2   Title         8790 non-null   object
 3   Director      8790 non-null   object
 4   Country       8790 non-null   object
 5   Date_added    8790 non-null   object
 6   Release_year  8790 non-null   int64 
 7   Duration      8790 non-null   object
 8   Listed_in     8790 non-null   object
dtypes: int64(2), object(7)
memory usage: 618.2+ KB


In [36]:
# Preview only: show the dates with '-' separators instead of '/'.
# The result is not assigned, so df['Date_added'] itself is left untouched.
# The vectorized .str.replace (literal match, regex=False) replaces the
# per-row lambda.
df['Date_added'].str.replace('/', '-', regex=False)

0        9-25-2021
1        9-24-2021
2        9-24-2021
3        9-22-2021
4        9-24-2021
           ...    
8785     1-17-2017
8786     9-13-2018
8787    12-15-2016
8788     6-23-2018
8789      6-7-2018
Name: Date_added, Length: 8790, dtype: object

In [39]:
# Parse the 'month/day/year' strings (e.g. '9/25/2021') into datetime64 values.
# Passing the format explicitly removes any month-first / day-first guessing,
# so a date such as '6/7/2018' is always read as June 7th, and a value that
# does not match the format raises instead of being parsed differently.
df['Date_added'] = pd.to_datetime(df['Date_added'], format='%m/%d/%Y')
# Confirm the new dtype of Date_added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Show_id       8790 non-null   int64         
 1   Type          8790 non-null   object        
 2   Title         8790 non-null   object        
 3   Director      8790 non-null   object        
 4   Country       8790 non-null   object        
 5   Date_added    8790 non-null   datetime64[ns]
 6   Release_year  8790 non-null   int64         
 7   Duration      8790 non-null   object        
 8   Listed_in     8790 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 618.2+ KB


In [41]:
# Cast Release_year to integer. The preceding df.info() output already lists
# this column as int64, and the df.info() below lists it as int64 again, so
# in the recorded run this cast did not change the dtype.
df['Release_year']=df['Release_year'].astype(int)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8790 entries, 0 to 8789
Data columns (total 9 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   Show_id       8790 non-null   int64         
 1   Type          8790 non-null   object        
 2   Title         8790 non-null   object        
 3   Director      8790 non-null   object        
 4   Country       8790 non-null   object        
 5   Date_added    8790 non-null   datetime64[ns]
 6   Release_year  8790 non-null   int64         
 7   Duration      8790 non-null   object        
 8   Listed_in     8790 non-null   object        
dtypes: datetime64[ns](1), int64(2), object(6)
memory usage: 618.2+ KB


In [42]:
# Number of titles per country as a two-column frame ('Country', 'count').
# Naming the index axis and the count column explicitly keeps these labels
# identical across pandas versions (before 2.0, a bare reset_index() on
# value_counts() produced 'index' and 'Country' instead), so the filter on
# data['Country'] below always compares country names.
data = df['Country'].value_counts().rename_axis('Country').reset_index(name='count')
# Row for India only.
data[data['Country'] == 'India']

Unnamed: 0,Country,count
1,India,1057


In [43]:
# Display the cleaned frame (the rendering is truncated to the first and last rows).
df

Unnamed: 0,Show_id,Type,Title,Director,Country,Date_added,Release_year,Duration,Listed_in
0,1,Movie,Dick Johnson Is Dead,Kirsten Johnson,United States,2021-09-25,2020,90 min,Documentaries
1,3,TV Show,Ganglands,Julien Leclercq,France,2021-09-24,2021,1 Season,"Crime TV Shows, International TV Shows, TV Act..."
2,6,TV Show,Midnight Mass,Mike Flanagan,United States,2021-09-24,2021,1 Season,"TV Dramas, TV Horror, TV Mysteries"
3,14,Movie,Confessions of an Invisible Girl,Bruno Garotti,Brazil,2021-09-22,2021,91 min,"Children & Family Movies, Comedies"
4,8,Movie,Sankofa,Haile Gerima,United States,2021-09-24,1993,125 min,"Dramas, Independent Movies, International Movies"
...,...,...,...,...,...,...,...,...,...
8785,8797,TV Show,Yunus Emre,Not Given,Turkey,2017-01-17,2016,2 Seasons,"International TV Shows, TV Dramas"
8786,8798,TV Show,Zak Storm,Not Given,United States,2018-09-13,2016,3 Seasons,Kids' TV
8787,8801,TV Show,Zindagi Gulzar Hai,Not Given,Pakistan,2016-12-15,2012,1 Season,"International TV Shows, Romantic TV Shows, TV ..."
8788,8784,TV Show,Yoko,Not Given,Pakistan,2018-06-23,2016,1 Season,Kids' TV


In [44]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
df.to_csv('cleaned_netflix-data.csv',index=False)