# Netflix Data Analysis

In [9]:
# importing pandas library
import pandas as pd

### Data Loading & Overview

In [28]:
# Loading the Netflix catalogue from the CSV file that sits next to this notebook.
# The resulting frame is the input for every later cell.

netflix_raw = pd.read_csv(filepath_or_buffer='netflix.csv')

In [22]:
# Print a structural summary of the raw frame: number of rows, column names,
# non-null count per column, dtypes and memory usage.
# (The first few rows are shown by the next cell, not this one.)

netflix_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      6173 non-null   object
 4   cast          7982 non-null   object
 5   country       7976 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8803 non-null   object
 9   duration      8804 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [24]:
# Show the first five rows to get a feel for the values in each column.
netflix_raw.head(n=5)

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


### Data Cleaning, Transformation and Exporting

In [37]:
# Cleaning the data: handling missing values in the text columns.
#
# Every missing entry in the columns listed below is replaced by the literal
# label 'Unknown', so that none of these columns contains NaN afterwards.
# `.info()` is printed at the end to confirm the new non-null counts.

categorical_cols = ['director', 'cast', 'country', 'rating', 'duration']
unknown_fill = dict.fromkeys(categorical_cols, 'Unknown')  # column -> fill label
netflix_raw[categorical_cols] = netflix_raw[categorical_cols].fillna(value=unknown_fill)
netflix_raw.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       8807 non-null   object
 1   type          8807 non-null   object
 2   title         8807 non-null   object
 3   director      8807 non-null   object
 4   cast          8807 non-null   object
 5   country       8807 non-null   object
 6   date_added    8797 non-null   object
 7   release_year  8807 non-null   int64 
 8   rating        8807 non-null   object
 9   duration      8807 non-null   object
 10  listed_in     8807 non-null   object
 11  description   8807 non-null   object
dtypes: int64(1), object(11)
memory usage: 825.8+ KB


In [63]:
# For `date_added` column: give missing values a placeholder date.
#
# Missing entries are first labelled 'Unknown'; every 'Unknown' label in the
# column (including any that was already present) is then mapped to the
# sentinel date string '1900-01-01'.
# NOTE(review): rows carrying 1900-01-01 are placeholders, not real dates —
# presumably they should be filtered out of any time-based analysis; confirm.

netflix_raw['date_added'] = (
    netflix_raw['date_added']
    .fillna('Unknown')
    .replace('Unknown', '1900-01-01')
)

In [65]:
# Convert `date_added` column to datetime.
#
# errors='coerce' turns every value that cannot be parsed into NaT without
# raising, so a parsing problem would otherwise go unnoticed. The number of
# missing values is therefore compared before and after parsing, and any value
# lost in the conversion is reported.
# NOTE(review): pandas >= 2.0 infers a single format from the first value and
# applies it to the whole column, so values written in another layout (e.g. an
# ISO-style '1900-01-01' among 'September 25, 2021'-style dates, or dates with
# stray whitespace) may be coerced to NaT — confirm against the installed
# pandas version.
missing_before_parse = netflix_raw['date_added'].isna().sum()
date_added_parsed = pd.to_datetime(netflix_raw['date_added'], errors='coerce')
coerced_to_nat = int(date_added_parsed.isna().sum() - missing_before_parse)
if coerced_to_nat:
    print(f"Warning: {coerced_to_nat} `date_added` value(s) could not be parsed and were set to NaT")
netflix_raw['date_added'] = date_added_parsed

In [67]:
# Preview cleaned data information and check for remaining missing values.
#
# DataFrame.info() prints its report and returns None, so it is called only
# for its printed output instead of being stored in a variable (storing it
# merely put a meaningless `None` into the displayed result).
netflix_raw.info()

# Number of missing values left in each column; displayed as the cell result.
remaining_nulls = netflix_raw.isnull().sum()

remaining_nulls

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8807 entries, 0 to 8806
Data columns (total 12 columns):
 #   Column        Non-Null Count  Dtype         
---  ------        --------------  -----         
 0   show_id       8807 non-null   object        
 1   type          8807 non-null   object        
 2   title         8807 non-null   object        
 3   director      8807 non-null   object        
 4   cast          8807 non-null   object        
 5   country       8807 non-null   object        
 6   date_added    8807 non-null   datetime64[ns]
 7   release_year  8807 non-null   int64         
 8   rating        8807 non-null   object        
 9   duration      8807 non-null   object        
 10  listed_in     8807 non-null   object        
 11  description   8807 non-null   object        
dtypes: datetime64[ns](1), int64(1), object(10)
memory usage: 825.8+ KB


(None,
 show_id         0
 type            0
 title           0
 director        0
 cast            0
 country         0
 date_added      0
 release_year    0
 rating          0
 duration        0
 listed_in       0
 description     0
 dtype: int64)

In [81]:
# Saving the cleaned dataset to a CSV file for MySQL import.
# The row index is left out so that the file contains only the data columns.
# NOTE(review): the file name says "cleared" — possibly meant "cleaned"; it is
# kept unchanged because the import step may rely on this exact name.
cleaned_file_path = 'netflix_cleared.csv'
netflix_raw.to_csv(path_or_buf=cleaned_file_path, index=False)

# Echo the output path as the cell result.
cleaned_file_path

'netflix_cleared.csv'