## 1) Import libraries

In [13]:
from datetime import datetime as dt
import pandas as pd
import numpy as np


## 2) Load dataset

In [14]:
# Read netflix_data.csv into the working DataFrame; the path is relative to
# the directory the notebook kernel is running in.
df = pd.read_csv(filepath_or_buffer='../Data/netflix_data.csv')

# Show the first five rows as a quick sanity check of the load.
df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,duration,description,genre
0,s2,Movie,7:19,Jorge Michel Grau,"Demián Bichir, Héctor Bonilla, Oscar Serrano, ...",Mexico,23-Dec-16,2016,93,After a devastating earthquake hits Mexico Cit...,Dramas
1,s3,Movie,23:59,Gilbert Chan,"Tedd Chan, Stella Chung, Henley Hii, Lawrence ...",Singapore,20-Dec-18,2011,78,"When an army recruit is found dead, his fellow...",Horror Movies
2,s4,Movie,9,Shane Acker,"Elijah Wood, John C. Reilly, Jennifer Connelly...",United States,16-Nov-17,2009,80,"In a postapocalyptic world, rag-doll robots hi...",Action
3,s5,Movie,21,Robert Luketic,"Jim Sturgess, Kevin Spacey, Kate Bosworth, Aar...",United States,1-Jan-20,2008,123,A brilliant group of students become card-coun...,Dramas
4,s6,TV Show,46,Serdar Akar,"Erdal Beşikçioğlu, Yasemin Allen, Melis Birkan...",Turkey,1-Jul-17,2016,1,A genetics professor experiments with a treatm...,International TV


## 3) Basic structure & quick checks

In [15]:

# Quick structural overview of `df`: size, column names, dtypes, null counts,
# and summary statistics for numeric and object columns.
print("Shape:", df.shape)
print("\nColumns:", df.columns.tolist())

# Header and dtypes are printed separately: passing both to a single print()
# inserts the default separator (one space) right after the "\n", which
# shifts the first dtype row one column to the right.
print("\nData types:")
print(df.dtypes)

print("\nInfo:")
df.info()  # writes its report straight to stdout

print("\nMissing values (per column):")
print(df.isnull().sum())

print("\nNumeric summary:")
display(df.describe(include=[np.number]))

print("\nNon-numeric summary (top):")
# Transposed so that each DataFrame column becomes one row of the summary.
display(df.describe(include=[object]).T)


Shape: (4812, 11)

Columns: ['show_id', 'type', 'title', 'director', 'cast', 'country', 'date_added', 'release_year', 'duration', 'description', 'genre']

Data types:
 show_id         object
type            object
title           object
director        object
cast            object
country         object
date_added      object
release_year     int64
duration         int64
description     object
genre           object
dtype: object

Info:
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4812 entries, 0 to 4811
Data columns (total 11 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   show_id       4812 non-null   object
 1   type          4812 non-null   object
 2   title         4812 non-null   object
 3   director      4812 non-null   object
 4   cast          4812 non-null   object
 5   country       4812 non-null   object
 6   date_added    4812 non-null   object
 7   release_year  4812 non-null   int64 
 8   duration      4812 non-nul

Unnamed: 0,release_year,duration
count,4812.0,4812.0
mean,2012.711554,99.566708
std,9.517978,30.889305
min,1942.0,1.0
25%,2011.0,88.0
50%,2016.0,99.0
75%,2018.0,116.0
max,2021.0,253.0



Non-numeric summary (top):


Unnamed: 0,count,unique,top,freq
show_id,4812,4812,s2,1
type,4812,2,Movie,4677
title,4812,4812,7:19,1
director,4812,3615,"Raúl Campos, Jan Suter",18
cast,4812,4690,Samuel West,10
country,4812,72,United States,1886
date_added,4812,1292,1-Jan-20,100
description,4812,4807,When pretty new neighbor Seema falls for their...,2
genre,4812,31,Dramas,1343


## 4) Cleaning & preprocessing

In [18]:
# Convert `date_added` to datetime and derive day / month / year columns.
# The whole step is skipped when the column is absent.
if 'date_added' in df.columns:
    # Work on trimmed strings so stray leading/trailing whitespace cannot
    # cause a parse failure.
    date_text = df['date_added'].astype('string').str.strip()

    # One parsing pass; any value pandas cannot parse becomes NaT because of
    # errors='coerce' (no exception is raised).
    parsed_dates = pd.to_datetime(date_text, errors='coerce')

    # Coercion is silent, so report values that were present in the input
    # but could not be parsed, instead of letting them disappear unnoticed.
    n_unparsed = int((date_text.notna() & parsed_dates.isna()).sum())
    if n_unparsed:
        print(f"Warning: {n_unparsed} date_added value(s) could not be parsed and were set to NaT")

    df['date_added'] = parsed_dates

    # Extract calendar components from the parsed dates.
    df['added_day'] = df['date_added'].dt.day
    df['added_month'] = df['date_added'].dt.month
    df['added_year'] = df['date_added'].dt.year

    print(df[['date_added', 'added_day', 'added_month', 'added_year']].head())
    print(f"\nDate range: {df['date_added'].min()} to {df['date_added'].max()}")

  date_added  added_day  added_month  added_year
0 2016-12-23         23           12        2016
1 2018-12-20         20           12        2018
2 2017-11-16         16           11        2017
3 2020-01-01          1            1        2020
4 2017-07-01          1            7        2017

Date range: 2008-01-01 00:00:00 to 2021-01-16 00:00:00


In [19]:
# Quick value checks on selected columns. Each check first handles the case
# where its column is absent, printing a notice instead of raising a KeyError.

# Ten most frequent values of `country`.
if 'country' not in df.columns:
    print(" 'country' column not found!")
else:
    print("Top 10 countries:")
    country_counts = df['country'].value_counts()
    print(country_counts.head(10))

# Number of rows per `type` value.
if 'type' not in df.columns:
    print(" 'type' column not found!")
else:
    print("\nCount by type:")
    type_counts = df['type'].value_counts()
    print(type_counts)

# Row counts per release year, ordered by year, limited to the last 20
# entries of that ordered listing.
if 'release_year' not in df.columns:
    print(" 'release_year' column not found!")
else:
    print("\nRelease year distribution (last 20 years):")
    rows_per_year = df['release_year'].value_counts().sort_index()
    print(rows_per_year.tail(20))

# Number of rows where `date_added` is null.
if 'date_added' not in df.columns:
    print(" 'date_added' column not found!")
else:
    n_missing_dates = df['date_added'].isna().sum()
    print("\nRows with missing date_added:", n_missing_dates)



Top 10 countries:
country
United States     1886
India              864
United Kingdom     311
Canada             155
France             133
Spain              112
Egypt               91
Turkey              81
Japan               81
Philippines         74
Name: count, dtype: int64

Count by type:
type
Movie      4677
TV Show     135
Name: count, dtype: int64

Release year distribution (last 20 years):
release_year
2002     39
2003     39
2004     50
2005     59
2006     75
2007     69
2008     99
2009    101
2010    125
2011    119
2012    150
2013    183
2014    224
2015    340
2016    562
2017    646
2018    624
2019    488
2020    379
2021      4
Name: count, dtype: int64

Rows with missing date_added: 0
