# Extraction and Load part 

In [None]:
# pandas supplies the DataFrame type used for all data handling in this notebook
import pandas as pd

# Load the Netflix titles CSV (relative path, resolved against the working directory)
# into the DataFrame `df`
csv_path = 'netflix_titles.csv'
df = pd.read_csv(csv_path)


In [None]:
# Quick look at the data: show the first 5 rows of the DataFrame
df.head(n=5)


Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
0,s1,Movie,Dick Johnson Is Dead,Kirsten Johnson,,United States,"September 25, 2021",2020,PG-13,90 min,Documentaries,"As her father nears the end of his life, filmm..."
1,s2,TV Show,Blood & Water,,"Ama Qamata, Khosi Ngema, Gail Mabalane, Thaban...",South Africa,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, TV Dramas, TV Mysteries","After crossing paths at a party, a Cape Town t..."
2,s3,TV Show,Ganglands,Julien Leclercq,"Sami Bouajila, Tracy Gotoas, Samuel Jouy, Nabi...",,"September 24, 2021",2021,TV-MA,1 Season,"Crime TV Shows, International TV Shows, TV Act...",To protect his family from a powerful drug lor...
3,s4,TV Show,Jailbirds New Orleans,,,,"September 24, 2021",2021,TV-MA,1 Season,"Docuseries, Reality TV","Feuds, flirtations and toilet talk go down amo..."
4,s5,TV Show,Kota Factory,,"Mayur More, Jitendra Kumar, Ranjan Raj, Alam K...",India,"September 24, 2021",2021,TV-MA,2 Seasons,"International TV Shows, Romantic TV Shows, TV ...",In a city of coaching centers known to train I...


In [None]:
# SQLAlchemy supplies the engine/connection layer pandas uses to talk to SQL Server
import sqlalchemy as sal

# Engine for the 'Netflix_DB' database on Microsoft SQL Server.
# Replace 'DESKTOP-MIHAVPG' with your own server name if it differs, and make sure
# the ODBC driver named in the URL is installed on this machine.
engine = sal.create_engine('mssql://DESKTOP-MIHAVPG/Netflix_DB?driver=ODBC+DRIVER+17+FOR+SQL+SERVER')

# Append the DataFrame 'df' to the SQL table 'netflix_table12'.
# - engine.begin() opens a connection inside a transaction: it commits when the block
#   finishes, rolls back if the block raises, and always releases the connection, so
#   no open handle is left behind in the kernel.
# - index=False prevents pandas from writing the DataFrame index as a column.
# - if_exists='append' adds the rows to the table if it already exists, so running
#   this cell again inserts the same rows a second time.
# NOTE: `conn` is closed once the block ends; open a new connection from `engine`
# for any further queries.
with engine.begin() as conn:
    rows_written = df.to_sql('netflix_table12', con=conn, index=False, if_exists='append')

# Display the value returned by to_sql as the cell output
rows_written


107

In [None]:
# Number of rows (entries) in the DataFrame, counted from its index
len(df.index)


8807

In [73]:
# Dimensions of the DataFrame as a (rows, columns) tuple
n_rows, n_cols = df.shape
(n_rows, n_cols)


(8807, 12)

In [61]:
# Select the row(s) whose show_id equals 's5023'
is_target_show = df['show_id'] == 's5023'
df.loc[is_target_show]

Unnamed: 0,show_id,type,title,director,cast,country,date_added,release_year,rating,duration,listed_in,description
5022,s5023,Movie,반드시 잡는다,Hong-seon Kim,Baek Yoon-sik,South Korea,"February 28, 2018",2017,TV-MA,110 min,"Dramas, International Movies, Thrillers",After people in his town start turning up dead...


In [None]:
# Longest value in the 'title' column, counted in characters.
# Knowing the maximum length helps define an appropriate VARCHAR size when creating
# the table in SQL Server, so values are not truncated on load.
# Series.max() is vectorised and skips missing values, so the same expression can be
# reused on the text columns that contain nulls (e.g. director, cast, country),
# where the built-in max() would give unreliable results because of NaN comparisons.
# We should do the same for all text-based columns before defining the table.
int(df['title'].str.len().max())


5

In [74]:
# Count missing (null) values per column:
# build a boolean mask of nulls, then total it column by column
null_mask = df.isna()
null_mask.sum()


show_id            0
type               0
title              0
director        2634
cast             825
country          831
date_added        10
release_year       0
rating             4
duration           3
listed_in          0
description        0
dtype: int64