In [None]:
# Load the YAML configuration and the dataset it points to
import pandas as pd
import yaml

# Read the config explicitly as UTF-8 instead of relying on the platform's
# default locale encoding, so the notebook behaves the same on every OS.
with open("../config.yaml", "r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

# The CSV path is taken from the config entry input_data -> file
df = pd.read_csv(config['input_data']['file'])
df.head()  # Show the first 5 rows of the dataset


In [None]:
# Explore structure and metadata.
# Only the final expression of a cell is echoed automatically, so the shape and
# the column names are printed explicitly to keep them visible.
print("Shape (rows, columns):", df.shape)
print("Columns:", df.columns.tolist())
df.info()        # Prints data types and non-null counts per column

In [None]:
# Missing values per column, largest count first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Number of unique values per column, followed by the distribution of key
# categorical columns. Every summary is printed explicitly: a bare expression in
# the middle of a cell is evaluated but not displayed, so without the prints
# only the last summary would be visible.
print("Unique values per column:")
print(df.nunique().sort_values())

print("\nShare of each content type:")
print(df['type'].value_counts(normalize=True))  # Relative frequencies of 'type'

print("\nTop 10 countries:")
print(df['country'].value_counts().head(10))

print("\nMost common ratings:")
print(df['rating'].value_counts().head(10))

print("\nMost frequent genre combinations:")
print(df['listed_in'].value_counts().head(10))


In [None]:
# Print labelled frequency tables for presentation/exploration.
# Each entry pairs the heading to print with the counts shown beneath it.
summary_tables = [
    ("Content types:", df['type'].value_counts()),
    ("Most common ratings:", df['rating'].value_counts().head(10)),
    ("Top countries by number of titles:", df['country'].value_counts().head(10)),
    ("Most frequent release years:", df['release_year'].value_counts().head(10)),
]

for heading, counts in summary_tables:
    print(heading)
    print(counts)


In [None]:
# View a sample of the genres column.
# A fixed random_state makes the preview identical on every run, in line with
# the seeded 'date_added' sample used elsewhere in this notebook.
df['listed_in'].sample(10, random_state=1)


In [None]:
# Peek at the first 20 distinct values of the 'country' column
distinct_countries = df['country'].unique()
distinct_countries[:20]


In [None]:
# Break 'duration' into its numeric part and its unit (e.g., "90 min" → 90 + 'min').
# str.extract returns one column per capture group: 0 = digits, 1 = unit word.
duration_parts = df['duration'].str.extract(r'(\d+)\s*(\w+)')
df['duration_int'] = pd.to_numeric(duration_parts[0], errors='coerce')  # digits → number, failures → NaN
df['duration_type'] = duration_parts[1]
df[['duration', 'duration_int', 'duration_type']].head()  # Preview new columns


In [None]:
# Convert 'date_added' column to datetime format.
# Text values are trimmed first: surrounding whitespace can keep an entry from
# matching the date format pandas parses with, and errors='coerce' would then
# silently turn that entry into NaT. Non-text values (NaN, or timestamps when
# the cell is re-run) are passed through untouched.
date_added_trimmed = df['date_added'].map(
    lambda value: value.strip() if isinstance(value, str) else value
)
df['date_added'] = pd.to_datetime(date_added_trimmed, errors='coerce')
df['date_added'].dtype  # Confirm conversion


In [None]:
# Turn the ', '-separated 'listed_in' text into one list of genres per row
# (for multi-label classification)
genre_lists = df['listed_in'].str.split(pat=', ')
df['genres'] = genre_lists
df[['listed_in', 'genres']].sample(n=5)  # Spot-check a few random rows

In [None]:
# Spot-check the derived duration columns on a random sample.
# 'duration_int' and 'duration_type' were already extracted from 'duration' and
# converted to numeric in an earlier cell, and nothing in between modifies them,
# so the extraction is not repeated here.
df[['duration', 'duration_int', 'duration_type']].sample(10)



In [None]:
# Frequency of duration types (e.g., 'min', 'Season') before normalization.
# Printed explicitly: a bare expression in the middle of a cell is not displayed.
print("Duration types before normalization:")
print(df['duration_type'].value_counts())

# Fix inconsistency: fold the plural 'Seasons' into the singular 'Season'
df['duration_type'] = df['duration_type'].replace('Seasons', 'Season')
df['duration_type'].value_counts()  # Confirm correction

In [None]:
# Flag rows whose duration could not be parsed into a number
missing_duration = df['duration_int'].isna()
print("Rows with missing duration:", missing_duration.sum())

# Capture the affected rows BEFORE filtering. Inspecting them after the filter
# (as the cell did previously) always yields an empty frame.
rows_missing_duration = df.loc[missing_duration, ['duration']]

# Keep only rows with a valid duration. .copy() produces an independent frame so
# later column assignments do not operate on a slice of the unfiltered data.
df = df[~missing_duration].copy()

rows_missing_duration  # Inspect rows where duration was missing


In [None]:
# Drop rows that repeat the same (show_id, title, release_year) combination
dedup_keys = ['show_id', 'title', 'release_year']
df = df.drop_duplicates(subset=dedup_keys)
print("Number of rows after removing duplicates:", len(df))

In [None]:
# Count rows that still repeat an earlier (show_id, title, release_year) combination
is_repeat = df.duplicated(subset=['show_id', 'title', 'release_year'])
duplicates = is_repeat.sum()
print("Remaining duplicates:", duplicates)

In [None]:
# Missing values per column, largest count first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)


In [None]:
# Fill missing values with appropriate defaults.
# assign() builds a new DataFrame instead of writing columns into `df`, which at
# this point is the result of earlier row filtering; writing into such a frame
# can trigger pandas' chained-assignment warning.
df = df.assign(
    director=df['director'].fillna("Unknown"),   # 1. missing 'director' -> "Unknown"
    cast=df['cast'].fillna("Unknown"),           # 2. missing 'cast' -> "Unknown"
    country=df['country'].fillna("Unknown"),     # 3. missing 'country' -> "Unknown"
    rating=df['rating'].fillna("Not Rated"),     # 5. missing 'rating' -> "Not Rated"
)

# 4. Drop rows where 'date_added' is missing. The fills above do not depend on
# this filter, so the final frame is the same as filling and filtering in the
# original order. .copy() keeps the result independent of the unfiltered frame.
df = df[df['date_added'].notna()].copy()

In [None]:
# Missing values per column after cleaning, largest count first
missing_per_column = df.isna().sum()
missing_per_column.sort_values(ascending=False)

In [None]:
# Normalize column names: lower-case, trim surrounding whitespace, spaces -> underscores.
# NOTE(review): earlier cells already access columns by lower-case, space-free
# names, so this may be a no-op for this dataset; confirm whether it should run
# right after loading instead.
df.columns = df.columns.map(lambda name: name.lower().strip().replace(' ', '_'))

In [None]:
def _first_listed_country(value):
    """Return the first non-empty, whitespace-trimmed entry of a comma-separated string.

    Non-string values (e.g. NaN) are returned unchanged. If every entry is
    blank, the trimmed first entry (an empty string) is returned.
    """
    if not isinstance(value, str):
        return value
    names = [part.strip() for part in value.split(',')]
    # Plain split(',')[0] returns '' for a leading comma (", France") and keeps
    # stray spaces; skip blank entries and trim instead.
    return next((name for name in names if name), names[0])


# Reduce 'country' to the first country listed for each title.
# NOTE(review): this overwrites the multi-country text in 'country', so a
# 'main_country' column derived from it afterwards carries the same values.
# Confirm that discarding the co-producing countries is intended.
df['country'] = df['country'].apply(_first_listed_country)

In [None]:
# Print full frequency tables for the low-cardinality columns,
# then the 20 most frequent countries.
for column_name in ('duration_type', 'type', 'rating'):
    print(df[column_name].value_counts())

top_countries = df['country'].value_counts().head(20)
print(top_countries)


In [None]:
# Map the listed "no rating" labels onto 'Unrated' and 'TV-Y7-FV' onto 'TV-Y7'.
# NOTE(review): a later cell renames 'Unrated' to 'Not Rated'; confirm which
# label is meant to be final.
unrated_labels = ['NR', 'UR', 'Not Rated', 'UNRATED']
rating_aliases = dict.fromkeys(unrated_labels, 'Unrated')
rating_aliases['TV-Y7-FV'] = 'TV-Y7'

df['rating'] = df['rating'].replace(rating_aliases)

In [None]:
# Lower-case the duration unit labels
df = df.assign(duration_type=df['duration_type'].str.lower())

In [None]:
def extract_main_country(country):
    """Return the first non-empty country name from a comma-separated string.

    Args:
        country: A value from the 'country' column. Non-string values
            (e.g. NaN or None) are returned unchanged.

    Returns:
        The first comma-separated entry that is non-empty after trimming
        whitespace. If every entry is blank, the trimmed first entry (an empty
        string) is returned. Non-strings are passed through as-is.
    """
    if not isinstance(country, str):
        return country
    names = [name.strip() for name in country.split(',')]
    # Plain split(',')[0] returns '' for a leading comma (", France") and keeps
    # stray spaces; skip blank entries and trim instead.
    return next((name for name in names if name), names[0])

# Derive 'main_country' by running extract_main_country on every 'country' value
main_country = df['country'].map(extract_main_country)
df['main_country'] = main_country

In [None]:
# Compare the original and derived country columns on a random sample, then
# list the most frequent main countries. Both are printed explicitly because
# only the last expression of a cell is displayed automatically.
print(df[['country', 'main_country']].sample(10))
print(df['main_country'].value_counts().head(10))

# Sanity check: rows whose 'main_country' still contains a comma
# (an empty result means the extraction left no multi-country values behind)
df[df['main_country'].str.contains(',', na=False)][['country', 'main_country']]


In [None]:
# Normalize column names: trim whitespace, lower-case, spaces -> underscores
df.columns = df.columns.str.strip().str.lower().str.replace(' ', '_')

# Re-check the main categorical columns. The first two tables are printed
# explicitly; otherwise only the final expression of the cell would be shown.
print(df['type'].value_counts())
print(df['duration_type'].value_counts())
df['rating'].value_counts()


In [None]:
# Relabel 'Unrated' as 'Not Rated', then show the resulting rating frequencies
relabelled_rating = df['rating'].replace({'Unrated': 'Not Rated'})
df['rating'] = relabelled_rating
df['rating'].value_counts()


In [None]:
# Report the dtype and the value range of 'date_added', then show a seeded sample
date_added = df['date_added']

print("Date format:", date_added.dtype)
print("Earliest date:", date_added.min())
print("Latest date:", date_added.max())

date_added.sample(n=10, random_state=1)

In [None]:
# Mean of 'duration_int' within each 'duration_type' group
# (rows are grouped by the duration unit label, not by the 'type' column)
duration_by_unit = df.groupby('duration_type')['duration_int']
avg_duration = duration_by_unit.mean().reset_index()

print("🔹 Average duration per type:")
print(avg_duration)

In [None]:
# Tally titles per main country as a two-column table: 'country', 'count'
country_counts = (
    df['main_country']
    .value_counts()
    .rename_axis('country')
    .reset_index(name='count')
)

print("\n🔹 Number of titles by country:")
print(country_counts.head(10))

In [None]:
# Give every genre in a title's list its own row, then tally genres
# into a two-column table: 'genre', 'count'
df_exploded = df.explode('genres')
genre_counts = (
    df_exploded['genres']
    .value_counts()
    .rename_axis('genre')
    .reset_index(name='count')
)

print("\n🔹 Most common genres:")
print(genre_counts.head(10))

In [None]:
# Number of titles per release year, ordered by year,
# as a two-column table: 'release_year', 'count'
titles_per_year = (
    df.groupby('release_year')
    .size()
    .reset_index(name='count')
)

print("\n🔹 Titles by release year:")
print(titles_per_year.tail(10))  # The last rows are the most recent years

In [None]:
# Export the cleaned DataFrame to CSV (without the index column).
from pathlib import Path

clean_csv_path = Path('../data/clean/cleaned_netflix_data.csv')
# to_csv does not create missing folders; make sure the target directory exists
# so the export also works on a fresh checkout.
clean_csv_path.parent.mkdir(parents=True, exist_ok=True)
df.to_csv(clean_csv_path, index=False)


In [None]:
# Write the cleaned DataFrame to CSV (index column omitted) for SQL import or future use.
# NOTE(review): the preceding cell performs the same export to the same path;
# one of the two cells can probably be removed.
cleaned_csv_target = '../data/clean/cleaned_netflix_data.csv'
df.to_csv(cleaned_csv_target, index=False)