# Data Cleaning for Netflix Viewing Trends Project

This notebook loads every raw data file (CSV or Excel) listed below from the `data/raw` folder, cleans it, and saves the cleaned version as a CSV file in the `data/processed` folder.

In [1]:
import importlib.util
import os

import numpy as np
import pandas as pd

# Input and output folders, given relative to the notebook's working directory.
raw_dir, processed_dir = '../data/raw', '../data/processed'

# Raw source files to clean. Despite the variable name, the list may contain
# Excel workbooks as well as CSV files.
csv_files = [
    'MoviesOnStreamingPlatforms_updated.csv',
    'Netflix Revenue updated.xlsx',
    'TV_Shows.csv',
]

# Create the output folder up front; an already existing folder is fine.
os.makedirs(processed_dir, exist_ok=True)

## Cleaning Function

Define a cleaning function to apply consistent cleaning steps to all files.

In [2]:
def clean_dataframe(df):
    """Return a cleaned copy of ``df``; the input frame is not modified.

    Steps, in order:
      1. Strip leading/trailing whitespace from string values in object columns.
      2. Drop duplicate rows (after stripping, so rows that differ only by
         stray whitespace count as duplicates).
      3. Fill missing values in numeric columns with the column median.
      4. Fill missing values in object columns with the column mode.

    A column with no non-missing values has no median/mode and is left as is.

    Parameters
    ----------
    df : pandas.DataFrame
        Raw data to clean.

    Returns
    -------
    pandas.DataFrame
        A new frame. Index labels of the surviving rows are preserved.
    """
    # Explicit copy: the column assignments below must never reach the
    # caller's frame.
    df = df.copy()

    object_cols = list(df.select_dtypes(include='object').columns)

    for col in object_cols:
        # `.str.strip()` would replace every non-string value of a mixed-type
        # column with NaN, so only genuine strings are stripped here.
        df[col] = df[col].map(lambda v: v.strip() if isinstance(v, str) else v)

    df = df.drop_duplicates()

    for col in df.select_dtypes(include=[np.number]).columns:
        median = df[col].median()
        # The median is NaN when the column has no valid values; nothing to fill with.
        if pd.notna(median):
            df[col] = df[col].fillna(median)

    for col in object_cols:
        if df[col].isnull().any():
            modes = df[col].mode()
            # `mode()` is empty for an all-null column; indexing it would raise.
            if not modes.empty:
                df[col] = df[col].fillna(modes.iloc[0])

    return df

## Clean and Save Each File

In [None]:
# Excel support: pandas needs openpyxl to read .xlsx files. Install it through
# the %pip magic rather than a shell `pip`, so the package goes into the
# environment of the running kernel, and only when it is actually missing.
if importlib.util.find_spec('openpyxl') is None:
    get_ipython().run_line_magic('pip', 'install openpyxl')
    importlib.invalidate_caches()  # let the running kernel see the new package

# Each file is processed independently: a failure is reported and recorded,
# and the loop moves on to the next file.
failed_files = []
for filename in csv_files:
    try:
        print(f"\nProcessing file: {filename}")
        raw_path = os.path.join(raw_dir, filename)

        # Check if file exists
        if not os.path.exists(raw_path):
            print(f"Error: File not found - {raw_path}")
            failed_files.append(filename)
            continue

        name, ext = os.path.splitext(filename)
        processed_filename = f"{name}_cleaned.csv"  # Always save as CSV
        processed_path = os.path.join(processed_dir, processed_filename)

        # Pick the reader from the file extension (case-insensitive).
        print(f"Reading file: {raw_path}")
        if ext.lower() == '.csv':
            df = pd.read_csv(raw_path)
        elif ext.lower() in ['.xls', '.xlsx']:
            df = pd.read_excel(raw_path, engine='openpyxl')
        else:
            print(f"Unsupported file type: {filename}")
            failed_files.append(filename)
            continue

        print(f"Shape before cleaning: {df.shape}")
        df_clean = clean_dataframe(df)
        print(f"Shape after cleaning: {df_clean.shape}")

        # Save as CSV
        df_clean.to_csv(processed_path, index=False)
        print(f"Successfully saved to: {processed_path}")

    except Exception as e:
        # Best effort by design: report the problem and keep going.
        print(f"Error processing {filename}: {str(e)}")
        failed_files.append(filename)

# Make failures visible next to the final status line.
if failed_files:
    print(f"\nFiles not processed: {failed_files}")
print("\nProcessing complete!")

Cleaned file saved to: ../data/processed\MoviesOnStreamingPlatforms_updated_cleaned.csv


ImportError: Missing optional dependency 'openpyxl'.  Use pip or conda to install openpyxl.

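Once the cell above finishes without errors, every raw file has a cleaned CSV counterpart in the `data/processed` folder. If an error is reported for a file (for example a missing `openpyxl` dependency for the Excel workbook), fix it and re-run the cell.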