**1. Initial Cleaning**

In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the raw churn CSV into the working DataFrame
raw_csv_path = 'netflix_customer_churn.csv'
df = pd.read_csv(raw_csv_path)

In [3]:
# Print the frame's dimensions as a (rows, columns) tuple
print(df.shape)

(5000, 14)


In [4]:
# Display the column labels of the loaded frame
df.columns

Index(['customer_id', 'age', 'gender', 'subscription_type', 'watch_hours',
       'last_login_days', 'region', 'device', 'monthly_fee', 'churned',
       'payment_method', 'number_of_profiles', 'avg_watch_time_per_day',
       'favorite_genre'],
      dtype='object')

In [5]:
# Summarize each column's non-null count and dtype
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 14 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   customer_id             5000 non-null   object 
 1   age                     5000 non-null   int64  
 2   gender                  5000 non-null   object 
 3   subscription_type       5000 non-null   object 
 4   watch_hours             5000 non-null   float64
 5   last_login_days         5000 non-null   int64  
 6   region                  5000 non-null   object 
 7   device                  5000 non-null   object 
 8   monthly_fee             5000 non-null   float64
 9   churned                 5000 non-null   int64  
 10  payment_method          5000 non-null   object 
 11  number_of_profiles      5000 non-null   int64  
 12  avg_watch_time_per_day  5000 non-null   float64
 13  favorite_genre          5000 non-null   object 
dtypes: float64(3), int64(4), object(7)
memory usage: 547.0+ KB

In [6]:
# Preview the first rows of the frame (head() defaults to five)
df.head()

Unnamed: 0,customer_id,age,gender,subscription_type,watch_hours,last_login_days,region,device,monthly_fee,churned,payment_method,number_of_profiles,avg_watch_time_per_day,favorite_genre
0,a9b75100-82a8-427a-a208-72f24052884a,51,Other,Basic,14.73,29,Africa,TV,8.99,1,Gift Card,1,0.49,Action
1,49a5dfd9-7e69-4022-a6ad-0a1b9767fb5b,47,Other,Standard,0.7,19,Europe,Mobile,13.99,1,Gift Card,5,0.03,Sci-Fi
2,4d71f6ce-fca9-4ff7-8afa-197ac24de14b,27,Female,Standard,16.32,10,Asia,TV,13.99,0,Crypto,2,1.48,Drama
3,d3c72c38-631b-4f9e-8a0e-de103cad1a7d,53,Other,Premium,4.51,12,Oceania,TV,17.99,1,Crypto,2,0.35,Horror
4,4e265c34-103a-4dbb-9553-76c9aa47e946,56,Other,Standard,1.89,13,Africa,Mobile,13.99,1,Crypto,2,0.13,Action


In [7]:
# Drop the customer_id identifier column.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps
# this cell idempotent: re-running it after the column is already gone is a
# no-op rather than a KeyError.
df = df.drop(columns='customer_id', errors='ignore')

In [8]:
# Give the target column its shorter name: 'churned' -> 'churn'
column_renames = {'churned': 'churn'}
df = df.rename(columns=column_renames)

In [9]:
# Normalize every column label to lower case
df.columns = [label.lower() for label in df.columns]

**2. Missing Values Handling**

In [10]:
# Categorical: fill missing values with each column's most frequent value
categorical_cols = ['gender', 'region', 'device', 'payment_method', 'favorite_genre', 'subscription_type']
for col in categorical_cols:
    modes = df[col].mode()
    # mode() ignores NaN and is empty when the column has no non-null values;
    # there is then nothing to impute with, so leave the column untouched
    # instead of failing on the [0] lookup.
    if modes.empty:
        continue
    # mode() returns its values sorted, so ties resolve to the first in sort order.
    df[col] = df[col].fillna(modes.iloc[0])

In [11]:
# Numerical: impute missing entries with the column median
numerical_cols = [
    'age',
    'watch_hours',
    'last_login_days',
    'monthly_fee',
    'number_of_profiles',
    'avg_watch_time_per_day',
]
for name in numerical_cols:
    series = df[name]
    df[name] = series.fillna(series.median())

**3. Standardize & Fix Types**

In [12]:
# Format string columns: trim surrounding whitespace and apply Title Case so
# variants such as ' male' and 'Male' collapse to a single category.
# NOTE: str.title() lowercases everything after a word's first letter, so an
# all-caps value like 'TV' becomes 'Tv'.
for col in categorical_cols:
    formatted = df[col].astype(str).str.strip().str.title()
    # astype(str) turns a missing value into the literal text 'nan'; where()
    # puts the original missing entries back so they are not mistaken for a
    # real category named 'Nan'.
    df[col] = formatted.where(df[col].notna())

In [13]:
# Cast the count-like columns to int and the churn flag to bool in one pass
target_dtypes = {
    'age': int,
    'last_login_days': int,
    'number_of_profiles': int,
    'churn': bool,
}
df = df.astype(target_dtypes)

**4. Validate & Clean Data**

In [14]:
# Keep only rows whose age lies in the closed range 18..100
age_in_range = df['age'].between(18, 100)
df = df[age_in_range]

In [15]:
# Keep a row only if every numerical column is >= 0
# (a NaN fails the comparison, so such rows are removed as well)
all_non_negative = df[numerical_cols].ge(0).all(axis='columns')
df = df[all_non_negative]

In [16]:
# Keep only rows whose monthly_fee matches one of the three accepted fee values.
# The fees are floats parsed from CSV, so they are compared with a small
# absolute tolerance instead of exact equality; NaN never matches and is dropped.
VALID_MONTHLY_FEES = (8.99, 13.99, 17.99)
fee_matches = np.isclose(
    df['monthly_fee'].to_numpy()[:, None],  # column vector, broadcast against the fees
    VALID_MONTHLY_FEES,
    rtol=0.0,
    atol=1e-6,
).any(axis=1)
df = df[fee_matches]

In [17]:
# Remove fully identical rows (if any).
# Reassigning avoids mutating in place a frame that came out of boolean
# filtering, and reset_index closes the gaps left by the earlier row filters.
# NOTE(review): customer_id has already been dropped at this point, so two
# different customers with identical attribute values are treated as
# duplicates and collapsed into one row. Confirm this is intended.
df = df.drop_duplicates().reset_index(drop=True)

**5. Feature Engineering**

In [18]:
# Engagement feature: watch_hours divided by monthly_fee, rounded to 2 decimals
# NOTE(review): as computed this is hours watched per unit of fee, which the
# name 'value_per_hour' may not convey. Confirm the intended direction.
hours_per_fee = df['watch_hours'].div(df['monthly_fee'])
df['value_per_hour'] = hours_per_fee.round(2)

In [19]:
# Activity status from days since last login:
#   <= 7 -> 'Active', <= 30 -> 'Intermittent', anything else -> 'Dormant'
days_since_login = df['last_login_days']
# np.select takes the first matching condition, mirroring the nested where()
df['activity_status'] = np.select(
    [days_since_login <= 7, days_since_login <= 30],
    ['Active', 'Intermittent'],
    default='Dormant',
)

In [20]:
# Age group: bucket age into right-inclusive bins (17,25], (25,35], (35,50], (50,100]
age_bin_edges = [17, 25, 35, 50, 100]
age_bin_labels = ['18-25', '26-35', '36-50', '51+']
df['age_group'] = pd.cut(df['age'], bins=age_bin_edges, labels=age_bin_labels)

In [21]:
# Write the cleaned frame to disk without the index column, then confirm
cleaned_csv_path = 'netflix_customer_churn_cleaned.csv'
df.to_csv(cleaned_csv_path, index=False)
print(f"Cleaned data saved to '{cleaned_csv_path}'")

Cleaned data saved to 'netflix_customer_churn_cleaned.csv'
