In [2]:
import pandas as pd
import numpy as np

# ------------------------------
# Step 1: Load raw dataset
# ------------------------------
df = pd.read_csv(r"C:\Users\OJ 001\Downloads\customer_subscriptions_dirty.csv")
print("✅ Raw dataset loaded")

# Drop duplicates
df.drop_duplicates(inplace=True)

# ------------------------------
# Step 2: Handle missing values
# ------------------------------
# Categorical columns: fill missing with 'Unknown'
cat_cols = df.select_dtypes(include='object').columns
df[cat_cols] = df[cat_cols].fillna('Unknown')

# Numeric columns: fill missing with median
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# ------------------------------
# Step 3: Fix inconsistencies in categorical columns
# ------------------------------
# Gender
df['gender'] = df['gender'].replace({
    'M':'Male','m':'Male','male':'Male',
    'F':'Female','f':'Female','female':'Female',
    'Other':'Unknown','y':'yes','n':'no'
})

# Has Internet
df['has_internet'] = df['has_internet'].replace({
    'TRUE':'Yes','1':'Yes','0':'No','FALSE':'No','Y':'Yes','N':'No'
})

# Dependents
df['dependents'] = df['dependents'].replace({
    'yes':'Yes','no':'No'
})

# Contract
df['contract'] = df['contract'].str.lower().replace({
    '2-yr':'two year','1-yr':'one year'
})

# Payment method
df['payment_method'] = df['payment_method'].replace({
    'Mpesa':'M-pesa'
})

# City
df['city'] = df['city'].replace({
    'Niarobi':'Nairobi','Nrb':'Nairobi'
})

# Fill any remaining missing categorical values
cat_cols = ['gender','has_internet','dependents','contract','payment_method','city','plan_type']
df[cat_cols] = df[cat_cols].fillna('Unknown')

# ------------------------------
# Step 4: Convert binary columns to 0/1
# ------------------------------
binary_cols = ['has_internet','dependents']

for col in binary_cols:
    df[col] = df[col].astype(str).str.strip().str.lower()
    df[col] = df[col].replace({
        'yes':1,'no':0,'true':1,'false':0,'y':1,'n':0,'1':1,'0':0
    })
    df[col] = pd.to_numeric(df[col], errors='coerce')

print("✅ Binary columns converted:")
print(df[binary_cols].head(10))

# ------------------------------
# Step 5: Clean numeric columns
# ------------------------------
num_cols = ['age','tenure_months','monthly_charges','total_charges']

for col in num_cols:
    df[col] = df[col].astype(str).str.replace(',', '')  # remove commas
    df[col] = pd.to_numeric(df[col], errors='coerce')   # convert to numeric

df[num_cols] = df[num_cols].fillna(df[num_cols].median())

# ------------------------------
# Step 6: One-hot encode multi-class categorical columns
# ------------------------------
multi_cat_cols = ['gender','city','plan_type','payment_method','contract']
df_encoded = pd.get_dummies(df, columns=multi_cat_cols, drop_first=True)

# ------------------------------
# Step 7: Drop identifier or non-informative columns
# ------------------------------
# Example: customer IDs (replace with actual column names)
id_cols = ['customer_id'] if 'customer_id' in df_encoded.columns else []
df_encoded.drop(columns=id_cols, inplace=True)
print("✅ Dropped identifier columns:", id_cols)

# ------------------------------
# Step 8: Ensure boolean columns are numeric
# ------------------------------
bool_cols = df_encoded.select_dtypes(include='bool').columns
for col in bool_cols:
    df_encoded[col] = df_encoded[col].astype(int)

# ------------------------------
# Step 9: Save final numeric dataset
# ------------------------------
df_encoded.to_csv("telecom_ml_ready.csv", index=False)
print("✅ Fully numeric ML-ready dataset saved as telecom_ml_ready.csv")


✅ Raw dataset loaded
✅ Binary columns converted:
   has_internet  dependents
0             0         0.0
1             1         0.0
2             1         0.0
3             1         0.0
4             1         0.0
5             0         1.0
6             0         0.0
7             1         1.0
8             0         0.0
9             1         0.0
✅ Dropped identifier columns: ['customer_id']
✅ Fully numeric ML-ready dataset saved as telecom_ml_ready.csv


  df[col] = df[col].replace({
