In [None]:
# Import necessary libraries for data manipulation
import pandas as pd
import numpy as np

# Universal Data Cleaning & Preparation Template

This Jupyter Notebook provides a step-by-step template for cleaning and preparing any tabular dataset for machine learning. Follow the cells in order to handle missing values, convert data types, and encode categorical features for optimal model performance.

In [None]:
# Import necessary libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
import joblib

### Step 1: Load Your Data

In the cell below, replace `'your_dataset.csv'` with the path to your own data file. Make sure the data file is in the same folder as this notebook for the easiest setup.

## 1. Data Loading and Exploration

In [None]:
# --- Configuration: Please set your file path here ---
file_path = 'your_dataset.csv'

# --- Data Loading ---
# Read the CSV at `file_path` into `df`. A missing file is reported with a
# short message instead of a traceback; in that case `df` is NOT created, so
# the cells below cannot run until the path is corrected.
try:
    df = pd.read_csv(file_path)
except FileNotFoundError:
    print(f"ERROR: The file '{file_path}' was not found.")
    print("Please make sure your data file is in the same directory as this notebook and the file_path variable is set correctly.")
else:
    print("Data loaded successfully!")
    # Preview the first 5 rows as a quick sanity check
    print(df.head())

### Step 2: Fix Incorrect Data Types

This step addresses a common issue where a numeric column is incorrectly read as text (`object`). The code below will convert a specified column to a numeric format. Any values that cannot be converted will be marked as missing data (`NaN`), which we will handle in the next step.

**Action Required:** In the code cell below, update the `column_to_fix` variable with the name of the column from your dataset that you need to convert.

In [None]:
# --- Configuration ---
# The user of this template should change this column name.
# We are using 'TotalCharges' because it exists in our current test data.
column_to_fix = 'TotalCharges'

# --- Execution ---
# Convert `column_to_fix` from text to a numeric dtype. Entries that cannot be
# parsed become NaN (errors='coerce') and are counted so the user can see how
# many values were lost. The dtype is classified with pandas' own helpers so
# that:
#   * text stored as pandas string dtype (not only `object`) is converted, and
#   * non-text, non-numeric columns (e.g. datetime, category) are no longer
#     reported as "already numeric".
print(f"Attempting to convert column: '{column_to_fix}'...")
if column_to_fix not in df.columns:
    print(f"INFO: Column '{column_to_fix}' not found in the DataFrame. Skipping this step.")
elif pd.api.types.is_numeric_dtype(df[column_to_fix]):
    print(f"INFO: Column '{column_to_fix}' is already a numeric type. No action needed.")
elif pd.api.types.is_object_dtype(df[column_to_fix]) or pd.api.types.is_string_dtype(df[column_to_fix]):
    initial_missing = df[column_to_fix].isnull().sum()
    df[column_to_fix] = pd.to_numeric(df[column_to_fix], errors='coerce')
    new_missing = df[column_to_fix].isnull().sum()
    print(f"SUCCESS: Column '{column_to_fix}' was converted to a numeric data type.")
    if new_missing > initial_missing:
        print(f"INFO: {new_missing - initial_missing} new missing values (NaN) were created during conversion.")
else:
    # Neither text nor numeric: converting could silently change the meaning
    # of the values, so leave the column alone and tell the user why.
    print(f"INFO: Column '{column_to_fix}' has dtype '{df[column_to_fix].dtype}', which is neither text nor numeric. Skipping this step.")

### Step 3: Handle Missing Values

Most machine learning models cannot work with missing values (NaN). We need to either drop the rows containing them or fill them with a meaningful value (a process called imputation).

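The code below will automatically find all numeric columns that have missing values and fill them with the **mean** (average) of that column. This is a common and simple strategy; note that the mean is sensitive to outliers, so for heavily skewed columns the median may be a better choice.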

In [None]:
# This cell finds all numeric columns with missing values and fills them with that column's mean.
# Columns that contain ONLY missing values have no mean; they are reported and left unchanged
# instead of being announced as "filled".

print("--- Missing Values Before Filling ---")
# Let's see which columns have missing values and how many
missing_values = df.isnull().sum()
print(missing_values[missing_values > 0])

# Impute missing values for numeric columns
for col in df.select_dtypes(include=np.number).columns:
    if not df[col].isnull().any():
        continue  # nothing to fill in this column

    mean_value = df[col].mean()
    if pd.isna(mean_value):
        # Every entry is NaN, so fillna(mean) would be a no-op; say so explicitly.
        print(f"\nWARNING: Numeric column '{col}' contains only missing values, so it has no mean and was left unchanged.")
        continue

    df[col] = df[col].fillna(mean_value)
    print(f"\nINFO: Missing values in numeric column '{col}' were filled with its mean: {mean_value:.2f}")

print("\n--- Missing Values After Filling ---")
# Show whatever is still missing (non-numeric columns are not touched by this cell)
missing_values_after = df.isnull().sum()
print(missing_values_after[missing_values_after > 0])

### Step 4: Encode Categorical Data

Finally, we need to convert all text-based columns into numbers for the model. We will use two techniques:
1.  **Binary Encoding:** For columns with only two unique values (e.g., 'Yes'/'No'). These will be mapped to `0` and `1`.
2.  **One-Hot Encoding:** For columns with more than two categories. This creates new columns for each category with `0` or `1` values.

**Action Required:** In the code cell below, update the `target_column` and `binary_cols_to_map` variables to match your dataset's column names.

In [None]:
# --- Configuration ---
# The column you want to predict.
target_column = 'Churn'

# List of columns that have only two text values (e.g., 'Yes'/'No', 'Male'/'Female').
binary_cols_to_map = ['gender', 'Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']


# --- Helpers ---
def _is_text_column(series):
    """Return True when `series` stores text (object dtype or a pandas string dtype)."""
    return pd.api.types.is_object_dtype(series) or pd.api.types.is_string_dtype(series)


def _encode_binary_column(frame, col, label):
    """Encode a two-category text column of `frame` as 0/1, in place.

    The two categories are sorted (by their string form) before codes are
    assigned, so the mapping does not depend on which value happens to appear
    first in the data (e.g. 'No' -> 0 and 'Yes' -> 1 regardless of row order).
    Missing values are ignored when counting categories and stay NaN after
    the mapping.

    Parameters:
        frame: DataFrame to modify.
        col:   Name of the column to encode.
        label: Text used in the printed message ("Binary column" / "Target column").

    Returns:
        The mapping dict that was applied, or None if the column is absent,
        is not text, or does not have exactly two distinct non-missing values.
    """
    if col not in frame.columns or not _is_text_column(frame[col]):
        return None

    categories = sorted(frame[col].dropna().unique(), key=str)
    if len(categories) != 2:
        return None

    mapping = {categories[0]: 0, categories[1]: 1}
    n_missing = int(frame[col].isna().sum())
    frame[col] = frame[col].map(mapping)
    print(f"SUCCESS: {label} '{col}' was encoded using mapping: {mapping}")
    if n_missing:
        print(f"INFO: {n_missing} missing values in '{col}' were left as NaN.")
    return mapping


# --- Execution ---
# Work on a copy to be safe and keep the original df for reference.
df_final = df.copy()

print("--- Starting Categorical Encoding ---")

# Encode the configured binary feature columns
for col in binary_cols_to_map:
    _encode_binary_column(df_final, col, "Binary column")

# Encode the target column (handled separately so its message is distinct)
_encode_binary_column(df_final, target_column, "Target column")

# One-Hot Encode all remaining text columns
# These are columns with more than 2 text categories
text_mask = np.array([_is_text_column(df_final[name]) for name in df_final.columns], dtype=bool)
object_cols_to_encode = df_final.columns[text_mask]
if len(object_cols_to_encode) > 0:
    df_final = pd.get_dummies(df_final, columns=object_cols_to_encode, drop_first=True)
    print(f"\nSUCCESS: The following columns were One-Hot Encoded: {list(object_cols_to_encode)}")
else:
    print("\nINFO: No multi-category columns found to One-Hot Encode.")

print("\n--- Encoding Complete ---")
print("Displaying the first 5 rows of the final, fully numeric DataFrame:")
df_final.head()

In [None]:
# Load the raw dataset for the churn workflow below. `churn_df` is not created
# by any earlier cell, so without this line the notebook fails on a fresh
# kernel (Restart & Run All). The raw file is re-read, rather than reusing the
# template's `df`, because the next section performs its own 'TotalCharges'
# conversion and row dropping starting from unmodified data.
churn_df = pd.read_csv(file_path)

# Get a summary of the DataFrame
churn_df.info()

## 2. Data Preprocessing and Cleaning

In [None]:
# Coerce 'TotalCharges' to numbers (unparseable entries become NaN), then
# discard every row that still contains a missing value.
churn_df['TotalCharges'] = pd.to_numeric(churn_df['TotalCharges'], errors='coerce')
churn_df.dropna(inplace=True)

# Map the target and the two-valued features to 0/1 using one lookup table
# (column name -> value mapping), applied in a single pass.
binary_cols = ['Partner', 'Dependents', 'PhoneService', 'PaperlessBilling']
yes_no_codes = {'No': 0, 'Yes': 1}
value_codes = {
    'Churn': yes_no_codes,
    'gender': {'Female': 0, 'Male': 1},
    **{name: yes_no_codes for name in binary_cols},
}
for col, codes in value_codes.items():
    churn_df[col] = churn_df[col].map(codes)

# One-Hot Encode every remaining text column except the identifier, then
# remove the identifier column itself from the final frame.
multi_cat_cols = churn_df.select_dtypes(include=['object']).columns.drop(['customerID'])
churn_df_final = pd.get_dummies(
    churn_df, columns=multi_cat_cols, drop_first=True
).drop(columns='customerID')

### Final Data Check
As a final preprocessing step, we clean the column names for API compatibility and check the data types to ensure everything is numeric.

In [None]:
# Replace spaces in every column name with underscores
churn_df_final = churn_df_final.rename(columns=lambda name: name.replace(' ', '_'))

# Print the dtype summary so any remaining non-numeric column is easy to spot
print("Final Data Types:")
churn_df_final.info()

## 3. Model Building and Training

In [None]:
# Features are every column except the target; the target is 'Churn'
y = churn_df_final['Churn']
X = churn_df_final.drop(columns='Churn')

# Hold out 25% of the rows for testing (fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Learn the scaling parameters from the training split only, then apply the
# same transformation to both splits
scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (scaler.transform(part) for part in (X_train, X_test))

# Fit the Logistic Regression model on the scaled training data
log_model = LogisticRegression(max_iter=1000).fit(X_train_scaled, y_train)
log_model

## 4. Model Evaluation

In [None]:
# Predict on the held-out split once; used for the confusion matrix below
y_pred = log_model.predict(X_test_scaled)

# Mean accuracy on the test split
accuracy = log_model.score(X_test_scaled, y_test)
print(f"Model Accuracy: {accuracy:.4f}")

# Confusion matrix of true vs. predicted labels
cm = confusion_matrix(y_test, y_pred)
print("\nConfusion Matrix:", cm, sep="\n")

## 5. Save Model and Scaler
The final trained model and the scaler are saved to disk for use in the API.

In [None]:
# Persist the fitted model and scaler next to the notebook, one file each
for artifact, filename in ((log_model, 'churn_model.pkl'), (scaler, 'scaler.pkl')):
    joblib.dump(artifact, filename)

print("Model and Scaler saved successfully.")