In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.impute import SimpleImputer

In [None]:
# Task 1: Data Preparation

In [None]:
# Read the CSV file into the working DataFrame `df` that the later cells use.
# The path is relative to the notebook's working directory.
df = pd.read_csv(filepath_or_buffer='TCC_Dataset  (1).csv')

In [None]:
# Report the number of missing entries in each column of the freshly loaded frame.
print("Missing values:\n", df.isna().sum())

In [None]:
# Parse 'TotalCharges' as numbers. With errors='coerce', entries that cannot be
# parsed become NaN instead of raising.
df['TotalCharges'] = df['TotalCharges'].pipe(pd.to_numeric, errors='coerce')

In [None]:
# Fill gaps in every float64/int64 column with that column's median.
# NOTE(review): the medians are learned from the whole frame, i.e. before the
# train/test split performed later in the notebook — confirm this is acceptable.
numerical_cols = df.select_dtypes(include=['float64', 'int64']).columns
imputer = SimpleImputer(strategy='median').fit(df[numerical_cols])
df[numerical_cols] = imputer.transform(df[numerical_cols])

In [None]:
# Remove 'customerID' in place; it is not used as a predictor.
# errors='ignore' makes this a no-op when the column is absent.
df.drop(columns='customerID', errors='ignore', inplace=True)

In [None]:
# Collect the names of all object-dtype columns; the encoding cell treats
# these as the categorical variables.
categorical_cols = df.select_dtypes(include='object').columns

In [None]:
# Fit one LabelEncoder per categorical column and keep each fitted encoder,
# keyed by column name.
label_encoders = {col: LabelEncoder().fit(df[col]) for col in categorical_cols}

# Overwrite every categorical column with its integer codes.
for col, le in label_encoders.items():
    df[col] = le.transform(df[col])

In [None]:
# Sanity check: per-column missing-value counts after the cleaning cells.
print("\nMissing values after imputation:\n", df.isna().sum())

In [None]:
# Task 2: Split Data for Training and Testing

In [None]:
# Separate the target column 'Churn' (y) from all remaining columns (X).
y = df['Churn']
X = df.drop(columns=['Churn'])

In [None]:
# Hold out 20% of the rows for testing. The split is stratified on y and uses
# a fixed random_state so it is repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [None]:
# Standardize the continuous columns. The scaler is fitted on the training
# rows only and then applied unchanged to the test rows, so no test-set
# statistics are used during fitting.
#
# Work on explicit copies: X_train / X_test are the frames returned by
# train_test_split, and assigning columns into them directly can trigger
# pandas' SettingWithCopyWarning on some pandas / scikit-learn combinations.
X_train = X_train.copy()
X_test = X_test.copy()

scaler = StandardScaler()
numerical_cols = ['tenure', 'MonthlyCharges', 'TotalCharges']
X_train[numerical_cols] = scaler.fit_transform(X_train[numerical_cols])
X_test[numerical_cols] = scaler.transform(X_test[numerical_cols])

In [None]:
# Double-check that no NaN values are present in the train / test features.
# DataFrame.isna() is used rather than np.isnan so the check also works when a
# non-numeric (object) column is present, where np.isnan would raise TypeError.
print("\nNaN in X_train:", X_train.isna().any().any())
print("NaN in X_test:", X_test.isna().any().any())

In [None]:
# Task 3: Feature Selection

In [None]:

# Score features with the ANOVA F-test against the target and keep the top 10.
# The selector is fitted on the training split only; the same selection is
# then applied to both splits.
selector = SelectKBest(score_func=f_classif, k=10).fit(X_train, y_train)
X_train_selected = selector.transform(X_train)
X_test_selected = selector.transform(X_test)

In [None]:
# Translate the selector's boolean support mask back into column names.
selected_features = [
    name for name, kept in zip(X_train.columns, selector.get_support()) if kept
]
print("Selected features:", selected_features)

In [None]:
# Task 4: Model Selection

In [None]:
# Candidate binary classifiers, keyed by display name. Every estimator is
# constructed with the same random_state.
models = dict([
    ('Logistic Regression', LogisticRegression(random_state=42)),
    ('Decision Tree', DecisionTreeClassifier(random_state=42)),
    ('Random Forest', RandomForestClassifier(random_state=42)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=42)),
])


In [None]:
# List the candidate models by name, together with the number of features
# kept by the selection step.
print("\nModel Selection:")
for name, model in models.items():
    print(
        f"- {name}: Suitable for binary classification, "
        f"handles {len(selected_features)} features well."
    )