In [1]:
import pandas as pd
import numpy as np

# Load dataset
df = pd.read_csv('telecom_customer_churn_cleaned.csv')

# View summary
print(df.info())
print(df.head())

# Drop irrelevant columns (example: ID, customer name, etc. ‚Äî adjust as needed)
irrelevant_cols = ['Customer ID', 'Name', 'Unnamed: 0']  # change according to your dataset
df = df.drop(columns=[col for col in irrelevant_cols if col in df.columns], errors='ignore')

# Remove duplicate rows
df = df.drop_duplicates()

# Handle NaN values
# Option 1: Drop rows with too many NaNs
df = df.dropna(thresh=len(df.columns) - 2)  # keeps rows with at least n-2 non-NaN values

# Option 2: Fill remaining NaNs
df = df.fillna(df.median(numeric_only=True))  # numeric columns
df = df.fillna(df.mode().iloc[0])  # categorical columns

# Verify cleaning
print("Remaining NaN values per column:\n", df.isna().sum())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6923 entries, 0 to 6922
Data columns (total 38 columns):
 #   Column                             Non-Null Count  Dtype  
---  ------                             --------------  -----  
 0   Customer ID                        6923 non-null   object 
 1   Gender                             6923 non-null   object 
 2   Age                                6923 non-null   int64  
 3   Married                            6923 non-null   object 
 4   Number of Dependents               6923 non-null   int64  
 5   City                               6923 non-null   object 
 6   Zip Code                           6923 non-null   int64  
 7   Latitude                           6923 non-null   float64
 8   Longitude                          6923 non-null   float64
 9   Number of Referrals                6923 non-null   int64  
 10  Tenure in Months                   6923 non-null   int64  
 11  Offer                              3123 non-null   objec

In [3]:
# === Week 5: Supervised Learning ‚Äì Regression ===
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_absolute_error, mean_squared_error

# Load dataset
df = pd.read_csv('telecom_customer_churn_cleaned.csv')

# Show columns for reference
print("Columns in dataset:\n", df.columns.tolist())

# Select numeric columns only
df_numeric = df.select_dtypes(include=[np.number]).dropna()

# Try to find the correct target column automatically
target_candidates = ['Monthly Charges', 'MonthlyCharges', 'Total Charges', 'TotalCharges', 'Monthly_Fee']
target_col = None
for col in target_candidates:
    if col in df_numeric.columns:
        target_col = col
        break

if not target_col:
    raise KeyError("‚ö†Ô∏è Could not find a numeric target column (e.g., Monthly Charges or Total Charges). "
                   "Please check your dataset column names.")

print(f"\n‚úÖ Using '{target_col}' as the target variable.\n")

# Define X (features) and y (target)
X = df_numeric.drop(columns=[target_col])
y = df_numeric[target_col]

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Train Linear Regression model
lr = LinearRegression()
lr.fit(X_train, y_train)

# Predict
y_pred = lr.predict(X_test)

# Evaluate with MAE and RMSE
mae = mean_absolute_error(y_test, y_pred)
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

print(f"Mean Absolute Error (MAE): {mae:.2f}")
print(f"Root Mean Squared Error (RMSE): {rmse:.2f}")

# Optional: compare actual vs predicted
comparison = pd.DataFrame({'Actual': y_test.values, 'Predicted': y_pred})
print("\nSample predictions:\n", comparison.head())


Columns in dataset:
 ['Customer ID', 'Gender', 'Age', 'Married', 'Number of Dependents', 'City', 'Zip Code', 'Latitude', 'Longitude', 'Number of Referrals', 'Tenure in Months', 'Offer', 'Phone Service', 'Avg Monthly Long Distance Charges', 'Multiple Lines', 'Internet Service', 'Internet Type', 'Avg Monthly GB Download', 'Online Security', 'Online Backup', 'Device Protection Plan', 'Premium Tech Support', 'Streaming TV', 'Streaming Movies', 'Streaming Music', 'Unlimited Data', 'Contract', 'Paperless Billing', 'Payment Method', 'Monthly Charge', 'Total Charges', 'Total Refunds', 'Total Extra Data Charges', 'Total Long Distance Charges', 'Total Revenue', 'Customer Status', 'Churn Category', 'Churn Reason']

‚úÖ Using 'Total Charges' as the target variable.

Mean Absolute Error (MAE): 0.00
Root Mean Squared Error (RMSE): 0.00

Sample predictions:
     Actual  Predicted
0  4085.75    4085.75
1   747.20     747.20
2    48.45      48.45
3  5894.50    5894.50
4  1415.00    1415.00


In [5]:
# Week 6: Supervised Learning ‚Äì Classification
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# Load dataset
df = pd.read_csv('telecom_customer_churn_cleaned.csv')

# Drop irrelevant columns (if exist)
irrelevant_cols = ['Customer ID', 'Name', 'Unnamed: 0']
df = df.drop(columns=[col for col in irrelevant_cols if col in df.columns], errors='ignore')

# Identify target column automatically
target_candidates = ['Customer Status', 'Churn', 'Exited', 'Target']
target_col = None
for col in target_candidates:
    if col in df.columns:
        target_col = col
        break

if not target_col:
    raise KeyError("‚ö†Ô∏è Could not find churn/target column. Please verify your dataset.")

print(f"‚úÖ Using '{target_col}' as target column.\n")

# Encode target variable (e.g., Churned=1, Stayed=0)
le = LabelEncoder()
df[target_col] = le.fit_transform(df[target_col])

# Convert categorical columns into numeric using one-hot encoding
X = df.drop(columns=[target_col])
X = pd.get_dummies(X, drop_first=True)
y = df[target_col]

# Handle missing values
X = X.fillna(X.median(numeric_only=True))
X = X.fillna(X.mode().iloc[0])

# Split data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train, y_train)
log_pred = log_model.predict(X_test)

# Random Forest
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train, y_train)
rf_pred = rf_model.predict(X_test)

# Evaluate accuracies
log_acc = accuracy_score(y_test, log_pred)
rf_acc = accuracy_score(y_test, rf_pred)

print(f"Logistic Regression Accuracy: {log_acc:.3f}")
print(f"Random Forest Accuracy: {rf_acc:.3f}")

best_model = "Random Forest" if rf_acc > log_acc else "Logistic Regression"
print(f"\nüèÜ Best Model: {best_model}")


‚úÖ Using 'Customer Status' as target column.



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Logistic Regression Accuracy: 0.788
Random Forest Accuracy: 0.973

üèÜ Best Model: Random Forest
