In [1]:
import pandas as pd
import numpy as np
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Load the training split from a CSV located relative to the working directory.
df = pd.read_csv(filepath_or_buffer='TRAIN DATA.csv')
df.head(n=5)

FileNotFoundError: [Errno 2] No such file or directory: 'TRAIN DATA.csv'

In [None]:
# DataFrame.info() prints its summary and returns None, so wrapping it in
# pd.DataFrame(...) only rendered an extra empty frame below the summary.
df.info()

## **Checking null values**

In [None]:
# Number of missing entries in each column (isna is the canonical name of isnull).
df.isna().sum()

## **Checking duplicate values**

In [None]:
# Count rows that repeat an earlier row exactly (the first occurrence is not counted).
df.duplicated(keep='first').sum()

## **Changing data type from object to float type**

In [None]:
# Parse TotalCharges as numbers; entries that cannot be parsed become NaN
# (errors='coerce'), and the result is stored as float64.
df['TotalCharges'] = (
    pd.to_numeric(df['TotalCharges'], errors='coerce')
    .astype('float64')
)

In [None]:
# How many TotalCharges entries are missing after the numeric conversion.
df['TotalCharges'].isna().sum()

Some values could not be converted to float, so `pd.to_numeric(..., errors='coerce')` replaced them with null (NaN) values.

In [None]:
# Drop every row that contains a missing value in any column.
# (Original note: rows are dropped because only 8 values were null.)
df = df.dropna(axis=0, how='any')

In [None]:
# Re-check: how many TotalCharges entries are still missing after the drop.
df['TotalCharges'].isna().sum()

## **These columns have more than two values**

In [None]:
# Categorical columns that are label-encoded in the following cells.
lst = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

In [None]:
# Take an explicit copy: the columns of lst_3 are overwritten in a later cell,
# and assigning into a bare slice of df triggers pandas' SettingWithCopyWarning
# (silenced by the warnings filter at the top of the notebook).
lst_3 = df[lst].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Replace each listed categorical column with integer codes; a fresh encoder is
# fitted per column on the training values.
for feature in lst:
    encoder = LabelEncoder()
    lst_3[feature] = encoder.fit_transform(df[feature])


**Using LabelEncoder, I converted these columns into numerical columns**

In [None]:
# Preview the first five rows of the label-encoded columns.
lst_3.head(n=5)

## **These columns have only two values**

In [None]:
# Categorical columns that are one-hot encoded (with drop_first) in the next cell.
val_2 = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
]

In [None]:
# One-hot encode the val_2 columns; drop_first=True discards the first dummy
# level of every column.
df_encoded = pd.get_dummies(data=df[val_2], drop_first=True)

In [None]:
# Turn every dummy flag into an integer with a lambda: truthy -> 1, falsy -> 0.
flag_to_int = lambda flag: 1 if flag else 0
for dummy_name in list(df_encoded.columns):
    df_encoded[dummy_name] = df_encoded[dummy_name].map(flag_to_int)

# Rename the dummy columns back to the source column names; this requires the
# number of dummy columns to equal len(val_2).
df_encoded.columns = val_2

**Using a lambda function, assigning the value 1 to True and 0 to False**

In [None]:
# Preview the first five rows of the 0/1 encoded columns.
df_encoded.head(n=5)

## **These columns have very large values**
- Using StandardScaler to rescale each column to zero mean and unit variance; if the values are roughly normally distributed, most of them will fall between -3 and +3

In [None]:
from sklearn.preprocessing import StandardScaler

# Standardise the three numeric training columns and wrap the resulting array
# back into a DataFrame that shares df's index.
num_col = df.loc[:, ['tenure', 'MonthlyCharges', 'TotalCharges']]
sl = StandardScaler().fit_transform(num_col)
num_df = pd.DataFrame(
    data=sl,
    index=df.index,
    columns=['tenure', 'MonthlyCharges', 'TotalCharges'],
)
num_df.head(n=5)

## **Concatenating all values**

In [None]:
# Assemble the training feature matrix column-wise: label-encoded columns,
# 0/1 encoded columns, SeniorCitizen as-is, and the scaled numeric columns.
x_col = pd.concat(
    objs=[lst_3, df_encoded, df['SeniorCitizen'], num_df],
    axis='columns',
)
x_col

In [None]:
# Pairwise Pearson correlation between all feature columns.
corr = x_col.corr(method='pearson')

## **Plotting heatmap to see correlation**

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Draw the correlation matrix as an annotated heatmap on an explicit Axes.
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(data=corr, annot=True, fmt='.2f', cmap='coolwarm', ax=ax)
ax.set_title('Heatmap: Pearson correlation', color='Orange', fontsize=16)
plt.show()

# Using random undersampling to reduce class-imbalance bias

In [None]:
from imblearn.under_sampling import RandomUnderSampler

# Resample the features and the 'churned' target with RandomUnderSampler.
# A fixed random_state makes the resampled rows identical on every run;
# without it each execution of this cell produced a different sample.
rx, ry = RandomUnderSampler(random_state=42).fit_resample(x_col, df['churned'])

# Show a preview with rich display instead of printing the whole frame as text.
rx.head()

In [None]:
# Absolute number of rows per class in the resampled target.
ry.value_counts(normalize=False)

# IMPORT TEST DATASET

In [None]:
# Load the test split from a CSV located relative to the working directory.
data = pd.read_csv(filepath_or_buffer='TEST DATA.csv')
data.head(n=5)

# CLEANING TEST DATASET

In [None]:
# Parse the test-set TotalCharges as numbers; entries that cannot be parsed
# become NaN (errors='coerce'), and the result is stored as float64.
data['TotalCharges'] = (
    pd.to_numeric(data['TotalCharges'], errors='coerce')
    .astype('float64')
)

In [None]:
# How many test-set TotalCharges entries are missing after the conversion.
data['TotalCharges'].isna().sum()

In [None]:
# Drop every test row that contains a missing value in any column.
# NOTE(review): the original comment said "only 8 values were null" — confirm
# that count against the test set; it matches the training-cell comment verbatim.
data = data.dropna(axis=0, how='any')

In [None]:
# Categorical test-set columns that are label-encoded in the following cells.
k = [
    'MultipleLines',
    'InternetService',
    'OnlineSecurity',
    'OnlineBackup',
    'DeviceProtection',
    'TechSupport',
    'StreamingTV',
    'StreamingMovies',
    'Contract',
    'PaymentMethod',
]

In [None]:
# Take an explicit copy: the columns of lst_4 are overwritten in a later cell,
# and assigning into a bare slice of data triggers pandas' SettingWithCopyWarning
# (silenced by the warnings filter at the top of the notebook).
lst_4 = data[k].copy()

In [None]:
from sklearn.preprocessing import LabelEncoder

# Fit each encoder on the TRAINING column (df) and only transform the test
# column, so a category maps to the same integer in both sets. Fitting a new
# encoder on the test data would assign different codes whenever the test split
# is missing a category or contains an extra one.
# transform() raises ValueError if the test column holds a label that never
# occurs in the training column.
for col in k:
    lst_4[col] = LabelEncoder().fit(df[col]).transform(data[col])

In [None]:
# Categorical test-set columns that are one-hot encoded (with drop_first) next.
val_3 = [
    'gender',
    'Partner',
    'Dependents',
    'PhoneService',
    'PaperlessBilling',
]

In [None]:
# One-hot encode the val_3 columns of the test set; drop_first=True discards
# the first dummy level of every column.
data_encoded = pd.get_dummies(data=data[val_3], drop_first=True)

In [None]:
# Turn every dummy flag into an integer with a lambda: truthy -> 1, falsy -> 0.
flag_to_int = lambda flag: 1 if flag else 0
for dummy_name in list(data_encoded.columns):
    data_encoded[dummy_name] = data_encoded[dummy_name].map(flag_to_int)

# Rename the dummy columns back to the source column names; this requires the
# number of dummy columns to equal len(val_3).
data_encoded.columns = val_3

In [None]:
from sklearn.preprocessing import StandardScaler
num_col = data[['tenure', 'MonthlyCharges', 'TotalCharges']]

# Scale the test columns with the mean/std learned from the TRAINING frame (df).
# Re-fitting the scaler on the test data would put train and test features on
# different scales, and the models below were fitted on train-scaled values.
train_scaler = StandardScaler().fit(df[['tenure', 'MonthlyCharges', 'TotalCharges']])
sl = train_scaler.transform(num_col)
num_data = pd.DataFrame(sl, columns=['tenure', 'MonthlyCharges', 'TotalCharges'], index=data.index)

In [None]:
# Assemble the test feature matrix column-wise: label-encoded columns,
# 0/1 encoded columns, SeniorCitizen as-is, and the scaled numeric columns.
x_test_col = pd.concat(
    objs=[lst_4, data_encoded, data['SeniorCitizen'], num_data],
    axis='columns',
)
x_test_col

# LOGISTIC REGRESSION MODEL

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
# Hold out a stratified validation split so the reported metrics come from rows
# the model has not seen; scoring on the fitting data overstates performance.
X_train_log, X_val_log, y_train_log, y_val_log = train_test_split(
    x_col, df['churned'], test_size=0.2, stratify=df['churned'], random_state=42
)

# max_iter is raised above the default: warnings are globally silenced at the
# top of the notebook, so a non-converged fit would otherwise go unnoticed.
log_model = LogisticRegression(max_iter=1000)
log_model.fit(X_train_log, y_train_log)

y_pred_log = log_model.predict(X_val_log)

print("🔹 Logistic Regression Accuracy:", accuracy_score(y_val_log, y_pred_log))
print(classification_report(y_val_log, y_pred_log))

# Refit on every labelled row so later cells that use log_model (test-set
# probabilities, coefficient plot) work with a model trained on all the data.
log_model.fit(x_col, df['churned']);

# RandomForest MODEL

In [None]:
# Hold out a stratified validation split so the reported metrics come from rows
# the forest has not seen; a random forest scored on its own fitting data
# reports near-perfect accuracy regardless of how well it generalises.
X_train_rf, X_val_rf, y_train_rf, y_val_rf = train_test_split(
    x_col, df['churned'], test_size=0.2, stratify=df['churned'], random_state=42
)

rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train_rf, y_train_rf)

y_pred_rf = rf_model.predict(X_val_rf)

print("Random Forest Accuracy:", accuracy_score(y_val_rf, y_pred_rf))

print(classification_report(y_val_rf, y_pred_rf))

# Refit on every labelled row so later cells that use rf_model (test-set
# probabilities) work with a model trained on all the data.
rf_model.fit(x_col, df['churned']);

# PREDICTING PROBABILITIES FOR ROC-AUC CURVE

In [None]:
import matplotlib.pyplot as plt
from sklearn.metrics import roc_curve, auc

# Class-probability estimates for the test features from both fitted models
# (logistic regression first, then random forest).
y_prob_log, y_prob_rf = (
    fitted.predict_proba(x_test_col) for fitted in (log_model, rf_model)
)
y_prob_rf

# Important Coefficients from Logistic Regression Model

In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

# Bar chart of the logistic-regression coefficients, largest magnitude on top.
# Uses the fitted log_model and the training feature frame x_col.

# Feature names and their coefficients
feature_names = x_col.columns
coefficients = log_model.coef_[0]  # Coefficients for the positive class

# Tabulate name, signed coefficient and magnitude, then order by magnitude
coef_df = pd.DataFrame({'Feature': feature_names, 'Coefficient': coefficients})
coef_df['AbsCoefficient'] = np.abs(coefficients)
coef_df = coef_df.sort_values(by='AbsCoefficient', ascending=False)

# Horizontal bars with a dashed reference line at zero
fig, ax = plt.subplots(figsize=(8, 5))
ax.barh(coef_df['Feature'], coef_df['Coefficient'], color='skyblue')
ax.axvline(0, color='gray', linestyle='--')
ax.set_xlabel('Coefficient Value')
ax.set_title('Important Coefficients from Logistic Regression Model')
ax.invert_yaxis()  # first row of coef_df (largest magnitude) is drawn at the top
ax.grid(True, axis='x')
fig.tight_layout()
plt.show()