In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
# Load the raw CSV and discard the customer_id column in a single chained step.
df = pd.read_csv('data.csv').drop(columns='customer_id')

# Schema overview (dtypes / non-null counts) followed by summary statistics;
# describe() is the cell's final expression, so it is what gets rendered.
df.info()
df.describe()

In [None]:
# Data-quality check: null count per column and number of fully duplicated rows.
missing_per_column = df.isnull().sum()
print(missing_per_column)

duplicate_rows = df.duplicated().sum()
print("Duplication: " , duplicate_rows)

In [None]:
# Outlier detection (currently disabled).
# The loop that would draw one boxplot per numeric column of df is wrapped in
# the string literal below, so none of it executes.
# NOTE(review): that string is the only expression in this cell, so a notebook
# front end will presumably echo it as the cell output -- confirm whether the
# loop should be re-enabled or the cell removed.
""" for column in df.select_dtypes(include=('number')):
    plt.figure(figsize=(8,4))
    sns.boxplot(x=column, data=df)
    plt.show() """

In [None]:
# Credit-score outliers: detect with the 1.5 * IQR rule, flag them, then cap.
Q1 = df["credit_score"].quantile(0.25)
Q3 = df["credit_score"].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# True for rows outside either fence.
outliers = (df["credit_score"] > upper_bound) | (df["credit_score"] < lower_bound)
print(outliers.value_counts())

# Feature engineering: record which rows had an unusually low score *before*
# capping, otherwise that information is lost once the values are clipped.
df["low_credit_score"] = 0
df.loc[df["credit_score"] < lower_bound, "low_credit_score"] = 1

# Cap both tails. The mask above selects low AND high outliers, but the
# previous clip(lower_bound) only adjusted the low ones and silently left any
# high outlier untouched; clipping to both fences matches the age handling.
df.loc[outliers, "credit_score"] = df["credit_score"].clip(lower_bound, upper_bound)

# Visual check of the capped distribution.
sns.boxplot(x=df['credit_score'])
#plt.show()

In [None]:
# Age outliers: 1.5 * IQR fences, an elderly flag, then capping.
age_col = df["age"]
Q1, Q3 = age_col.quantile(0.25), age_col.quantile(0.75)
IQR = Q3 - Q1
lower_bound, upper_bound = Q1 - 1.5 * IQR, Q3 + 1.5 * IQR

# Rows falling below the lower fence or above the upper fence.
outliers = (age_col < lower_bound) | (age_col > upper_bound)
print(outliers.value_counts())

# Flag customers above the upper fence before their age is capped.
df["is_Elderly"] = (age_col > upper_bound).astype("int64")

# Replace out-of-range ages with the nearest fence value.
df.loc[outliers, "age"] = age_col.clip(lower=lower_bound, upper=upper_bound)



In [None]:
# Inspect the age column after capping.
# A notebook only renders a cell's final expression automatically, so the
# intermediate results are passed to display() explicitly; previously they were
# computed and silently discarded. display() is provided by the IPython kernel.
display(df['age'].describe())
sns.boxplot(x=df['age'])
#plt.show()
display(df.head(3))

# Rows flagged as elderly (final expression, rendered by the notebook).
df[df['is_Elderly'] == 1]

In [None]:


# VIP flag: balance and credit score both above their 70th percentile,
# an active member, and holding at least one credit card.
high_balance = df["balance"] > df["balance"].quantile(0.70)
high_credit = df["credit_score"] > df["credit_score"].quantile(0.70)
is_active = df["active_member"] == 1
has_card = df["credit_card"] >= 1

df["vip_customer"] = (high_balance & high_credit & is_active & has_card).astype("int64")
df.info()

In [None]:
# One-hot encode `country` into integer indicator columns named country_<value>.
df = pd.get_dummies(
    data=df,
    prefix='country',
    columns=['country'],
    dtype=int,
)
df.info()


In [None]:
# Preview the first five rows (head() defaults to 5) after the country encoding.
df.head()

In [None]:
# Encode gender as integer class labels, fitting and transforming as two steps.
from sklearn.preprocessing import LabelEncoder

laben_encoder = LabelEncoder().fit(df["gender"])
df["gender"] = laben_encoder.transform(df["gender"])
df.info()

In [None]:
# Preview the first three rows after the gender encoding.
df.head(n=3)

In [None]:
# Feature engineering: activity x balance interaction flags (1 = condition holds).
is_active = df["active_member"] == 1
is_inactive = df["active_member"] == 0
positive_balance = df["balance"] > 0
zero_balance = df["balance"] == 0

# Active member holding a positive balance.
df["IsActiveWithbalance"] = (is_active & positive_balance).astype("int64")

# Active member with an exactly-zero balance.
df["IsActiveWithoutbalance"] = (is_active & zero_balance).astype("int64")

# Inactive member with an exactly-zero balance.
df["InactiveWithoutbalance"] = (is_inactive & zero_balance).astype("int64")

In [None]:
# Correlation matrix heatmap over the numeric columns.
plt.figure(figsize=(12,10))

# Restrict to numeric columns BEFORE correlating. The previous order
# (df.corr().select_dtypes(...)) filtered the already-numeric correlation
# matrix, which is a no-op and does not protect corr() from non-numeric input.
correlation_matrix = df.select_dtypes(include=['number']).corr()

sns.heatmap(correlation_matrix, annot=True, fmt=".2f", cmap='viridis')
plt.title("Correlation Matrix")
#plt.show()

In [None]:
# Feature selection, step 1: chi-squared statistics against the churn target.
from sklearn.feature_selection import chi2
from sklearn.preprocessing import MinMaxScaler

# Target and predictor matrix (every column except churn).
y = df["churn"]
x = df.drop(columns="churn")

# Min-max scale the predictors before scoring them with chi2.
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(x)

# chi2 returns a pair: (chi-squared statistics, p-values), one entry per column.
chi_scores = chi2(X_scaled, y)
print(chi_scores)


In [None]:
# Chi-squared statistic per feature, largest first (higher = more important).
chi_values = pd.Series(chi_scores[0], index=x.columns).sort_values(ascending=False)
chi_values.plot.bar()
#plt.show()

In [None]:
# Chi-squared p-value per feature, largest first (higher = less important).
p_values = pd.Series(chi_scores[1], index=x.columns).sort_values(ascending=False)
p_values.plot.bar()
#plt.show()

In [None]:
# Feature selection, step 2: mutual information between each feature and churn.
from sklearn.feature_selection import mutual_info_classif

# The estimator adds small random noise to continuous features, so without a
# fixed random_state the scores (and the resulting ranking) change on every
# run. The seed matches the one used by the models elsewhere in this notebook.
mi_scores = mutual_info_classif(x, y, discrete_features='auto', random_state=42)

# One row per feature, ordered from most to least informative.
mi_df = pd.DataFrame({'df': x.columns, 'MI Score': mi_scores})
mi_df = mi_df.sort_values(by='MI Score', ascending=False)

plt.figure(figsize=(8,6))
plt.bar(mi_df['df'], mi_df['MI Score'])
plt.xticks(rotation=90)
plt.title("Mutual Information Feature Importance")
#plt.show()

In [None]:
# Feature selection, step 3: impurity-based importances from a random forest.
from sklearn.ensemble import RandomForestClassifier

# Target and predictor matrix (every column except churn).
y = df["churn"]
X = df.drop(columns="churn")

# Seeded forest so the importances are repeatable.
model = RandomForestClassifier(random_state=42).fit(X, y)
importances = model.feature_importances_

# One row per feature, ordered from most to least important.
rf_df = pd.DataFrame({'df': X.columns, 'Importance': importances})
rf_df = rf_df.sort_values(by='Importance', ascending=False)

# Bar chart of the ranking.
plt.figure(figsize=(8,6))
plt.bar(rf_df['df'], rf_df['Importance'])
plt.xticks(rotation=90)
plt.title("Random Forest df Importance")
#plt.show()

In [None]:
# Train and evaluate a depth-limited decision tree on the selected features.
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_auc_score, classification_report

# Columns chosen for modelling.
features = ['age', 'credit_score', 'estimated_salary', 'balance', 'products_number', 'active_member', 'IsActiveWithbalance', 'IsActiveWithoutbalance', 'low_credit_score', 'is_Elderly', 'vip_customer', 'country_France', 'country_Germany', 'country_Spain', 'gender','InactiveWithoutbalance']

# Actually use the curated list: previously `features` was defined and then
# ignored, so the model was trained on every column except churn.
# Selecting with df[features] also raises a KeyError if a listed column is
# missing, instead of silently training on a different feature set.
X = df[features]
y = df['churn']

# Stratified 80/20 split keeps the churn ratio equal in both partitions.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

tree = DecisionTreeClassifier(
    max_depth=6,
    random_state=42
)

tree.fit(X_train, y_train)

# Predict the test set once and reuse the result for every test metric.
y_pred = tree.predict(X_test)
train_acc = accuracy_score(y_train, tree.predict(X_train))
test_acc  = accuracy_score(y_test, y_pred)
test_auc  = roc_auc_score(y_test, tree.predict_proba(X_test)[:,1])
# Kept for backward compatibility; identical to test_acc.
accuracy = test_acc

print(f"Train Accuracy : {train_acc:.4f}")
print(f"Test Accuracy  : {test_acc:.4f}")
print(f"Test AUC       : {test_auc:.4f}")
print(f"Accuracy       : {accuracy:.4f}")

In [None]:
# Draw the upper part of the fitted decision tree.
from sklearn.tree import plot_tree


plt.figure(figsize=(22, 14))
plot_tree(
    tree,
    fontsize=12,
    # Pass a plain list: some scikit-learn releases validate feature_names
    # strictly and reject a pandas Index.
    feature_names=list(X.columns),
    max_depth=5   # limit the drawing to depth 5 so the figure stays readable
)
plt.title("Your Decision Tree – First 5 Levels (trained to depth 6)", fontsize=16)
plt.show()

In [None]:
# Confusion matrix of the decision tree on the held-out test set.
from sklearn import metrics
import numpy as np

# Use the real labels and the model's predictions. The previous version drew
# both vectors from independent random binomial samples, so the plotted matrix
# had nothing to do with the trained model.
actual = y_test
predicted = y_pred

confusion_mtx = metrics.confusion_matrix(actual, predicted)
cm_display = metrics.ConfusionMatrixDisplay(confusion_matrix = confusion_mtx, display_labels = [False, True])
cm_display.plot()

