### Tên dataset được sử dụng : Churn Modelling 
### Mô tả : Dataset chứa thông tin của khách hàng; biến mục tiêu là giá trị nhị phân thể hiện khách hàng có còn là khách hàng của ngân hàng nữa hay không

In [None]:
import pandas as pd
import numpy as np

In [14]:
# Load the churn dataset from a CSV file located relative to the notebook.
filename = 'Churn_Modelling.csv'
df = pd.read_csv(filename)
# The original cell also called df.head(5) here, but its result was discarded
# (only the last expression of a cell is displayed), so the call was dropped.
print(df.shape)

(10000, 14)


In [15]:
# Remove the "CustomerId" and "RowNumber" columns from the data.
df = df.drop(columns=["CustomerId", "RowNumber"])

In [16]:
# Column labels of the frame, kept as a plain Python list.
name = df.columns.tolist()

In [17]:
# Per-column dtypes; columns reported as `object` are the ones encoded further down.
types = df.dtypes
print(types)

Surname             object
CreditScore          int64
Geography           object
Gender              object
Age                  int64
Tenure               int64
Balance            float64
NumOfProducts        int64
HasCrCard            int64
IsActiveMember       int64
EstimatedSalary    float64
Exited               int64
dtype: object


In [18]:
# Handle missing values (if any): rows containing a missing value are discarded.
df = df.dropna()

In [19]:
from sklearn.preprocessing import LabelEncoder
# Replace every text (object-dtype) column with integer codes.
# A single encoder instance is reused and refit for each column, so after the
# loop it only holds the classes of the last column it processed.
le = LabelEncoder()
# Select the text columns directly from the frame instead of indexing the
# `types` Series by integer position (`types[i]`), which pandas deprecates for
# label-indexed Series, and instead of relying on `types`/`name` staying in sync.
for column in df.select_dtypes(include=["object"]).columns:
    # fit_transform already returns the encoded values; a second transform()
    # pass over the same column is redundant.
    df[column] = le.fit_transform(df[column])
# NOTE(review): every text column, including Surname, is mapped to arbitrary
# integer codes and later used as a numeric feature — confirm this is intended.

In [20]:
# Convert the frame to a NumPy array: every column except the last is a
# feature, the last column is the target.
data = df.to_numpy()
x, y = data[:, :-1], data[:, -1]
# Show the first sample as a quick sanity check.
print(x[0])
print(y[0])

[1.1150000e+03 6.1900000e+02 0.0000000e+00 0.0000000e+00 4.2000000e+01
 2.0000000e+00 0.0000000e+00 1.0000000e+00 1.0000000e+00 1.0000000e+00
 1.0134888e+05]
1.0


In [21]:
from sklearn.preprocessing import MinMaxScaler
# Rescale every feature column with min-max scaling.
# NOTE(review): the scaler is fitted on all rows of `x`, i.e. before the
# train/test split performed in a later cell, so test rows influence the
# scaling parameters — consider fitting on the training part only.
scaler = MinMaxScaler()
X_scaler = scaler.fit_transform(x)
print(X_scaler[0])

[0.38041624 0.538      0.         0.         0.32432432 0.2
 0.         0.         1.         1.         0.50673489]


In [23]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the samples for testing; the fixed seed keeps the split reproducible.
x_train, x_test, y_train, y_test = train_test_split(
    X_scaler,
    y,
    test_size=0.2,
    random_state=42,
)
print(y_train)

[0. 0. 1. ... 1. 1. 0.]


In [24]:
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV


In [27]:
# Initialize the models
Logistic_reg = LogisticRegression()
knn = KNeighborsClassifier(n_neighbors=3)  # You can adjust the number of neighbors (k) as needed
# Seed the tree: scikit-learn permutes the candidate features at every split,
# so without random_state the fitted tree (and its metrics) can change between runs.
tree = DecisionTreeClassifier(random_state=42)
gaussian_nb = GaussianNB()

# Train the models on the training split
Logistic_reg.fit(x_train, y_train)
knn.fit(x_train, y_train)
tree.fit(x_train, y_train)
gaussian_nb.fit(x_train, y_train)

# Make predictions on the held-out test split
y_pred_log = Logistic_reg.predict(x_test)
y_pred_knn = knn.predict(x_test)
y_pred_tree = tree.predict(x_test)
y_pred_gaussian = gaussian_nb.predict(x_test)

# Print the (truncated) prediction arrays as a quick sanity check
print(y_pred_log)
print(y_pred_knn)
print(y_pred_tree)
print(y_pred_gaussian)

[0. 0. 0. ... 0. 0. 0.]
[0. 0. 0. ... 1. 0. 0.]
[0. 0. 0. ... 1. 0. 0.]
[0. 0. 0. ... 0. 0. 0.]


In [30]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

def Evaluate(y_test, y_pred):
    """Print accuracy, precision, recall and F1-score of a set of predictions.

    Parameters:
        y_test: ground-truth labels.
        y_pred: labels predicted by a model for the same samples.

    Returns:
        None. The four scores are written to standard output, one per line.
    """
    # All four scores are computed before anything is printed, so a failing
    # metric leaves no partial report behind.
    accuracy, precision, recall, f1 = (
        metric(y_test, y_pred)
        for metric in (accuracy_score, precision_score, recall_score, f1_score)
    )
    print(
        f"Accuracy: {accuracy}",
        f"Precision: {precision}",
        f"Recall: {recall}",
        f"F1-score: {f1}",
        sep="\n",
    )


In [31]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score

# Report the metrics of each baseline classifier: logistic regression,
# k-nearest neighbors, decision tree and Gaussian naive Bayes.
reports = (
    ("LogisticRegression:", y_pred_log),
    ("\nK-Nearest Neighbors:", y_pred_knn),
    ("\nDecisionTree", y_pred_tree),
    ("\nGaussian:", y_pred_gaussian),
)
for header, predictions in reports:
    print(header)
    Evaluate(y_test, predictions)


LogisticRegression:
Accuracy: 0.8145
Precision: 0.5982142857142857
Recall: 0.17048346055979643
F1-score: 0.2653465346534653

K-Nearest Neighbors:
Accuracy: 0.7995
Precision: 0.4844961240310077
Recall: 0.31806615776081426
F1-score: 0.38402457757296465

DecisionTree
Accuracy: 0.7785
Precision: 0.4429223744292237
Recall: 0.49363867684478374
F1-score: 0.46690734055354993

Gaussian:
Accuracy: 0.8295
Precision: 0.6884057971014492
Recall: 0.24173027989821882
F1-score: 0.3578154425612053


### Ensemble Learning

### GradientBoostingClassifier

In [39]:
from sklearn.ensemble import GradientBoostingClassifier
# Gradient boosting with 100 depth-1 trees (decision stumps) and a fixed seed.
clf_1 = GradientBoostingClassifier(
    n_estimators=100,
    learning_rate=1.0,
    max_depth=1,
    random_state=0,
)
clf_1.fit(x_train, y_train)

# Score the model on the held-out test split.
y_pred = clf_1.predict(x_test)
print("GradientBoostingClassifier:")
Evaluate(y_test, y_pred)

GradientBoostingClassifier:
Accuracy: 0.856
Precision: 0.6895306859205776
Recall: 0.4860050890585242
F1-score: 0.5701492537313433


### RandomForestClassifier

In [40]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 trees with no depth limit and a fixed seed.
clf_2 = RandomForestClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
clf_2.fit(x_train, y_train)

# Score the model on the held-out test split.
y_pred = clf_2.predict(x_test)
print("RandomForestClassifier:")
Evaluate(y_test, y_pred)

RandomForestClassifier:
Accuracy: 0.8555
Precision: 0.7363636363636363
Recall: 0.4122137404580153
F1-score: 0.5285481239804242


### ExtraTreesClassifier

In [42]:
from sklearn.ensemble import ExtraTreesClassifier
# Extra-trees ensemble of 10 trees with no depth limit and a fixed seed.
clf_3 = ExtraTreesClassifier(
    n_estimators=10,
    max_depth=None,
    min_samples_split=2,
    random_state=0,
)
clf_3.fit(x_train, y_train)

# Score the model on the held-out test split.
y_pred = clf_3.predict(x_test)
print("ExtraTreesClassifier:")
Evaluate(y_test, y_pred)

ExtraTreesClassifier:
Accuracy: 0.851
Precision: 0.7251184834123223
Recall: 0.3893129770992366
F1-score: 0.5066225165562914
