In [35]:
import pandas as pd

# Load the customer churn dataset into a DataFrame.
df = pd.read_csv(filepath_or_buffer='customer_churn.csv')

In [36]:
# Question 1
from sklearn.model_selection import StratifiedShuffleSplit

# Draw a 60% sample, stratified on the Churn column (fixed seed for reproducibility).
split = StratifiedShuffleSplit(n_splits=1, train_size=0.6, random_state=15)

# split() yields *positional* row indices, so rows are selected with .iloc
# (label-based .loc only matches when df has a default RangeIndex).
# .copy() makes both subsets independent frames, so later preprocessing of
# new_set neither touches df nor raises SettingWithCopyWarning.
new_index, other_index = next(split.split(df, df['Churn']))
new_set = df.iloc[new_index].copy()
other_set = df.iloc[other_index].copy()


# Check the split result:
# - new_set should hold 60% of the rows;
# - the Churn class proportions in new_set should be almost the same as in df.
new_set_size = len(new_set)
other_set_size = len(other_set)

new_set_ratio = new_set_size / (new_set_size + other_set_size)

print("New set size: ", new_set_size, ", ratio: {:.2%}".format(new_set_ratio))

print("")

origin_churn_percent = df['Churn'].value_counts(normalize=True)
print("Origin set churn percentage:\n", origin_churn_percent)

new_churn_percent = new_set['Churn'].value_counts(normalize=True)
print("New set churn percentage:\n", new_churn_percent)

New set size:  3384 , ratio: 60.00%

Origin set churn percentage:
 0    0.831383
1    0.168617
Name: Churn, dtype: float64
New set churn percentage:
 0    0.831265
1    0.168735
Name: Churn, dtype: float64


In [37]:
# Question 2
# Count how many rows of the sampled set fall into each Churn class.
churn_counts = new_set['Churn'].value_counts()
print(churn_counts)

0    2813
1     571
Name: Churn, dtype: int64


In [38]:
# Question 3 - data preprocessing (1)

# Drop duplicated customers, keeping the first occurrence of each CustomerID.
# The result is reassigned instead of using inplace=True: new_set was derived
# from df, and mutating it in place raises SettingWithCopyWarning.
new_set = new_set.drop_duplicates(subset=['CustomerID'], keep='first')
print("剩餘資料筆數：", len(new_set))

# Report the number of missing values in every column.
print("\n每個欄位的空值個數：")
print(new_set.isnull().sum())

剩餘資料筆數： 3380

每個欄位的空值個數：
CustomerID                       0
Churn                            0
Tenure                         154
PreferredLoginDevice             0
CityTier                         0
WarehouseToHome                141
PreferredPaymentMode             0
Gender                           0
HourSpendOnApp                 148
NumberOfDeviceRegistered         0
PreferedOrderCat                 0
SatisfactionScore                0
MaritalStatus                    0
NumberOfAddress                  0
Complain                         0
OrderAmountHikeFromlastYear    157
CouponUsed                     140
OrderCount                     160
DaySinceLastOrder              189
CashbackAmount                   0
dtype: int64


In [39]:
# Question 3 - data preprocessing (2)
from sklearn.preprocessing import LabelEncoder

# Fill missing values in the numeric columns with each column's own mean.
# A single frame-level fillna (a Series of means keyed by column name) replaces
# the per-column `new_set[col].fillna(..., inplace=True)` calls: that pattern is
# chained assignment through an inplace method and does not update new_set once
# pandas Copy-on-Write is active, which would leave NaNs in the data.
numeric_cols_with_nulls = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
]
new_set = new_set.fillna(new_set[numeric_cols_with_nulls].mean())

# Encoding
# One-hot encode the multi-valued categorical columns. Dummy columns are appended
# in the listed order, which matches encoding them one call at a time.
le = LabelEncoder()
new_set = pd.get_dummies(
    new_set,
    columns=[
        'PreferredLoginDevice',
        'PreferredPaymentMode',
        'PreferedOrderCat',
        'MaritalStatus',
    ],
)
# Gender is label-encoded in place into integer codes.
new_set['Gender'] = le.fit_transform(new_set['Gender'])

# Treat the prediction target as a nominal (categorical) column.
new_set['Churn'] = new_set['Churn'].astype('category')

# Drop the identifier column; it carries no predictive information.
new_set = new_set.drop(columns='CustomerID')

In [40]:
# Question 4
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
import numpy as np

# Separate the Churn target from the predictor columns.
y = new_set["Churn"]
X = new_set.drop(columns="Churn")

# Both models share the same protocol: 10-fold cross-validation scored on accuracy.
cv_settings = {'cv': 10, 'scoring': 'accuracy'}

# L1-regularised Logistic Regression.
lr = LogisticRegression(solver='liblinear', penalty='l1', C=1, max_iter=1000)
lr_scores = cross_val_score(lr, X, y, **cv_settings)

# RBF-kernel SVM.
svm = SVC(C=5, kernel='rbf', gamma='auto')
svm_scores = cross_val_score(svm, X, y, **cv_settings)

In [41]:
# Question 5

# Print each model's accuracy averaged over its 10 cross-validation folds.
for label, fold_scores in (
    ("Logistic Regression mean Accuracy:", lr_scores),
    ("SVM mean Accuracy:", svm_scores),
):
    print(label, fold_scores.mean())

Logistic Regression mean Accuracy: 0.8908284023668639
SVM mean Accuracy: 0.9201183431952664


In [42]:
# Question 6
from sklearn.model_selection import StratifiedShuffleSplit
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score

N_REPEATS = 30
NUMERIC_COLS_WITH_NULLS = [
    'Tenure',
    'WarehouseToHome',
    'HourSpendOnApp',
    'OrderAmountHikeFromlastYear',
    'CouponUsed',
    'OrderCount',
    'DaySinceLastOrder',
]
ONE_HOT_COLS = [
    'PreferredLoginDevice',
    'PreferredPaymentMode',
    'PreferedOrderCat',
    'MaritalStatus',
]


def _sample_and_preprocess(data, seed):
    """Draw a stratified 60% sample of `data` and apply the Question 3 preprocessing.

    Parameters
    ----------
    data : pandas.DataFrame
        The raw dataset; it is not modified.
    seed : int
        random_state for the stratified split, so a given repeat always
        draws the same sample.

    Returns
    -------
    pandas.DataFrame
        The deduplicated, imputed and encoded sample, without CustomerID.
    """
    # Repeat Question 1: take 60% of the rows, stratified on Churn.
    split = StratifiedShuffleSplit(n_splits=1, train_size=0.6, random_state=seed)
    sample_index, _ = next(split.split(data, data['Churn']))
    # split() yields positional indices, hence .iloc rather than .loc.
    sample = data.iloc[sample_index]

    # Repeat Question 3: drop duplicated customers (keep the first row of each).
    sample = sample.drop_duplicates(subset=['CustomerID'], keep='first')
    # Fill missing numeric values with the column mean. Frame-level fillna avoids
    # chained inplace assignment, which does not update the frame under
    # pandas Copy-on-Write.
    sample = sample.fillna(sample[NUMERIC_COLS_WITH_NULLS].mean())
    # Encoding: one-hot for multi-valued categoricals, label encoding for Gender.
    sample = pd.get_dummies(sample, columns=ONE_HOT_COLS)
    sample['Gender'] = LabelEncoder().fit_transform(sample['Gender'])
    # Treat the prediction target as a nominal (categorical) column.
    sample['Churn'] = sample['Churn'].astype('category')
    # Drop the identifier column.
    return sample.drop(columns='CustomerID')


def _mean_cv_accuracy(model, features, target):
    """Return the mean accuracy of `model` over 10-fold cross-validation."""
    return cross_val_score(model, features, target, cv=10, scoring='accuracy').mean()


all_lr_scores = []
all_svm_scores = []

for i in range(N_REPEATS):
    # Seeding each repeat with its own index still gives a different 60% sample
    # every time, but makes the experiment (and the t-test built on it) reproducible.
    sample = _sample_and_preprocess(df, seed=i)

    # Repeat Question 4: build and evaluate both models on the same sample.
    features = sample.drop(columns="Churn")
    target = sample["Churn"]

    # Logistic Regression model
    lr_model = LogisticRegression(penalty='l1', C=1, solver='liblinear', max_iter=1000)
    all_lr_scores.append(_mean_cv_accuracy(lr_model, features, target))  # record this repeat's mean accuracy

    # SVM model
    svm_model = SVC(kernel='rbf', C=5, gamma='auto')
    all_svm_scores.append(_mean_cv_accuracy(svm_model, features, target))  # record this repeat's mean accuracy

lr_average = sum(all_lr_scores) / len(all_lr_scores)
svm_average = sum(all_svm_scores) / len(all_svm_scores)
print("Logistic Regression 30次的平均Accuracy:", lr_average)
print("SVM 30次的平均Accuracy:", svm_average)

Logistic Regression 30次的平均Accuracy: 0.8897074856061771
SVM 30次的平均Accuracy: 0.9227659410066447


In [43]:
# Question 7
import numpy as np
from scipy import stats

lr_accuracy = np.asarray(all_lr_scores)
svm_accuracy = np.asarray(all_svm_scores)

# Mean accuracy of each model across the repeats.
lr_mean = lr_accuracy.mean()
svm_mean = svm_accuracy.mean()

# Sample standard deviation (ddof=1) of each model's accuracy.
lr_std = lr_accuracy.std(ddof=1)
svm_std = svm_accuracy.std(ddof=1)

# Paired t-test: both models were scored on the same sample in every repeat.
paired_test = stats.ttest_rel(all_lr_scores, all_svm_scores)
t_value, p_value = paired_test.statistic, paired_test.pvalue

# Report the results.
for label, value in (
    ("LR平均數：", lr_mean),
    ("SVM平均數：", svm_mean),
    ("LR標準差：", lr_std),
    ("SVM標準差：", svm_std),
    ("t-value：", t_value),
    ("p-value：", p_value),
):
    print(label, value)

LR平均數： 0.8897074856061771
SVM平均數： 0.9227659410066446
LR標準差： 0.00370198693263988
SVM標準差： 0.003440356920035281
t-value： -39.430845644247476
p-value： 9.547713390544607e-27


## 結論

若將顯著水準設為 0.05，上方成對樣本 t 檢定（paired t-test）得到的 p-value 遠小於此顯著水準，因此可以拒絕「兩模型的平均 Accuracy 相同」的虛無假設，即兩模型的平均分數在統計上存在顯著差異。

又因為 t-value 為負值（檢定計算的是 Logistic Regression 減 SVM 的差），且 SVM 的平均 Accuracy 高於 Logistic Regression，所以結論是：在此次實驗中，SVM 模型的表現優於 Logistic Regression 模型。