In [76]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

The company wants to know: What kinds of passengers are most likely to cancel? What patterns can be detected? And how can this insight be turned into action—either to reduce cancellations, or to plan around them more effectively?

Logistic 回归, 随机森林

## Data import and cleaning

In [104]:
df = pd.read_csv('data/cruise_cancellations.csv')

In [87]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2000 entries, 0 to 1999
Data columns (total 20 columns):
 #   Column                     Non-Null Count  Dtype  
---  ------                     --------------  -----  
 0   age                        2000 non-null   int64  
 1   booking_lead_time          2000 non-null   int64  
 2   trip_length                2000 non-null   int64  
 3   cabin_type                 2000 non-null   object 
 4   group_size                 2000 non-null   int64  
 5   loyalty_status             1016 non-null   object 
 6   paid_in_full               2000 non-null   int64  
 7   prior_cruises              2000 non-null   int64  
 8   customer_income            2000 non-null   float64
 9   email_engagement_score     2000 non-null   float64
 10  phone_verified             2000 non-null   int64  
 11  has_insurance              2000 non-null   int64  
 12  on_mailing_list            2000 non-null   int64  
 13  website_visits_last_month  2000 non-null   int64

In [88]:
df.head()

Unnamed: 0,age,booking_lead_time,trip_length,cabin_type,group_size,loyalty_status,paid_in_full,prior_cruises,customer_income,email_engagement_score,phone_verified,has_insurance,on_mailing_list,website_visits_last_month,survey_participation,preferred_contact_method,travel_history_score,referral_source,gift_certificate_used,canceled
0,56,131,7,Oceanview,5,,1,1,134181.36,0.41,1,0,1,4,No,Phone,62.7,Friend,0,0
1,69,293,5,Oceanview,5,,1,2,104770.24,0.34,1,0,1,1,No,Email,56.59,Friend,0,0
2,46,171,7,Balcony,3,,0,1,109135.01,0.38,1,0,0,3,Yes,Phone,57.58,Ad,0,1
3,32,183,3,Balcony,2,Silver,0,0,80108.53,0.58,0,0,0,2,No,Email,14.75,Search Engine,0,1
4,60,364,10,Oceanview,3,Silver,0,2,73401.55,0.5,1,0,1,3,Partial,Phone,47.49,Ad,0,0


In [89]:
df.isnull().sum()

age                            0
booking_lead_time              0
trip_length                    0
cabin_type                     0
group_size                     0
loyalty_status               984
paid_in_full                   0
prior_cruises                  0
customer_income                0
email_engagement_score         0
phone_verified                 0
has_insurance                  0
on_mailing_list                0
website_visits_last_month      0
survey_participation           0
preferred_contact_method       0
travel_history_score           0
referral_source                0
gift_certificate_used          0
canceled                       0
dtype: int64

In [90]:
df['loyalty_status'].unique()

array([nan, 'Silver', 'Gold', 'Platinum'], dtype=object)

In [106]:
df['loyalty_status'] = df['loyalty_status'].fillna('None')

## logistic regression

In [92]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [111]:
# 特征列和目标列
features = [
    'age', 'booking_lead_time', 'trip_length', 'cabin_type', 'group_size', 
    'loyalty_status', 'paid_in_full', 'prior_cruises', 'customer_income', 
    'email_engagement_score', 'phone_verified', 'has_insurance', 
    'on_mailing_list', 'website_visits_last_month', 'survey_participation', 
    'preferred_contact_method', 'travel_history_score', 'referral_source', 
    'gift_certificate_used'
]
target = 'canceled'

# 选择特征和目标
X = df[features]
y = df[target]

# 编码分类变量和标准化数值变量
# 定义分类变量列和数值变量列
categorical_cols = ['cabin_type', 'loyalty_status', 'survey_participation', 'preferred_contact_method', 'referral_source']
numeric_cols = ['age', 'booking_lead_time', 'trip_length', 'group_size', 'paid_in_full', 'prior_cruises', 
                'customer_income', 'email_engagement_score', 'phone_verified', 'has_insurance', 
                'on_mailing_list', 'website_visits_last_month', 'travel_history_score', 'gift_certificate_used']

# 使用 Pipeline 处理数值列和分类列
numeric_transformer = 'passthrough'  # 无需标准化 对于数值型特征，不进行处理
categorical_transformer = OneHotEncoder(handle_unknown='ignore')  # 使用独热编码处理分类变量 使用 One-Hot 编码categorical variables

# 创建预处理步骤
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_cols),
        ('cat', categorical_transformer, categorical_cols)
    ])

# 4. 创建逻辑回归模型
lr_model = LogisticRegression(max_iter=1000) # 最大迭代次数为 1000

# 5. 创建管道，将预处理和模型训练步骤封装成一个流程
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', lr_model)
])

# 6. 拆分数据集：训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=654)

# 7. 看是否强线性相关
# 设定相关性阈值
threshold = 0.8

# 计算 OneHot 编码后的特征的相关性
X_train_transformed = preprocessor.fit_transform(X_train)
corr_matrix = pd.DataFrame(X_train_transformed).corr()

# 创建一个空的列表用于存储相关性较高的特征对
high_corr_pairs = []

# 遍历相关性矩阵中的每一对特征
for i in range(len(corr_matrix.columns)):
    for j in range(i):
        if abs(corr_matrix.iloc[i, j]) > threshold:
            # 如果相关性高于阈值，添加到列表中
            high_corr_pairs.append((corr_matrix.columns[i], corr_matrix.columns[j], corr_matrix.iloc[i, j]))

# 打印相关性高的特征对
if high_corr_pairs:
    for pair in high_corr_pairs:
        print(f"Feature pair: {pair[0]} and {pair[1]} - Correlation: {pair[2]}")
else:
    print("No significant multicollinearity.\n")


# 8. 训练模型
pipeline.fit(X_train, y_train)

# 9. 预测结果并评估模型
y_pred = pipeline.predict(X_test)

# 输出准确率和分类报告
print("Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))


No significant multicollinearity.

Accuracy: 0.72

Classification Report:
               precision    recall  f1-score   support

           0       0.74      0.93      0.82       284
           1       0.55      0.21      0.30       116

    accuracy                           0.72       400
   macro avg       0.64      0.57      0.56       400
weighted avg       0.68      0.72      0.67       400



STOP: TOTAL NO. OF ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [112]:
import statsmodels.api as sm

# 通过 ColumnTransformer 预处理数据
X_train_transformed = preprocessor.fit_transform(X_train)

# 添加常数列（截距项）
X_train_transformed = sm.add_constant(X_train_transformed)

# 拟合逻辑回归模型
log_reg = sm.Logit(y_train, X_train_transformed)
result = log_reg.fit()

# 输出模型的详细摘要
print(result.summary())


         Current function value: 0.507341
         Iterations: 35
                           Logit Regression Results                           
Dep. Variable:               canceled   No. Observations:                 1600
Model:                          Logit   Df Residuals:                     1572
Method:                           MLE   Df Model:                           27
Date:                Tue, 22 Apr 2025   Pseudo R-squ.:                  0.1347
Time:                        14:02:20   Log-Likelihood:                -811.75
converged:                      False   LL-Null:                       -938.15
Covariance Type:            nonrobust   LLR p-value:                 1.542e-38
                 coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------
const          0.1286   7.21e+06   1.78e-08      1.000   -1.41e+07    1.41e+07
x1            -0.0011      0.003     -0.324      0.746      -0.00

