# 1、必要库的载入

In [65]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# 2、加载并清洗数据

In [66]:
# 2.1 加载数据
df = pd.read_csv('/home/mw/input/survey6263/mcdonalds.csv')

# 2.2 数据清洗
# 2.2.1 检查缺失值
print('缺失值情况：')
print(df.isnull().sum())
# 2.2.2 处理异常值（年龄范围在18 - 100岁为合理范围）
df = df[(df['Age'] >= 18) & (df['Age'] <= 100)]

# 查看数据集行数和列数
rows, columns = df.shape

if rows < 100 and columns < 20:
    # 短表数据（行数少于100且列数少于20）查看全量数据信息
    print('数据全部内容信息：')
    print(df.to_csv(sep='\t', na_rep='nan'))
else:
    # 长表数据查看数据前几行信息
    print('数据前几行内容信息：')
    print(df.head().to_csv(sep='\t', na_rep='nan'))

缺失值情况：
Index             0
yummy             0
convenient        0
spicy             0
fattening         0
greasy            0
fast              0
cheap             0
tasty             0
expensive         0
healthy           0
disgusting        0
Like              0
Age               0
VisitFrequency    0
Gender            0
dtype: int64
数据前几行内容信息：
	Index	yummy	convenient	spicy	fattening	greasy	fast	cheap	tasty	expensive	healthy	disgusting	Like	Age	VisitFrequency	Gender
0	1	No	Yes	No	Yes	No	Yes	Yes	No	Yes	No	No	-3	61	Every three months	Female
1	2	Yes	Yes	No	Yes	Yes	Yes	Yes	Yes	Yes	No	No	+2	51	Every three months	Female
2	3	No	Yes	Yes	Yes	Yes	Yes	No	Yes	Yes	Yes	No	+1	62	Every three months	Female
3	4	Yes	Yes	No	Yes	Yes	Yes	Yes	Yes	No	No	Yes	+4	69	Once a week	Female
4	5	No	Yes	No	Yes	Yes	Yes	Yes	No	No	Yes	No	+2	49	Once a month	Male



# 3、可视化设置

In [67]:
# 3.1 设置图片清晰度
plt.rcParams['figure.dpi'] = 300
# 3.2 设置中文字体
plt.rcParams['font.sans-serif'] = ['WenQuanYi Zen Hei']
# 3.3 解决负号显示问题
plt.rcParams['axes.unicode_minus'] = False

# 4、顾客基础特征分析

## 4.1 顾客年龄分布和性别分布

In [68]:
import matplotlib.pyplot as plt
import seaborn as sns

# 年龄分布
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.histplot(df['Age'], bins=20, kde=False)
plt.title('Age distribution')

# 性别分布
plt.subplot(1, 2, 2)
gender_counts = df['Gender'].value_counts()
sns.barplot(x=gender_counts.index, y=gender_counts.values)
plt.title('Gender distribution')

plt.tight_layout()
plt.show()

print('顾客年龄分布：')
print(df['Age'].describe())
print('\n顾客性别分布：')
print(df['Gender'].value_counts())

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 

顾客年龄分布：
count    1453.000000
mean       44.604955
std        14.221178
min        18.000000
25%        33.000000
50%        45.000000
75%        57.000000
max        71.000000
Name: Age, dtype: float64

顾客性别分布：
Gender
Female    788
Male      665
Name: count, dtype: int64


### 4.1.1 年龄分布

从年龄分布的统计数据来看，样本数量为 1453 人。平均年龄约为 44.6 岁，标准差 14.22 表明年龄分布有一定的离散程度。最小值 18 岁和最大值 71 岁界定了年龄的范围，中位数 45 岁接近平均值，说明年龄分布大致较为对称。25% 分位数为 33 岁，75% 分位数为 57 岁，这意味着一半的顾客年龄在 33 - 57 岁之间。

### 4.1.2 性别分布

在性别分布方面，女性顾客有 788 人，男性顾客有 665 人，女性顾客数量多于男性顾客。

## 4.2 顾客光顾频率与年龄、性别的关系

In [69]:
plt.figure(figsize=(12, 6))
plt.subplot(1, 2, 1)
sns.boxplot(x='VisitFrequency', y='Age', data=df)
plt.title('Frequency VS Age')
plt.xticks(rotation=45)

plt.subplot(1, 2, 2)
sns.countplot(x='VisitFrequency', hue='Gender', data=df)
plt.title('Frequency VS Gender')
plt.xticks(rotation=45)

plt.tight_layout()
plt.show()

print('光顾频率与年龄的关系：')
print(df.groupby('VisitFrequency')['Age'].describe())
print('\n光顾频率与性别的关系：')
print(pd.crosstab(df['VisitFrequency'], df['Gender']))

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 

光顾频率与年龄的关系：
                       count       mean        std   min    25%   50%    75%  \
VisitFrequency                                                                 
Every three months     342.0  46.666667  13.692261  18.0  36.00  49.0  57.75   
More than once a week   54.0  37.944444  15.598036  18.0  24.25  33.0  49.25   
Never                  131.0  51.167939  13.732137  19.0  41.00  55.0  61.50   
Once a month           439.0  42.305239  13.671713  18.0  31.00  41.0  53.00   
Once a week            235.0  38.753191  13.136640  18.0  28.00  38.0  49.00   
Once a year            252.0  49.285714  13.342475  18.0  37.75  52.0  60.00   

                        max  
VisitFrequency               
Every three months     70.0  
More than once a week  70.0  
Never                  70.0  
Once a month           71.0  
Once a week            70.0  
Once a year            70.0  

光顾频率与性别的关系：
Gender                 Female  Male
VisitFrequency                     
Every three months    

### 4.2.1 光顾频率与年龄的关系

从数据中可以推测，不同光顾频率的顾客年龄分布存在差异。例如，“Never” 光顾的顾客平均年龄相对较高，为 51.17 岁左右，而 “More than once a week” 光顾的顾客平均年龄较低，为 37.94 岁左右。这可能意味着年龄较大的顾客光顾的频率较低，而年轻顾客更倾向于频繁光顾。标准差方面，各频率的标准差数值较为接近，说明年龄的离散程度在不同光顾频率下大致相似。

### 4.2.2 光顾频率与性别的关系

整体上看，在各个光顾频率下，女性和男性顾客数量没有特别大的差距。在 “Every three months” 和 “Once a month” 这两个频率上，女性顾客数量略多于男性顾客；而在 “Once a year” 这个频率上，女性顾客数量优势相对明显一些。不过在 “More than once a week” 和 “Once a week” 这些较高频率的光顾情况中，男女顾客数量基本持平。这可能暗示在不同光顾频率的偏好上，性别差异不是非常显著，但在部分频率上女性可能更积极一些。

## 4.3 顾客对麦当劳各方面评价的分布

In [70]:
# 提取评价列
evaluation_columns = ['yummy', 'convenient', 'spicy', 'fattening', 'greasy', 'fast', 'cheap', 'tasty', 'expensive', 'healthy', 'disgusting']

# 创建画布
plt.figure(figsize=(15, 10))

# 绘制每个评价的分布柱状图
for i, column in enumerate(evaluation_columns):
    plt.subplot(3, 4, i + 1)
    value_counts = df[column].value_counts()
    sns.barplot(x=value_counts.index, y=value_counts.values)
    plt.title(f'{column} distribution')

plt.tight_layout()
plt.show()

# 查看每个评价的分布情况
for column in evaluation_columns:
    print(f'{column}评价分布：')
    print(df[column].value_counts())

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 

yummy评价分布：
yummy
Yes    803
No     650
Name: count, dtype: int64
convenient评价分布：
convenient
Yes    1319
No      134
Name: count, dtype: int64
spicy评价分布：
spicy
No     1317
Yes     136
Name: count, dtype: int64
fattening评价分布：
fattening
Yes    1260
No      193
Name: count, dtype: int64
greasy评价分布：
greasy
Yes    765
No     688
Name: count, dtype: int64
fast评价分布：
fast
Yes    1308
No      145
Name: count, dtype: int64
cheap评价分布：
cheap
Yes    870
No     583
Name: count, dtype: int64
tasty评价分布：
tasty
Yes    936
No     517
Name: count, dtype: int64
expensive评价分布：
expensive
No     933
Yes    520
Name: count, dtype: int64
healthy评价分布：
healthy
No     1164
Yes     289
Name: count, dtype: int64
disgusting评价分布：
disgusting
No     1100
Yes     353
Name: count, dtype: int64


1. **便捷性（convenient）**：绝大多数顾客（1319 人）认为麦当劳是便捷的，仅有 134 人持否定态度。这表明麦当劳在店铺分布、点餐流程等便捷性方面表现出色，得到了顾客的广泛认可。  
2. **辛辣程度（spicy）**：高达 1317 人觉得麦当劳不辣，只有 136 人认为辣。这可能反映出麦当劳的菜品整体辛辣口味不突出，适合大多数不嗜辣的顾客。  
3. **易发胖（fattening）**：1260 人认为麦当劳食物容易使人发胖，仅有 193 人不这么觉得。这或许与麦当劳的食物多为高热量、高脂肪的快餐食品有关。  
4. **油腻程度（greasy）**：认为油腻（765 人）和不油腻（688 人）的人数差距相对较小。这说明麦当劳食物的油腻感在顾客中的评价存在一定分歧，可能与不同的菜品选择有关。  
5. **速度（fast）**：1308 人认可麦当劳的速度，145 人不认可。这显示麦当劳在快餐速度方面总体表现良好，但仍有小部分顾客觉得有待提高。  
6. **价格便宜（cheap）**：870 人觉得价格便宜，583 人认为不便宜。这表明麦当劳在价格方面有一定的性价比，但也有相当一部分顾客认为价格偏高。  
7. **美味（tasty）**：936 人觉得美味，517 人不觉得。整体上顾客对麦当劳食物的口味评价较好，但也有近三分之一的顾客不太满意。  
8. **价格昂贵（expensive）**：933 人不觉得昂贵，520 人觉得贵。这与 “价格便宜” 的评价相互印证，大部分顾客认为麦当劳价格尚可。  
9. **健康（healthy）**：多达 1164 人认为麦当劳食物不健康，只有 289 人认为健康。这与大众对快餐食品普遍不健康的认知相符。  
10. **令人厌恶（disgusting）**：1100 人不觉得麦当劳令人厌恶，353 人有厌恶感。说明绝大多数顾客对麦当劳的整体印象是可以接受的，但仍有一部分顾客体验不佳。

## 4.4 顾客喜好与各评价之间的相关性

In [71]:
import re

# 使用正则表达式提取 Like 列中的数字部分并转换为数值型
df['Like'] = df['Like'].apply(lambda x: int(re.findall(r'\d+', x)[0]))

# 将评价列进行编码
for column in evaluation_columns:
    df[column] = df[column].map({'Yes': 1, 'No': 0})

# 计算相关系数矩阵
correlation_matrix = df[evaluation_columns + ['Like']].corr()

# 绘制热力图
plt.figure(figsize=(10, 8))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', vmin=-1, vmax=1)
plt.title('Customer preferences VS Reviews')
plt.show()

print('顾客喜好与各评价之间的相关系数矩阵：')
print(correlation_matrix.round(2))

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 

顾客喜好与各评价之间的相关系数矩阵：
            yummy  convenient  spicy  fattening  greasy  fast  cheap  tasty  \
yummy        1.00        0.25   0.01      -0.09   -0.15  0.11   0.11   0.69   
convenient   0.25        1.00   0.03       0.04   -0.11  0.24   0.15   0.29   
spicy        0.01        0.03   1.00      -0.04    0.05  0.02   0.02   0.06   
fattening   -0.09        0.04  -0.04       1.00    0.32  0.05  -0.03  -0.09   
greasy      -0.15       -0.11   0.05       0.32    1.00 -0.06  -0.07  -0.16   
fast         0.11        0.24   0.02       0.05   -0.06  1.00   0.25   0.15   
cheap        0.11        0.15   0.02      -0.03   -0.07  0.25   1.00   0.14   
tasty        0.69        0.29   0.06      -0.09   -0.16  0.15   0.14   1.00   
expensive   -0.06       -0.16   0.05       0.09    0.15 -0.20  -0.72  -0.10   
healthy      0.25        0.10   0.11      -0.34   -0.21  0.03   0.13   0.23   
disgusting  -0.42       -0.34   0.03       0.15    0.32 -0.14  -0.13  -0.44   
Like         0.13       -0.21  -0

* 与顾客喜好（Like）相关性较强的因素：  
	* disgusting（令人厌恶）与Like的相关系数为 0.19，呈正相关。这表明如果顾客觉得食物令人厌恶，那么他们的喜好程度可能会降低，这个结果符合常理。  
	* healthy（健康）与Like的相关系数为 0.11，有一定的正相关，说明顾客在一定程度上会偏好他们认为健康的食物。  
* 其他因素之间的相关性：  
	* yummy（美味）和tasty（可口）的相关系数高达 0.69，这表明这两个评价维度高度相关，顾客对美味和可口的认知较为一致。  
	* cheap（便宜）和expensive（昂贵）的相关系数为 -0.72，呈现很强的负相关，这是符合逻辑的，因为价格便宜和昂贵是相反的概念。

# 5、顾客口味偏好分析

In [72]:
import matplotlib.pyplot as plt

# 统计口味相关特征的分布（选取 spicy, yummy, tasty, greasy 作为口味相关特征）
taste_features = ['spicy', 'yummy', 'tasty', 'greasy']
fig, axes = plt.subplots(2, 2, figsize=(12, 8))
axes = axes.ravel()

for i, feature in enumerate(taste_features):
    value_counts = df[feature].value_counts()
    axes[i].pie(value_counts, labels=value_counts.index, autopct='%1.1f%%', startangle=90)
    axes[i].set_title(f'{feature} distribution')

plt.tight_layout()
plt.show()

# 查看具体比例
for feature in taste_features:
    print(df[feature].value_counts(normalize=True))

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 

spicy
0    0.906401
1    0.093599
Name: proportion, dtype: float64
yummy
1    0.55265
0    0.44735
Name: proportion, dtype: float64
tasty
1    0.644184
0    0.355816
Name: proportion, dtype: float64
greasy
1    0.526497
0    0.473503
Name: proportion, dtype: float64


从这些比例数据中我们可以推测出，大部分顾客不太倾向于辛辣口味，选择不辛辣的比例高达 90.64%。对于美味和好吃这两个相对主观的口味感受，认为是的比例分别为 55.27% 和 64.42%，说明整体上顾客对食品的评价较为积极，但仍有一定比例的顾客持否定态度。而对于油腻这个特征，认为食品油腻和不油腻的顾客比例较为接近，几乎各占一半。

# 6、价格敏感性分析

In [73]:
# 统计认为便宜和昂贵的比例
cheap_counts = df['cheap'].value_counts(normalize=True)
expensive_counts = df['expensive'].value_counts(normalize=True)

# 绘制柱状图
fig, axes = plt.subplots(1, 2, figsize=(10, 5))

axes[0].bar(cheap_counts.index, cheap_counts)
axes[0].set_title('Thinking cheap')
axes[0].set_xlabel('cheap or not')
axes[0].set_ylabel('scale')

axes[1].bar(expensive_counts.index, expensive_counts)
axes[1].set_title('Thinking expensive')
axes[1].set_xlabel('expensive or not')
axes[1].set_ylabel('scale')

plt.tight_layout()
plt.show()

print("认为便宜的比例：")
print(cheap_counts)
print("认为昂贵的比例：")
print(expensive_counts)

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 

认为便宜的比例：
cheap
1    0.598761
0    0.401239
Name: proportion, dtype: float64
认为昂贵的比例：
expensive
0    0.64212
1    0.35788
Name: proportion, dtype: float64


从这些数据可以推测，约 59.88% 的人认为价格便宜，而约 64.21% 的人不认为价格昂贵。这表明大部分人对当前价格持有相对积极的态度，整体价格可能处于消费者可接受的范围内。不过仍有一定比例（40.12% 认为不便宜，35.79% 认为昂贵）的消费者对价格存在一定的敏感度，这部分消费者可能是价格敏感型客户，商家在进行价格调整或者促销活动时需要重点考虑这部分人群的需求。

# 7、消费频率预测

In [74]:
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

# 对 object 类型数据进行编码
label_encoders = {}
for column in df.columns:
    if df[column].dtype == 'object':
        le = LabelEncoder()
        df[column] = le.fit_transform(df[column])
        label_encoders[column] = le

# 准备特征和目标变量
X = df.drop(['VisitFrequency', 'Index'], axis=1)
y = df['VisitFrequency']

# 划分训练集和测试集
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report

# 定义不同的模型
models = {
    'Logistic Regression': LogisticRegression(max_iter=1000),
    'Decision Tree': DecisionTreeClassifier(),
    'Random Forest': RandomForestClassifier(),
    'Support Vector Machine': SVC()
}

# 训练和评估每个模型
for name, model in models.items():
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    accuracy = accuracy_score(y_test, y_pred)
    print(f'{name} 准确率: {accuracy:.4f}')
    print(f'{name} 分类报告:\n', classification_report(y_test, y_pred))
    print('-' * 50)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


Logistic Regression 准确率: 0.3849
Logistic Regression 分类报告:
               precision    recall  f1-score   support

           0       0.28      0.26      0.27        61
           1       0.00      0.00      0.00        13
           2       0.51      0.61      0.56        33
           3       0.40      0.68      0.50        80
           4       0.40      0.19      0.26        42
           5       0.36      0.23      0.28        62

    accuracy                           0.38       291
   macro avg       0.32      0.33      0.31       291
weighted avg       0.36      0.38      0.35       291

--------------------------------------------------
Decision Tree 准确率: 0.2955
Decision Tree 分类报告:
               precision    recall  f1-score   support

           0       0.24      0.28      0.26        61
           1       0.00      0.00      0.00        13
           2       0.35      0.39      0.37        33
           3       0.34      0.39      0.36        80
           4       0.29      

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


# 8、顾客画像分类

## 8.1 确定最佳簇数

In [75]:
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score

# 尝试不同的簇数
silhouette_scores = []
for k in range(2, 11):
    kmeans = KMeans(n_clusters=k, random_state=42)
    kmeans.fit(X)
    labels = kmeans.labels_
    score = silhouette_score(X, labels)
    silhouette_scores.append((k, score))

# 找到最高轮廓系数对应的簇数
best_k, _ = max(silhouette_scores, key=lambda x: x[1])
print(f'最佳簇数: {best_k}')

最佳簇数: 2


通过计算不同簇数（2 到 10）下的轮廓系数，找到了最佳簇数为 2。轮廓系数用于衡量聚类效果的好坏，较高的轮廓系数表示聚类结果较好，簇内的样本紧密且与其他簇的样本分离程度高。因此，我们可以认为将顾客分为 2 类是比较合适的，能够较好地区分不同特征的顾客群体。

## 8.2 不同簇的特征分析

In [76]:
# 使用最佳簇数进行 KMeans 聚类
kmeans = KMeans(n_clusters=best_k, random_state=42)
df['Cluster'] = kmeans.fit_predict(X)

# 分析不同簇的特征（以年龄和喜欢程度为例）
cluster_analysis = df.groupby('Cluster').agg({
    'Age': 'mean',
    'Like': 'mean'
}).reset_index()

print(cluster_analysis)

   Cluster        Age      Like
0        0  31.419735  2.877761
1        1  56.171835  2.652455


从年龄均值来看，簇 0 的顾客相对年轻，平均年龄约为 31 岁，而簇 1 的顾客平均年龄约为 56 岁，两者存在明显的年龄差异。在喜欢程度方面，两个簇的均值都比较高且较为接近，不过簇 1 的喜欢程度均值略高于簇 0，这可能暗示年龄较大的顾客对相关事物的喜欢程度稍高一些，但差异并不是非常显著。

## 8.3 不同簇的年龄和喜欢程度分布可视化

In [77]:
# 绘制不同簇的年龄和喜欢程度分布
plt.scatter(df['Age'], df['Like'], c=df['Cluster'])
plt.xlabel('Age')
plt.xticks(rotation=45)
plt.ylabel('Liking degree')
plt.title('Age VS liking of different clusters')
plt.show()

findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of the following families were found: WenQuanYi Zen Hei
findfont: Generic family 'sans-serif' not found because none of 