In [18]:
import pandas as pd
import numpy as np

# 读取数据
detail = pd.read_csv('data/detail.csv',encoding='gbk')

# 哑变量处理
data = pd.get_dummies(detail['dishes_name'])
data.to_csv('data/dishes_name_getdummies.csv',sep=',',index=True)

print('进行哑变量处理前的数据样式：\n',detail['dishes_name'][0:5])
print('进行哑变量处理后的数据样式：\n',data.iloc[:5,:4])

进行哑变量处理前的数据样式：
 0     蒜蓉生蚝
1    蒙古烤羊腿
2     大蒜苋菜
3    芝麻烤紫菜
4      蒜香包
Name: dishes_name, dtype: object
进行哑变量处理后的数据样式：
    38度剑南春  42度海之蓝  50度古井贡酒  52度泸州老窖
0       0       0        0        0
1       0       0        0        0
2       0       0        0        0
3       0       0        0        0
4       0       0        0        0


In [3]:
price = pd.cut(detail['amounts'],5)
print('离散化后5条记录售价分布为：\n',price.value_counts())

离散化后5条记录售价分布为：
 (0.823, 36.4]     5461
(36.4, 71.8]      3157
(71.8, 107.2]      839
(142.6, 178.0]     426
(107.2, 142.6]     154
Name: amounts, dtype: int64


In [3]:
# 自定义等频法离散化数据
def sameratecut(data,k):
    w = data.quantile(np.arange(0,1+1.0/k,1.0/k))
    data = pd.cut(data,w)
    return data
# 对菜品售价进行等频法离散化数据
result = sameratecut(detail['amounts'],5).value_counts()
print('菜品数据等频法离散化数据后各个类别数目分布情况：\n',result)    

菜品数据等频法离散化数据后各个类别数目分布情况：
 (18.0, 32.0]     2107
(39.0, 58.0]     2080
(32.0, 39.0]     1910
(1.0, 18.0]      1891
(58.0, 178.0]    1863
Name: amounts, dtype: int64


In [11]:
m = np.arange(0,1+1.0/5,1.0/5)
print(m)

[0.  0.2 0.4 0.6 0.8 1. ]


In [12]:
print(detail['amounts'].max())
print(detail['amounts'].min())

178
1


In [6]:
w = detail['amounts'].quantile(np.arange(0,1+1.0/5,1.0/5))
print(w)

0.0      1.0
0.2     18.0
0.4     32.0
0.6     39.0
0.8     58.0
1.0    178.0
Name: amounts, dtype: float64


In [10]:
df = pd.DataFrame({'A': [1, 2, 3],
                   'B': [pd.Timestamp('2010'),
                         pd.Timestamp('2011'),
                         pd.Timestamp('2012')],
                   'C': [pd.Timedelta('1 days'),
                         pd.Timedelta('2 days'),
                         pd.Timedelta('3 days')]})
m = df.quantile((0.25,0.75), numeric_only=False)
print(m)

        A                   B               C
0.25  1.5 2010-07-02 12:00:00 1 days 12:00:00
0.75  2.5 2011-07-02 12:00:00 2 days 12:00:00


In [72]:
# 自定义数据K-Means聚类离散化函数
def kmeancut(data,k):
    from sklearn.cluster import KMeans
    # 建立模型
    kmodel = KMeans(n_clusters=k)
    # 训练模型
    kmodel.fit(data.values.reshape((len(data),1)))
    # 输出聚类中心并排序
    c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)
    w = c.rolling(2).mean().iloc[1:]
    w = [0]+list(w[0])+[data.max()]
    data = pd.cut(data,w)
    return data

# 菜品售价等频法离散化
result=kmeancut(detail['amounts'],5).value_counts()
print('菜品售价聚类离散化后各个类别数目分布状况为：\n',result)

菜品售价聚类离散化后各个类别数目分布状况为：
 (22.31, 43.51]       3690
(43.51, 73.945]      2474
(0.0, 22.31]         2454
(73.945, 131.858]     993
(131.858, 178.0]      426
Name: amounts, dtype: int64


In [43]:
from sklearn.cluster import KMeans
    
kmodel = KMeans(n_clusters=5)  # 设置要形成的团簇数目以及要生成的质心数。
print(kmodel)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)


In [48]:
kmodel1= kmodel.fit(detail['amounts'].values.reshape((len(detail['amounts']),1)))
print(kmodel1)

KMeans(algorithm='auto', copy_x=True, init='k-means++', max_iter=300,
       n_clusters=5, n_init=10, n_jobs=None, precompute_distances='auto',
       random_state=None, tol=0.0001, verbose=0)


In [50]:
# 输出聚类中心并排序
c = pd.DataFrame(kmodel.cluster_centers_).sort_values(0)
print(c)

            0
4   12.179707
1   32.439295
3   54.580437
0   93.310171
2  170.406103


In [70]:
# 相邻两项求中点，作为边界点
w = c.rolling(2).mean().iloc[1:]
print(w)
# 把末边界点加上
w = [0]+list(w[0])+[m]
print(w)

            0
1   22.309501
3   43.509866
0   73.945304
2  131.858137
[0, 22.30950099721165, 43.50986596648579, 73.9453038692032, 131.85813724238798, 178]


In [71]:
data = pd.cut(detail['amounts'],w)
print(data)

0        (43.51, 73.945]
1        (43.51, 73.945]
2         (22.31, 43.51]
3         (22.31, 43.51]
4           (0.0, 22.31]
              ...       
10032     (22.31, 43.51]
10033     (22.31, 43.51]
10034     (22.31, 43.51]
10035       (0.0, 22.31]
10036     (22.31, 43.51]
Name: amounts, Length: 10037, dtype: category
Categories (5, interval[float64]): [(0.0, 22.31] < (22.31, 43.51] < (43.51, 73.945] < (73.945, 131.858] < (131.858, 178.0]]
