In [25]:
####################################################################
# 3.10 离散化，对连续运营数据做逻辑分层
import pandas as pd
from sklearn.cluster import KMeans
from sklearn import preprocessing

# 读取数据
df = pd.read_table('data7.txt', names=['id', 'amount', 'income', 'datetime', 'age'])  # 读取数据文件
print (df.head(5))  # 打印输出前5条数据


      id  amount  income             datetime    age
0  15093    1390   10.40  2017-04-30 19:24:13   0-10
1  15062    4024    4.68  2017-04-27 22:44:59  70-80
2  15028    6359    3.84  2017-04-27 10:07:55  40-50
3  15012    7759    3.70  2017-04-04 07:28:18  30-40
4  15021     331    4.25  2017-04-08 11:14:00  70-80


In [26]:

# 针对时间数据的离散化
for i, signle_data in enumerate(df['datetime']):  # 循环得到索引和对应值
    single_data_tmp = pd.to_datetime(signle_data)  # 将时间转换为datetime格式
    df['datetime'][i] = single_data_tmp.weekday()  # 离散化为周几
print (df.head(5))  # 打印输出前5条数据



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """


      id  amount  income datetime    age
0  15093    1390   10.40        6   0-10
1  15062    4024    4.68        3  70-80
2  15028    6359    3.84        3  40-50
3  15012    7759    3.70        1  30-40
4  15021     331    4.25        5  70-80


In [27]:
# 针对多值离散数据的离散化
map_df = pd.DataFrame([['0-10', '0-40'], ['10-20', '0-40'], ['20-30', '0-40'], ['30-40', '0-40'], ['40-50', '40-80'],
                       ['50-60', '40-80'], ['60-70', '40-80'], ['70-80', '40-80'], ['80-90', '>80'], ['>90', '>80']],
                      columns=['age', 'age2'])  # 定义一个要转换的新区间
df_tmp = df.merge(map_df, left_on='age', right_on='age', how='inner')  # 数据框关联匹配
df = df_tmp.drop('age', 1)  # 丢弃名为age的列
print (df.head(5))  # 打印输出前5条数据



      id  amount  income datetime  age2
0  15093    1390   10.40        6  0-40
1  15064    7952    4.40        0  0-40
2  15080     503    5.72        5  0-40
3  15068    1668    3.19        5  0-40
4  15019    6710    3.20        0  0-40


In [28]:
# 针对连续数据的离散化
# 方法1：自定义分箱区间实现离散化
bins = [0, 200, 1000, 5000, 10000]  # 自定义区间边界
df['amount1'] = pd.cut(df['amount'], bins)  # 使用边界做离散化
print (df.head(5))  # 打印输出前5条数据


      id  amount  income datetime  age2        amount1
0  15093    1390   10.40        6  0-40   (1000, 5000]
1  15064    7952    4.40        0  0-40  (5000, 10000]
2  15080     503    5.72        5  0-40    (200, 1000]
3  15068    1668    3.19        5  0-40   (1000, 5000]
4  15019    6710    3.20        0  0-40  (5000, 10000]


In [29]:
# 方法2 使用聚类法实现离散化
data = df['amount']  # 获取要聚类的数据，名为amount的列
data_reshape = data.values.reshape((data.shape[0], 1))  # 转换数据形状
model_kmeans = KMeans(n_clusters=4, random_state=0)  # 创建KMeans模型并指定要聚类数量
keames_result = model_kmeans.fit_predict(data_reshape)  # 建模聚类
df['amount2'] = keames_result  # 新离散化的数据合并到原数据框
print (df.head(5))  # 打印输出前5条数据


      id  amount  income datetime  age2        amount1  amount2
0  15093    1390   10.40        6  0-40   (1000, 5000]        2
1  15064    7952    4.40        0  0-40  (5000, 10000]        1
2  15080     503    5.72        5  0-40    (200, 1000]        2
3  15068    1668    3.19        5  0-40   (1000, 5000]        2
4  15019    6710    3.20        0  0-40  (5000, 10000]        1


In [30]:
# 方法3：使用4分位数实现离散化
df['amount3'] = pd.qcut(df['amount'], 4, labels=['bad', 'medium', 'good', 'awesome'])  # 按四分位数进行分隔
df = df.drop('amount', 1)  # 丢弃名为amount的列
print (df.head(5))  # 打印输出前5条数据


      id  income datetime  age2        amount1  amount2  amount3
0  15093   10.40        6  0-40   (1000, 5000]        2      bad
1  15064    4.40        0  0-40  (5000, 10000]        1  awesome
2  15080    5.72        5  0-40    (200, 1000]        2      bad
3  15068    3.19        5  0-40   (1000, 5000]        2      bad
4  15019    3.20        0  0-40  (5000, 10000]        1  awesome


In [35]:
# 针对连续数据的二值化
binarizer_scaler = preprocessing.Binarizer(threshold=df['income'].mean())  # 建立Binarizer模型对象
income_tmp = binarizer_scaler.fit_transform(df['income'])  # Binarizer标准化转换


ValueError: Expected 2D array, got 1D array instead:
array=[ 10.4    4.4    5.72   3.19   3.2    4.21   4.34   2.55   4.44   2.85
   3.61   4.53   4.68   4.25   3.6    2.71   2.6    4.37   5.16   4.07
   4.52   5.25   6.58   3.72   5.55   4.76   3.84   3.34   3.35   4.21
   3.13   3.7    3.86   3.7    4.13   3.53   2.89   4.3    3.23   5.1
   4.57   5.07   1.76   4.45   4.22   4.81   4.04   4.29   4.43   4.67
   3.46   6.12   4.88   4.07   4.29   4.53   2.78   5.1    4.48   4.56
   2.04   3.02   2.9    3.15   4.09   2.97   2.17   3.85   3.4    4.43
   4.59   3.77   3.07   1.21   3.21   2.49   2.78   4.67   3.52   4.29
   6.36   3.3    5.43   4.97   4.58   4.81   5.69   4.39   6.69   4.59
   2.89   5.3    4.7    3.57   4.49   3.03   5.09   3.74   3.28   3.92].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
income_tmp.resize(df['income'].shape)  # 转换数据形状
df['income'] = income_tmp  # Binarizer标准化转换
print (df.head(5))  # 打印输出前5条数据