In [1]:
import numpy as np
import pandas as pd

In [24]:
# Load the raw credit-card table; `data` is kept untouched so later cells can
# show the original records (including CUST_ID) for each cluster.
data = pd.read_csv("credit_card.csv")
# Working copy without the customer identifier column, which is not a feature.
t = data.drop(columns="CUST_ID")
display(t.head())
# DataFrame.info() prints its report itself and returns None, so it is called
# directly; wrapping it in display() only rendered a stray "None".
t.info()

Unnamed: 0,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
1,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
2,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
3,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 17 columns):
BALANCE                             8950 non-null float64
BALANCE_FREQUENCY                   8950 non-null float64
PURCHASES                           8950 non-null float64
ONEOFF_PURCHASES                    8950 non-null float64
INSTALLMENTS_PURCHASES              8950 non-null float64
CASH_ADVANCE                        8950 non-null float64
PURCHASES_FREQUENCY                 8950 non-null float64
ONEOFF_PURCHASES_FREQUENCY          8950 non-null float64
PURCHASES_INSTALLMENTS_FREQUENCY    8950 non-null float64
CASH_ADVANCE_FREQUENCY              8950 non-null float64
CASH_ADVANCE_TRX                    8950 non-null int64
PURCHASES_TRX                       8950 non-null int64
CREDIT_LIMIT                        8949 non-null float64
PAYMENTS                            8950 non-null float64
MINIMUM_PAYMENTS                    8637 non-null float64
PRC_FULL_PAYMENT           

None

In [25]:
# Impute the two columns that contain missing values with their column means.
# Series.mean() skips NaN, so each mean is taken over the observed values only.
credit_limit_means = t["CREDIT_LIMIT"].mean()
minimum_payments = t["MINIMUM_PAYMENTS"].mean()
# Fill through the DataFrame with a per-column mapping instead of calling
# fillna(inplace=True) on a selected column: the chained in-place form acts on
# an intermediate object and does not reliably update `t` (it warns on recent
# pandas and is a no-op under Copy-on-Write).
t = t.fillna({
    "CREDIT_LIMIT": credit_limit_means,
    "MINIMUM_PAYMENTS": minimum_payments,
})
t.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8950 entries, 0 to 8949
Data columns (total 17 columns):
BALANCE                             8950 non-null float64
BALANCE_FREQUENCY                   8950 non-null float64
PURCHASES                           8950 non-null float64
ONEOFF_PURCHASES                    8950 non-null float64
INSTALLMENTS_PURCHASES              8950 non-null float64
CASH_ADVANCE                        8950 non-null float64
PURCHASES_FREQUENCY                 8950 non-null float64
ONEOFF_PURCHASES_FREQUENCY          8950 non-null float64
PURCHASES_INSTALLMENTS_FREQUENCY    8950 non-null float64
CASH_ADVANCE_FREQUENCY              8950 non-null float64
CASH_ADVANCE_TRX                    8950 non-null int64
PURCHASES_TRX                       8950 non-null int64
CREDIT_LIMIT                        8950 non-null float64
PAYMENTS                            8950 non-null float64
MINIMUM_PAYMENTS                    8950 non-null float64
PRC_FULL_PAYMENT           

In [26]:
class KMeans:
    '''K-means clustering using Euclidean distance.

    After ``fit`` the instance exposes:

    cluster_centers_ : float ndarray with one row per cluster center.
    labels_ : float ndarray holding the cluster index (0 .. k-1) of every
        training sample. The values are whole numbers stored as floats.
    '''

    def __init__(self, k, times):
        '''Store the hyper-parameters.

        Parameters
        ----
        k : number of clusters.
        times : maximum number of assignment/update iterations.
        '''

        self.k = k
        self.times = times

    def _nearest_center(self, X):
        '''Return, as a float array, the index of the closest center for each row of X.'''

        centers = self.cluster_centers_
        dis = np.empty((len(X), len(centers)))
        # Loop over the centers rather than the samples: one vectorised pass
        # over all rows per center.
        for j, center in enumerate(centers):
            dis[:, j] = np.sqrt(np.sum((X - center) ** 2, axis=1))
        return dis.argmin(axis=1).astype(float)

    def fit(self, X):
        '''Cluster the rows of X.

        Parameters
        ----
        X : array-like with one row per sample and one column per feature.

        Raises
        ----
        ValueError : if X contains no samples.
        '''

        # Work in floating point so that centers computed from integer data
        # are not truncated when stored.
        X = np.asarray(X, dtype=float)
        if len(X) == 0:
            raise ValueError("cannot fit KMeans on an empty data set")

        # A private generator seeded with 0 yields the same draw as seeding the
        # global generator with 0, without disturbing the caller's random state.
        rng = np.random.RandomState(0)
        picks = rng.randint(0, len(X), self.k)
        # Drawing with replacement can select the same row twice, which would
        # start two identical centers; redraw without replacement in that case.
        if len(np.unique(picks)) < self.k <= len(X):
            picks = rng.choice(len(X), self.k, replace=False)
        # Fancy indexing copies the rows, so updating centers never alters X.
        self.cluster_centers_ = X[picks]
        self.labels_ = np.zeros(len(X))

        for _ in range(self.times):
            # Assignment step: attach every sample to its nearest center.
            self.labels_ = self._nearest_center(X)

            # Update step: move each center to the mean of its members.
            previous = self.cluster_centers_.copy()
            for i in range(self.k):
                members = X[self.labels_ == i]
                # An empty cluster keeps its previous center; the mean of an
                # empty selection is NaN and would corrupt later assignments.
                if len(members):
                    self.cluster_centers_[i] = members.mean(axis=0)

            # Unchanged centers mean further iterations would repeat this one.
            if np.array_equal(previous, self.cluster_centers_):
                break

    def predict(self, X):
        '''Return the index of the nearest fitted center for each row of X.

        Parameters
        ----
        X : array-like with the same number of columns as the training data.

        Returns
        ----
        Float ndarray of cluster indices, one per row of X.
        '''

        X = np.asarray(X, dtype=float)
        return self._nearest_center(X)

In [29]:
class StandardScaler:
    '''Standardise features column by column to zero mean and unit variance.'''

    def fit(self, X):
        '''Compute the mean and standard deviation of every feature column.

        Parameters
        ----
        X : array-like with one row per sample and one column per feature.
        '''

        X = np.asarray(X)
        self.std_ = np.std(X, axis=0)
        self.mean_ = np.mean(X, axis=0)

    def transform(self, X):
        '''Subtract the fitted mean from each column and divide by the fitted
        standard deviation.

        X is not converted to an array, so the result keeps whatever type the
        arithmetic on X produces (for example a DataFrame stays a DataFrame).
        Columns whose fitted standard deviation is 0 are only centred.
        '''

        # A constant column has std 0; dividing by it gives NaN/inf. Divide by
        # 1 instead so such a column becomes all zeros.
        scale = np.where(self.std_ == 0, 1.0, self.std_)
        return (X - self.mean_) / scale

    def fit_transform(self, X):
        '''Fit on X, then return the standardised X.'''

        self.fit(X)
        return self.transform(X)

In [30]:
# Standardise the features first so that columns with large numeric ranges do
# not dominate the Euclidean distances used by the clustering.
s = StandardScaler()
t = s.fit_transform(t)

# Three clusters, at most 50 iterations.
kmeans = KMeans(k=3, times=50)
kmeans.fit(t)

In [31]:
# Fitted cluster centers, one row per cluster. They are in the standardised
# feature space produced by the scaler, not in the original units.
kmeans.cluster_centers_

array([[-0.13250517,  0.38396014,  0.55319597,  0.38986912,  0.59151247,
        -0.35462745,  1.08707117,  0.62583153,  0.96430331, -0.46005718,
        -0.35288661,  0.72499417,  0.18428084,  0.15024783, -0.02776193,
         0.45766311,  0.16900625],
       [ 1.43243838,  0.38534342, -0.19872339, -0.13772596, -0.21696301,
         1.68573836, -0.50913115, -0.22007232, -0.44640401,  1.73841497,
         1.61451714, -0.25467627,  0.81766899,  0.64046137,  0.52465339,
        -0.40078194, -0.09633581],
       [-0.2983691 , -0.37018768, -0.32813152, -0.23188633, -0.34964097,
        -0.2138462 , -0.61251205, -0.37250704, -0.54476158, -0.15533952,
        -0.19566198, -0.43160306, -0.34990389, -0.27815162, -0.12363342,
        -0.20712182, -0.09054989]])

In [35]:
# First rows of the original (unscaled) records assigned to cluster 0.
data.loc[kmeans.labels_ == 0].head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
2,C10003,2495.148862,1.0,773.17,773.17,0.0,0.0,1.0,1.0,0.0,0.0,0,12,7500.0,622.066742,627.284787,0.0,12
5,C10006,1809.828751,1.0,1333.28,0.0,1333.28,0.0,0.666667,0.0,0.583333,0.0,0,8,1800.0,1400.05777,2407.246035,0.0,12
6,C10007,627.260806,1.0,7091.01,6402.63,688.38,0.0,1.0,1.0,1.0,0.0,0,64,13500.0,6354.314328,198.065894,1.0,12
7,C10008,1823.652743,1.0,436.2,0.0,436.2,0.0,1.0,0.0,1.0,0.0,0,12,2300.0,679.065082,532.03399,0.0,12
10,C10011,1293.124939,1.0,920.12,0.0,920.12,0.0,1.0,0.0,1.0,0.0,0,12,1200.0,1083.301007,2172.697765,0.0,12


In [36]:
# First rows of the original (unscaled) records assigned to cluster 1.
data.loc[kmeans.labels_ == 1].head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
1,C10002,3202.467416,0.909091,0.0,0.0,0.0,6442.945483,0.0,0.0,0.0,0.25,4,0,7000.0,4103.032597,1072.340217,0.222222,12
15,C10016,6886.213231,1.0,1611.7,0.0,1611.7,2301.491267,0.5,0.0,0.5,0.166667,4,11,8000.0,1993.439277,2109.90649,0.0,12
23,C10024,3800.151377,0.818182,4248.35,3454.56,793.79,7974.415626,1.0,0.083333,0.916667,0.333333,13,13,9000.0,9479.043842,1425.426525,0.0,12
24,C10025,5368.571219,1.0,0.0,0.0,0.0,798.949863,0.0,0.0,0.0,0.363636,4,0,6000.0,1422.726707,1657.002877,0.0,11
28,C10029,7152.864372,1.0,387.05,204.55,182.5,2236.145259,0.666667,0.166667,0.416667,0.833333,16,8,10500.0,1601.448347,1648.851345,0.0,12


In [37]:
# First rows of the original (unscaled) records assigned to cluster 2.
data.loc[kmeans.labels_ == 2].head()

Unnamed: 0,CUST_ID,BALANCE,BALANCE_FREQUENCY,PURCHASES,ONEOFF_PURCHASES,INSTALLMENTS_PURCHASES,CASH_ADVANCE,PURCHASES_FREQUENCY,ONEOFF_PURCHASES_FREQUENCY,PURCHASES_INSTALLMENTS_FREQUENCY,CASH_ADVANCE_FREQUENCY,CASH_ADVANCE_TRX,PURCHASES_TRX,CREDIT_LIMIT,PAYMENTS,MINIMUM_PAYMENTS,PRC_FULL_PAYMENT,TENURE
0,C10001,40.900749,0.818182,95.4,0.0,95.4,0.0,0.166667,0.0,0.083333,0.0,0,2,1000.0,201.802084,139.509787,0.0,12
3,C10004,1666.670542,0.636364,1499.0,1499.0,0.0,205.788017,0.083333,0.083333,0.0,0.083333,1,1,7500.0,0.0,,0.0,12
4,C10005,817.714335,1.0,16.0,16.0,0.0,0.0,0.083333,0.083333,0.0,0.0,0,1,1200.0,678.334763,244.791237,0.0,12
8,C10009,1014.926473,1.0,861.49,661.49,200.0,0.0,0.333333,0.083333,0.25,0.0,0,5,7000.0,688.278568,311.963409,0.0,12
9,C10010,152.225975,0.545455,1281.6,1281.6,0.0,0.0,0.166667,0.166667,0.0,0.0,0,3,11000.0,1164.770591,100.302262,0.0,12
