In [1]:
import numpy as np
import pandas as pd

In [8]:
# Load the dataset from the notebook's working directory.
data = pd.read_csv(r"cancer.csv")

# Report the duplicate-row check explicitly: only the last expression of a cell
# is displayed, so a bare `data.duplicated().any()` here would be silently discarded.
# If this prints True, de-duplicate with `data = data.drop_duplicates()`.
print("has duplicate rows:", data.duplicated().any())

# Preview the first rows.
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,type
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [4]:
class LogisticRegression:
    '''Binary logistic regression trained with full-batch gradient descent.

    After ``fit`` the instance exposes:

    * ``w_``    -- weight vector; ``w_[0]`` is the intercept and ``w_[1:]``
                   holds one weight per feature.
    * ``loss_`` -- list with the cross-entropy loss measured at the start of
                   every iteration (before that iteration's weight update).
    '''

    def __init__(self, alpha, times):
        '''Store the hyper-parameters.

        Parameters
        ----------
        alpha : float
            Learning rate, i.e. the step size of each gradient update.
        times : int
            Number of gradient-descent iterations performed by ``fit``.
        '''
        self.alpha = alpha
        self.times = times

    def sigmoid(self, z):
        '''Numerically stable logistic function ``1 / (1 + exp(-z))``.

        Parameters
        ----------
        z : float or array-like
            Linear score(s), ``z = X . w + b``.

        Returns
        -------
        numpy.ndarray
            Values in ``[0, 1]`` with the same shape as ``z``.
        '''
        z = np.asarray(z, dtype=float)
        # exp(-|z|) always lies in [0, 1], so neither branch below can overflow,
        # unlike a direct exp(-z) which blows up for large negative z.
        decay = np.exp(-np.abs(z))
        return np.where(z >= 0, 1.0 / (1.0 + decay), decay / (1.0 + decay))

    def fit(self, X, y):
        '''Learn the weights from training data.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Training features.
        y : array-like, shape [n_samples]
            Binary targets encoded as 0 / 1.
        '''
        X = np.asarray(X)
        y = np.asarray(y)

        # Start from all-zero weights; the extra leading slot is the intercept.
        self.w_ = np.zeros(1 + X.shape[1])

        # Cross-entropy loss history, one entry per iteration.
        self.loss_ = []

        # Probabilities are clipped into [eps, 1 - eps] for the loss only, so a
        # saturated prediction cannot produce log(0) -> -inf or 0 * -inf -> nan.
        eps = np.finfo(float).eps

        for _ in range(self.times):
            # Forward pass: predicted probability of class 1.
            z = np.dot(X, self.w_[1:]) + self.w_[0]
            p = self.sigmoid(z)

            # J(w) = -sum( y_i * log(p_i) + (1 - y_i) * log(1 - p_i) )
            p_safe = np.clip(p, eps, 1.0 - eps)
            cost = -np.sum(y * np.log(p_safe) + (1 - y) * np.log(1 - p_safe))
            self.loss_.append(cost)

            # dJ/dw_j = -sum((y - p) * x_j); stepping against the gradient
            # therefore *adds* alpha * sum((y - p) * x_j) to each weight.
            error = y - p
            self.w_[0] += self.alpha * np.sum(error)
            self.w_[1:] += self.alpha * np.dot(X.T, error)

    def predict_proba(self, X):
        '''Predict class probabilities.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Samples to score.

        Returns
        -------
        numpy.ndarray, shape [n_samples, 2]
            Column 0 is P(class 0), column 1 is P(class 1).
        '''
        X = np.asarray(X)
        z = np.dot(X, self.w_[1:]) + self.w_[0]
        p = self.sigmoid(z)
        # Make it a column so the two probability columns can be concatenated.
        p = p.reshape(-1, 1)
        return np.concatenate([1 - p, p], axis=1)

    def predict(self, X):
        '''Predict class labels.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Samples to classify.

        Returns
        -------
        numpy.ndarray, shape [n_samples]
            Predicted label (0 or 1) per sample: the index of the larger
            probability returned by ``predict_proba``.
        '''
        return np.argmax(self.predict_proba(X), axis=1)

In [5]:
class StandardScaler:
    '''Column-wise standardization: subtract the mean, divide by the standard deviation.

    After ``fit`` the instance exposes ``mean_`` and ``std_`` (population
    standard deviation), each holding one value per feature column.
    '''

    def fit(self, X):
        '''Compute the per-column mean and standard deviation of ``X``.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Samples used to estimate the statistics.
        '''
        X = np.asarray(X)
        self.std_ = np.std(X, axis=0)
        self.mean_ = np.mean(X, axis=0)

    def transform(self, X):
        '''Standardize ``X`` with the statistics learned by ``fit``.

        Each column ends up with zero mean and unit variance on the data it was
        fitted on. A constant column (zero standard deviation) is only centred,
        so it becomes all zeros instead of ``nan``/``inf`` from a division by zero.

        Parameters
        ----------
        X : array-like, shape [n_samples, n_features]
            Samples to rescale.

        Returns
        -------
        Same container type as ``X - self.mean_`` (e.g. a DataFrame stays a DataFrame).
        '''
        # Substitute 1 for zero deviations so constant features divide safely.
        scale = np.where(self.std_ == 0, 1.0, self.std_)
        return (X - self.mean_) / scale

    def fit_transform(self, X):
        '''Fit on ``X`` and return the standardized ``X`` in one call.'''
        self.fit(X)
        return self.transform(X)

In [9]:
# Print a summary of the frame: row count, column names, non-null counts and dtypes.
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
mean radius                569 non-null float64
mean texture               569 non-null float64
mean perimeter             569 non-null float64
mean area                  569 non-null float64
mean smoothness            569 non-null float64
mean compactness           569 non-null float64
mean concavity             569 non-null float64
mean concave points        569 non-null float64
mean symmetry              569 non-null float64
mean fractal dimension     569 non-null float64
radius error               569 non-null float64
texture error              569 non-null float64
perimeter error            569 non-null float64
area error                 569 non-null float64
smoothness error           569 non-null float64
compactness error          569 non-null float64
concavity error            569 non-null float64
concave points error       569 non-null float64
symmetry error             569 

In [20]:
# Number of shuffled rows used for training; the remaining rows form the test set.
TRAIN_SIZE = 400

lr = LogisticRegression(alpha=0.05, times=20)

# Shuffle every row with a fixed seed so the split is reproducible.
t = data.sample(len(data), random_state=0)

# Split into train / test; the last column is the label, all others are features.
train_X = t.iloc[:TRAIN_SIZE, :-1]
train_y = t.iloc[:TRAIN_SIZE, -1]
test_X = t.iloc[TRAIN_SIZE:, :-1]
test_y = t.iloc[TRAIN_SIZE:, -1]

# Standardize with statistics estimated on the training split only, then apply
# the same statistics to the test split.
s = StandardScaler()
train_X = s.fit_transform(train_X)
test_X = s.transform(test_X)

lr.fit(train_X, train_y)

result = lr.predict(test_X)

# Accuracy: fraction of test samples whose predicted label equals the true one.
# Compare against a plain array so the comparison is purely positional.
np.mean(result == np.asarray(test_y))



0.9822485207100594