In [1]:
import numpy as np
import pandas as pd

In [32]:
# Load the diabetes dataset from the notebook's working directory.
data = pd.read_csv("diabetes.csv")

# Only the last expression of a cell is rendered automatically, so the preview
# must be displayed explicitly; a bare `data.head()` here would be discarded.
display(data.head())

# Column names, dtypes and non-null counts (printed to stdout).
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 442 entries, 0 to 441
Data columns (total 11 columns):
age      442 non-null float64
sex      442 non-null float64
bmi      442 non-null float64
bp       442 non-null float64
s1       442 non-null float64
s2       442 non-null float64
s3       442 non-null float64
s4       442 non-null float64
s5       442 non-null float64
s6       442 non-null float64
value    442 non-null float64
dtypes: float64(11)
memory usage: 38.1 KB


In [3]:
class LinearRegression:
    '''Linear regression trained with batch gradient descent.

    Attributes set by ``fit``
    ----
    w_: 1-D array of length (number of features + 1); ``w_[0]`` is the
        intercept and ``w_[1:]`` are the feature weights.

    loss_: list with one entry per iteration, holding the sum of squared
        errors divided by two, measured before that iteration's update.
    '''

    def __init__(self, alpha, times):
        '''Store the hyper-parameters.

        Parameters
        ----
        alpha: float, learning rate (step size of every update)

        times: int, number of gradient-descent iterations
        '''

        self.alpha = alpha
        self.times = times


    def fit(self, X, y):
        '''Fit the weights to the training data.

        Parameters
        ---
        X: array-like, [number of samples, number of features]

        y: array-like, [number of samples]; a single-column 2-D target
           [number of samples, 1] is accepted and flattened.

        Returns
        ---
        self: the fitted estimator, so calls can be chained.
        '''
        X = np.asarray(X, dtype=float)
        y = np.asarray(y, dtype=float)

        # A column-vector target would broadcast `y - y_hat` to an (n, n)
        # matrix and break the weight update, so collapse it to 1-D first.
        if y.ndim == 2 and y.shape[1] == 1:
            y = y[:, 0]

        # Initial weights are all zero; one extra slot holds the intercept.
        self.w_ = np.zeros(1 + X.shape[1])

        # Loss history: sum of (prediction - target) squared, divided by two.
        self.loss_ = []

        for _ in range(self.times):
            # Predictions with the current weights.
            y_hat = np.dot(X, self.w_[1:]) + self.w_[0]
            # Residuals between targets and predictions.
            error = y - y_hat
            # Record the loss before the weights are updated.
            self.loss_.append(np.sum(error ** 2) / 2)
            # Move against the gradient:
            #   w(j) = w(j) + alpha * sum((y - y_hat) * x(j))
            # The derivative of the loss w.r.t. w(j) is -sum(error * x(j));
            # subtracting that negative gradient gives the plus sign here.
            # The gradient is summed (not averaged) over the samples, so the
            # effective step size grows with the number of samples.
            self.w_[0] += self.alpha * np.sum(error)
            self.w_[1:] += self.alpha * np.dot(X.T, error)

        return self

    def predict(self, X):
        '''Predict targets for the given samples.

        Parameters
        ----
        X: array-like, [number of samples, number of features]

        Returns
        ---
        result: array with one prediction per sample
        '''

        X = np.asarray(X, dtype=float)
        result = np.dot(X, self.w_[1:]) + self.w_[0]
        return result

In [5]:
class StandardScaler:
    '''Standardize features: subtract the column mean, divide by the column std.'''

    def fit(self, X):
        '''Compute the mean and standard deviation of every feature column.

        Parameters
        ----
        X: array-like; statistics are taken along axis 0

        Returns
        ----
        self: the fitted scaler, so calls can be chained.
        '''

        X = np.asarray(X)
        # np.std defaults to the population standard deviation (ddof=0).
        self.std_ = np.std(X, axis=0)
        self.mean_ = np.mean(X, axis=0)
        return self

    def transform(self, X):
        '''Rescale every column to zero mean and unit variance.

        This shifts and scales the data using the statistics learned in
        ``fit``; it does not change the shape of the distribution. Columns
        whose fitted standard deviation is exactly zero (constant columns)
        are only centred, which avoids a 0/0 division producing NaN.
        '''

        std = np.asarray(self.std_, dtype=float)
        # Divide constant columns by 1 instead of 0.
        scale = np.where(std == 0.0, 1.0, std)
        if scale.ndim == 0:
            # 1-D input yields scalar statistics; keep the divisor a scalar.
            scale = scale.item()
        return (X - self.mean_) / scale

    def fit_transform(self, X):
        '''Fit on X, then return the standardized X.'''

        self.fit(X)
        return self.transform(X)

In [31]:
# Shuffle every row with a fixed seed so the split below is reproducible.
t = data.sample(n=len(data), random_state=0)

# The first 300 shuffled rows are used for training, the remainder for testing.
# The last column is the target; every other column is a feature.
train_X, train_y = t.iloc[:300, :-1], t.iloc[:300, -1]
test_X, test_y = t.iloc[300:, :-1], t.iloc[300:, -1]

# Standardize the features with statistics learned from the training rows only.
s = StandardScaler()
train_X = s.fit_transform(train_X)
test_X = s.transform(test_X)

# The target gets its own scaler, likewise fitted on the training rows only.
s2 = StandardScaler()
train_y = s2.fit_transform(train_y)
test_y = s2.transform(test_y)

# Train the gradient-descent model and predict the held-out rows.
lr = LinearRegression(alpha=0.0005, times=20)
lr.fit(train_X, train_y)
result = lr.predict(test_X)

# Show, in order: test MSE (in standardized target units), the per-iteration
# training loss, and the learned weights (intercept first).
display(np.mean((result - test_y) ** 2), lr.loss_, lr.w_)

0.5126091003753368

[150.0,
 104.81715815291793,
 92.80474751979986,
 87.81516415669932,
 84.91261574401571,
 82.96937111157278,
 81.60671703729076,
 80.63462883254861,
 79.9348637882014,
 79.42782315496875,
 79.05837427856775,
 78.78780702497518,
 78.5886955669809,
 78.44146901940647,
 78.3320765209489,
 78.25037713080032,
 78.18901826003997,
 78.14264758474377,
 78.10735505567466,
 78.08027518802791]

array([-6.99440506e-17, -6.17961984e-03, -1.34467690e-01,  2.90980613e-01,
        2.08930383e-01, -4.39682503e-02, -6.11740814e-02, -1.28225713e-01,
        1.00486757e-01,  2.17444253e-01,  1.02966863e-01])

In [30]:
# Standardize every column of the shuffled frame (target included), then show
# the covariance matrix of the result.
s3 = StandardScaler()
s3.fit(t)
t = s3.transform(t)

# The diagonal is n / (n - 1) rather than exactly 1, because the scaler divides
# by the population std (np.std, ddof=0) while DataFrame.cov uses ddof=1.
t.cov()

Unnamed: 0,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,value
age,1.002268,0.174131,0.185504,0.336187,0.260651,0.21974,-0.075351,0.204303,0.271391,0.302415,0.188315
sex,0.174131,1.002268,0.088361,0.24156,0.035357,0.142961,-0.379949,0.332868,0.150258,0.208605,0.04316
bmi,0.185504,0.088361,1.002268,0.396312,0.250344,0.261762,-0.367643,0.414745,0.44717,0.389561,0.58778
bp,0.336187,0.24156,0.396312,1.002268,0.24302,0.185979,-0.179167,0.258238,0.39437,0.391315,0.442485
s1,0.260651,0.035357,0.250344,0.24302,1.002268,0.898696,0.051636,0.543437,0.51667,0.326455,0.212503
s2,0.21974,0.142961,0.261762,0.185979,0.898696,1.002268,-0.196901,0.661313,0.319075,0.291259,0.174448
s3,-0.075351,-0.379949,-0.367643,-0.179167,0.051636,-0.196901,1.002268,-0.740167,-0.399481,-0.274318,-0.395684
s4,0.204303,0.332868,0.414745,0.258238,0.543437,0.661313,-0.740167,1.002268,0.619258,0.418158,0.431429
s5,0.271391,0.150258,0.44717,0.39437,0.51667,0.319075,-0.399481,0.619258,1.002268,0.465724,0.567167
s6,0.302415,0.208605,0.389561,0.391315,0.326455,0.291259,-0.274318,0.418158,0.465724,1.002268,0.383351
