In [23]:
import numpy as np
import pandas as pd

In [29]:
# Load the diabetes table from the working directory.
data = pd.read_csv("diabetes.csv")

# Preview the raw rows. A bare `data.head()` in the middle of a cell is
# evaluated and thrown away, so the preview has to be displayed explicitly.
display(data.head())

# Prepend a constant column of 1.0 named "Intercept" as the first column.
data.insert(0, "Intercept", 1.0)

In [25]:
# Report whether any row is an exact duplicate of an earlier one. Only the
# last expression of a cell is rendered automatically, so this intermediate
# result must be displayed explicitly or it is silently dropped.
display(data.duplicated().any())

# Number of non-null entries in each column.
data.count()

Intercept    442
age          442
sex          442
bmi          442
bp           442
s1           442
s2           442
s3           442
s4           442
s5           442
s6           442
value        442
dtype: int64

In [83]:
class LinearRegression:
    """Ordinary least-squares linear regression.

    No intercept is added automatically: ``X`` is used exactly as given, so a
    bias term requires a constant column in ``X``.

    Attributes
    ----------
    w_ : np.matrix of shape (n_features, 1)
        Weights learned by ``fit``. Stored as a column ``np.matrix`` so that
        expressions of the form ``row_matrix * model.w_`` keep working.
    """

    def fit(self, X, y):
        """Find the weights minimising the squared error ``||X w - y||^2``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Feature matrix. A 1-D input is treated as a single sample.
        y : array-like holding n_samples values
            Targets; reshaped into a single column.

        Raises
        ------
        ValueError
            If ``X`` and ``y`` do not contain the same number of samples.
        """
        X = np.atleast_2d(np.asarray(X, dtype=float))
        y = np.asarray(y, dtype=float).reshape(-1, 1)
        if X.shape[0] != y.shape[0]:
            raise ValueError(
                "X and y have inconsistent numbers of samples: "
                f"{X.shape[0]} != {y.shape[0]}"
            )

        # Solve the least-squares problem directly instead of evaluating
        # (X^T X)^-1 X^T y. Explicitly inverting X^T X raises LinAlgError when
        # columns are collinear (or there are fewer samples than features)
        # and amplifies rounding error; lstsq returns the minimum-norm
        # solution in those cases and the same weights otherwise.
        coef, *_ = np.linalg.lstsq(X, y, rcond=None)
        self.w_ = np.asmatrix(coef)

    def predict(self, X):
        """Return predictions ``X w`` as a flat 1-D ndarray.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Samples to predict. A 1-D input is treated as a single sample.

        Raises
        ------
        AttributeError
            If called before ``fit``.
        """
        if not hasattr(self, "w_"):
            raise AttributeError(
                "LinearRegression has no fitted weights; call fit() before predict()"
            )
        X = np.atleast_2d(np.asarray(X, dtype=float))
        return (X @ np.asarray(self.w_)).ravel()
        

In [85]:
class StandardScaler:
    """Standardise features column by column to zero mean and unit variance.

    Attributes
    ----------
    mean_ : ndarray
        Per-column mean computed by ``fit``.
    std_ : ndarray
        Per-column standard deviation computed by ``fit`` with ``np.std``
        (its default, the population standard deviation).
    """

    def fit(self, X):
        """Compute the mean and standard deviation of every feature column.

        Parameters
        ----------
        X : array-like
            Samples, one row per observation.
        """
        X = np.asarray(X)
        self.std_ = np.std(X, axis=0)
        self.mean_ = np.mean(X, axis=0)

    def transform(self, X):
        """Centre each column on the fitted mean and divide by the fitted std.

        Columns whose fitted standard deviation is exactly zero (constant
        columns, such as an all-ones intercept column) are only centred, so
        they come out as zeros rather than NaN from a 0/0 division.

        Parameters
        ----------
        X : array-like
            Samples with the same columns as those passed to ``fit``.
        """
        # Dividing by 1 leaves the centred (all-zero) constant column intact.
        scale = np.where(self.std_ == 0, 1.0, self.std_)
        return (X - self.mean_) / scale

    def fit_transform(self, X):
        """Fit on ``X`` and return the standardised ``X`` in one call."""
        self.fit(X)
        return self.transform(X)

In [88]:
# Shuffle all rows reproducibly: sampling len(data) rows with a fixed seed
# yields a permutation of the whole table.
t = data.sample(n=len(data), random_state=0)

# Split into train/test groups. The last column is the target and every
# earlier column is a feature; the first 350 shuffled rows are used for
# training and the remainder are held out for testing.
n_train = 350
features, target = t.iloc[:, :-1], t.iloc[:, -1]
train_X, test_X = features.iloc[:n_train], features.iloc[n_train:]
train_y, test_y = target.iloc[:n_train], target.iloc[n_train:]

for preview in (data, train_X, train_y):
    display(preview.head())

# Fit on the training rows, then predict both splits.
lr = LinearRegression()
lr.fit(train_X, train_y)
result = lr.predict(test_X)
result_train = lr.predict(train_X)

display(lr.w_)

# Mean squared error: held-out rows first, then the training rows.
for predicted, actual in ((result, test_y), (result_train, train_y)):
    display(np.mean((predicted - actual) ** 2))

Unnamed: 0,Intercept,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,value
0,1.0,0.038076,0.05068,0.061696,0.021872,-0.044223,-0.034821,-0.043401,-0.002592,0.019908,-0.017646,151.0
1,1.0,-0.001882,-0.044642,-0.051474,-0.026328,-0.008449,-0.019163,0.074412,-0.039493,-0.06833,-0.092204,75.0
2,1.0,0.085299,0.05068,0.044451,-0.005671,-0.045599,-0.034194,-0.032356,-0.002592,0.002864,-0.02593,141.0
3,1.0,-0.089063,-0.044642,-0.011595,-0.036656,0.012191,0.024991,-0.036038,0.034309,0.022692,-0.009362,206.0
4,1.0,0.005383,-0.044642,-0.036385,0.021872,0.003935,0.015596,0.008142,-0.002592,-0.031991,-0.046641,135.0


Unnamed: 0,Intercept,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6
362,1.0,0.019913,0.05068,0.104809,0.070073,-0.035968,-0.026679,-0.024993,-0.002592,0.003712,0.040343
249,1.0,-0.01278,-0.044642,0.060618,0.052858,0.047965,0.029375,-0.017629,0.034309,0.070211,0.007207
271,1.0,0.038076,0.05068,0.008883,0.04253,-0.042848,-0.021042,-0.039719,-0.002592,-0.018118,0.007207
435,1.0,-0.01278,-0.044642,-0.023451,-0.040099,-0.016704,0.004636,-0.017629,-0.002592,-0.038459,-0.038357
400,1.0,-0.023677,-0.044642,0.045529,0.09073,-0.01808,-0.035447,0.07073,-0.039493,-0.034524,-0.009362


362    321.0
249    215.0
271    127.0
435     64.0
400    175.0
Name: value, dtype: float64

matrix([[  151.56649067],
        [  -33.90272827],
        [ -239.31937112],
        [  507.01463092],
        [  343.42828334],
        [-1137.27388873],
        [  742.29664814],
        [  269.46938112],
        [  243.65715808],
        [  791.58110405],
        [  116.06024853]])

2753.8332301706237

2911.28091321094

In [80]:
# Spot check: multiply the training row at position 50 (kept 2-D by the
# list index) by the fitted weight column to get its prediction.
sample_row = np.asmatrix(train_X.iloc[[50]])
sample_row * lr.w_

matrix([[156.97502488]])

In [87]:
# Pairwise covariance between every pair of columns of the shuffled frame.
cov_table = t.cov()
cov_table

Unnamed: 0,Intercept,age,sex,bmi,bp,s1,s2,s3,s4,s5,s6,value
Intercept,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
age,0.0,0.002268,0.000394,0.00042,0.000761,0.00059,0.000497,-0.00017,0.000462,0.000614,0.000684,0.689758
sex,0.0,0.000394,0.002268,0.0002,0.000547,8e-05,0.000323,-0.00086,0.000753,0.00034,0.000472,0.158085
bmi,0.0,0.00042,0.0002,0.002268,0.000897,0.000566,0.000592,-0.000832,0.000938,0.001012,0.000881,2.152914
bp,0.0,0.000761,0.000547,0.000897,0.002268,0.00055,0.000421,-0.000405,0.000584,0.000892,0.000885,1.620729
s1,0.0,0.00059,8e-05,0.000566,0.00055,0.002268,0.002033,0.000117,0.001229,0.001169,0.000739,0.778355
s2,0.0,0.000497,0.000323,0.000592,0.000421,0.002033,0.002268,-0.000445,0.001496,0.000722,0.000659,0.638967
s3,0.0,-0.00017,-0.00086,-0.000832,-0.000405,0.000117,-0.000445,0.002268,-0.001675,-0.000904,-0.000621,-1.449309
s4,0.0,0.000462,0.000753,0.000938,0.000584,0.001229,0.001496,-0.001675,0.002268,0.001401,0.000946,1.580234
s5,0.0,0.000614,0.00034,0.001012,0.000892,0.001169,0.000722,-0.000904,0.001401,0.002268,0.001054,2.077412
