In [1]:
# Загрузка библиотек
import numpy as np # для работы с массивами
import pandas as pd # для работы с DataFrame 
from sklearn import datasets # для импорта данных
import seaborn as sns # для визуализации статистических данных
import matplotlib.pyplot as plt # для построения графиков



In [2]:
# Column labels for the housing table; the file itself carries no header row.
column_names = [
    'CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE',
    'DIS', 'RAD', 'TAX', 'PTRATIO', 'B', 'LSTAT', 'PRICE',
]
# Raw string: in a regular literal '\V' is an invalid escape sequence, which
# newer Python versions warn about. The resulting path is unchanged.
# NOTE(review): this is an absolute local path; consider a configurable data directory.
# Fields are separated by runs of whitespace (regex \s+).
boston_data = pd.read_csv(r'C:\VS Code/housing.csv', header=None, delimiter=r"\s+", names=column_names)
boston_data.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT,PRICE
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.09,1,296.0,15.3,396.9,4.98,24.0
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242.0,17.8,396.9,9.14,21.6
2,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242.0,17.8,392.83,4.03,34.7
3,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222.0,18.7,394.63,2.94,33.4
4,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222.0,18.7,396.9,5.33,36.2


In [3]:
# Build the design matrix A and the target vector y
CRIM = boston_data['CRIM']
RM = boston_data['RM']
# The intercept column is sized from the data instead of a hard-coded 506,
# so the cell keeps working if the number of rows changes.
A = np.column_stack((np.ones(len(boston_data)), CRIM, RM))
# Double brackets keep y as a one-column DataFrame
y = boston_data[['PRICE']]
print(A)

[[1.0000e+00 6.3200e-03 6.5750e+00]
 [1.0000e+00 2.7310e-02 6.4210e+00]
 [1.0000e+00 2.7290e-02 7.1850e+00]
 ...
 [1.0000e+00 6.0760e-02 6.9760e+00]
 [1.0000e+00 1.0959e-01 6.7940e+00]
 [1.0000e+00 4.7410e-02 6.0300e+00]]


In [4]:
# Sanity check: dimensions of the design matrix as (rows, columns)
print(np.shape(A))

(506, 3)


In [5]:
# OLS estimate from the normal equations (A^T A) w = A^T y.
# The linear system is solved directly rather than forming inv(A^T A) explicitly,
# which is cheaper and numerically more stable.
# The result is wrapped in a one-column DataFrame (same column label as y)
# because later cells access it through .iloc and .values.
w_hat = pd.DataFrame(
    np.linalg.solve(A.T @ A, A.T @ y.to_numpy()),
    columns=y.columns,
)
print(w_hat.values)

[[-29.24471945]
 [ -0.26491325]
 [  8.39106825]]


In [6]:
# A new observation has arrived:
CRIM_new = 0.1
RM_new = 8
# Predict the typical house price: intercept + CRIM term + RM term
intercept, crim_coef, rm_coef = w_hat.iloc[0], w_hat.iloc[1], w_hat.iloc[2]
PRICE_new = intercept + crim_coef * CRIM_new + rm_coef * RM_new
print(PRICE_new.values)

[37.85733519]


In [7]:
# Same prediction in matrix form: the row [1, CRIM, RM] times the coefficient vector
new = np.array([[1, CRIM_new, RM_new]])
prediction = new @ w_hat
print('prediction:', prediction.values)

prediction: [[37.85733519]]


In [8]:
from sklearn.linear_model import LinearRegression

# Create and fit a linear regression model.
# fit_intercept=False: no separate intercept is fitted by sklearn;
# every coefficient comes from the columns of A.
model = LinearRegression(fit_intercept=False).fit(A, y)
print('w_hat:', model.coef_)
# Score the same new observation as in the manual computation
new_prediction = model.predict(new)
print('prediction:', new_prediction)

w_hat: [[-29.24471945  -0.26491325   8.39106825]]
prediction: [[37.85733519]]


In [9]:
# Prediction for another observation using rounded coefficients
CRIM_new, RM_new = 0.2, 6
w_hat = np.array([-29.3, -0.26, 8.4])
# Unpack as intercept, CRIM weight, RM weight
b0, b_crim, b_rm = w_hat
PRICE_new = b0 + b_crim * CRIM_new + b_rm * RM_new
print(PRICE_new)

21.048000000000005


In [10]:
# Summary statistics for four of the predictors
feature_cols = ['CHAS', 'LSTAT', 'CRIM', 'RM']
boston_data[feature_cols].describe()

Unnamed: 0,CHAS,LSTAT,CRIM,RM
count,506.0,506.0,506.0,506.0
mean,0.06917,12.653063,3.613524,6.284634
std,0.253994,7.141062,8.601545,0.702617
min,0.0,1.73,0.00632,3.561
25%,0.0,6.95,0.082045,5.8855
50%,0.0,11.36,0.25651,6.2085
75%,0.0,16.955,3.677083,6.6235
max,1.0,37.97,88.9762,8.78


In [11]:
# Build the observation matrix (intercept column + four predictors) and the target vector.
# The intercept column is sized from the data instead of a hard-coded 506.
A = np.column_stack((np.ones(len(boston_data)), boston_data[['CHAS', 'LSTAT', 'CRIM', 'RM']]))
y = boston_data[['PRICE']]
# OLS estimate of the coefficients without standardization.
# The normal equations (A^T A) w = A^T y are solved directly instead of forming
# the explicit inverse; the result is kept as a one-column DataFrame so that
# .values keeps working.
w_hat = pd.DataFrame(
    np.linalg.solve(A.T @ A, A.T @ y.to_numpy()),
    columns=y.columns,
)
print(w_hat.values)

[[-1.92052548]
 [ 3.9975594 ]
 [-0.58240212]
 [-0.09739445]
 [ 5.07554248]]


In [12]:
# Observation matrix WITHOUT the extra column of ones
feature_cols = ['CHAS', 'LSTAT', 'CRIM', 'RM']
A = boston_data[feature_cols]
y = boston_data[['PRICE']]
# Standardize every column: subtract its mean, then divide by the
# Euclidean norm of the centered column
A_cent = A.sub(A.mean())
A_st = A_cent.div(np.linalg.norm(A_cent, axis=0))
A_st.describe().round(2)

Unnamed: 0,CHAS,LSTAT,CRIM,RM
count,506.0,506.0,506.0,506.0
mean,-0.0,-0.0,-0.0,-0.0
std,0.04,0.04,0.04,0.04
min,-0.01,-0.07,-0.02,-0.17
25%,-0.01,-0.04,-0.02,-0.03
50%,-0.01,-0.01,-0.02,-0.0
75%,-0.01,0.03,0.0,0.02
max,0.16,0.16,0.44,0.16


In [13]:
# Euclidean length of every standardized column
column_norms = np.linalg.norm(A_st, axis=0)
print(column_norms)

[1. 1. 1. 1.]


In [14]:
# Standardize the target vector: center it, then scale the centered vector to unit norm
y_cent = y.sub(y.mean())
y_st = y_cent.div(np.linalg.norm(y_cent))

In [15]:
# OLS estimate of the standardized coefficients: inv(Gram matrix) @ A_st^T @ y_st
gram_st = A_st.T @ A_st
gram_st_inv = np.linalg.inv(gram_st)
w_hat_st = gram_st_inv @ A_st.T @ y_st
print(w_hat_st.values)

[[ 0.11039956]
 [-0.45220423]
 [-0.09108766]
 [ 0.38774848]]


In [16]:
# Gram matrix: pairwise dot products of the standardized columns
A_st.T.dot(A_st)

Unnamed: 0,CHAS,LSTAT,CRIM,RM
CHAS,1.0,-0.053929,-0.055892,0.091251
LSTAT,-0.053929,1.0,0.455621,-0.613808
CRIM,-0.055892,0.455621,1.0,-0.219247
RM,0.091251,-0.613808,-0.219247,1.0


In [17]:
# Standardize a two-element vector: subtract the mean, then scale to unit length
y = np.array([12, 8])
y_cent = y - np.mean(y)
y_st = y_cent / np.linalg.norm(y_cent)
y_st

array([ 0.70710678, -0.70710678])

In [18]:
# Pearson correlation matrix of two short vectors
v = np.array([5, 1, 2])
u = np.array([4, 2, 8])
np.corrcoef(np.vstack((v, u)))

array([[1.        , 0.05241424],
       [0.05241424, 1.        ]])

In [19]:
# Three feature vectors with six observations each
x1 = np.array([5.1, 1.8, 2.1, 10.3, 12.1, 12.6])
x2 = np.array([10.2, 3.7, 4.1, 20.5, 24.2, 24.1])
x3 = np.array([2.5, 0.9, 1.1, 5.1, 6.1, 6.3])
# Pearson correlation matrix of the first two vectors
cr = np.corrcoef(np.vstack((x1, x2)))
cr

array([[1.        , 0.99925473],
       [0.99925473, 1.        ]])

In [20]:
# Three features as a table, one column per feature
data = {
    'x1': [5.1, 1.8, 2.1, 10.3, 12.1, 12.6],
    'x2': [10.2, 3.7, 4.1, 20.5, 24.2, 24.1],
    'x3': [2.5, 0.9, 1.1, 5.1, 6.1, 6.3],
}
# Column order is taken from the dict keys: x1, x2, x3
df = pd.DataFrame(data, columns=list(data))
df

Unnamed: 0,x1,x2,x3
0,5.1,10.2,2.5
1,1.8,3.7,0.9
2,2.1,4.1,1.1
3,10.3,20.5,5.1
4,12.1,24.2,6.1
5,12.6,24.1,6.3


In [21]:
# Pairwise Pearson correlations between all columns of df
cr = df.corr(method='pearson')
cr

Unnamed: 0,x1,x2,x3
x1,1.0,0.999255,0.999837
x2,0.999255,1.0,0.999066
x3,0.999837,0.999066,1.0


In [22]:
# Rank of the correlation matrix
np.linalg.matrix_rank(np.asarray(cr))

3

In [23]:
# Determinant of the correlation matrix
np.linalg.det(np.asarray(cr))

4.862298229242007e-07

In [24]:
# Evaluate a second-degree polynomial in two variables at the point (1, 4)
x1, x2 = 1, 4
y = 10.4 + 8 * x1 + 0.5 * x2 + 3 * x1 ** 2 + 0.4 * x2 ** 2
round(y, 1)

29.8

In [25]:
x = np.array([1, 3, -2, 9])
y = np.array([3, 7, -5, 21])
# Design matrix for the quadratic model y ~ w0 + w1*x + w2*x**2.
# The columns [1, x, x**2] are generated from x itself instead of retyping
# the values and their squares by hand (x used to be defined but never used).
A = np.vander(x, N=3, increasing=True)

# OLS estimate: solve the normal equations (A^T A) w = A^T y directly
# rather than forming the explicit inverse.
w_hat = np.linalg.solve(A.T @ A, A.T @ y)
print(w_hat)

[ 0.11446013  2.46095638 -0.01608801]


In [26]:
# Observation matrix: each inner list is one column (the first is the column of ones)
A = np.array([[1, 1, 1, 1, 1],
             [5, 9, 4, 3, 5],
             [15, 18, 18, 19, 19],
             [7, 6, 7, 7, 7]
             ]).T
print(A)
y = np.array([24, 22, 35, 33, 36])
# Identity matrix sized from the number of columns of A rather than a hard-coded 4
E = np.eye(A.shape[1])
# Regularization coefficient
alpha = 1
# Least-squares estimate with Tikhonov (ridge) regularization:
# solve (A^T A + alpha*E) w = A^T y directly instead of forming the inverse
w_hat_ridge = np.linalg.solve(A.T @ A + alpha * E, A.T @ y)
print(w_hat_ridge)

[[ 1  5 15  7]
 [ 1  9 18  6]
 [ 1  4 18  7]
 [ 1  3 19  7]
 [ 1  5 19  7]]
[-0.08523045 -1.70784126  1.91141216  0.7293992 ]


In [29]:
# Product of a symmetric 3x3 matrix with its transpose
A = np.array([
    [1, 9, 4],
    [9, 4, 7],
    [4, 7, 12],
])
A.dot(A.T)

array([[ 98,  73, 115],
       [ 73, 146, 148],
       [115, 148, 209]])

In [30]:
# Eigen-decomposition of A @ A.T; only the eigenvalues are printed
gram = A @ A.T
eig_values, eig_vectors = np.linalg.eig(gram)
print('Собственные числа: \n', eig_values)

Собственные числа: 
 [391.37004987  45.77088847  15.85906166]


In [31]:
# Two features observed on four objects (.T on a 1-D array is a no-op, so it is dropped)
x1 = np.array([1, 2, 1, 1])
x2 = np.array([70, 130, 65, 60])

# Correlation matrix of x1 and x2 (off-diagonal entry rounded to four decimals)
C = np.array([
    [1, 0.9922],
    [0.9922, 1],
])

eig_values, eig_vectors = np.linalg.eig(C)
# Eigenvectors are the COLUMNS of eig_vectors. Select the one that belongs to
# the largest eigenvalue explicitly instead of assuming it is column 0.
principal = eig_vectors[:, np.argmax(eig_values)]


def _standardize(v):
    """Center v and scale the centered vector to unit Euclidean length."""
    centered = v - v.mean()
    # Divide by the norm of the CENTERED vector; dividing by the norm of the raw
    # vector does not give unit length and does not match the correlation matrix C.
    return centered / np.linalg.norm(centered)


x1_norm = _standardize(x1)
x2_norm = _standardize(x2)

# Project the standardized features onto the principal direction
x_new = x1_norm * principal[0] + x2_norm * principal[1]

x_new_norm = _standardize(x_new)

print(np.round(x_new_norm, 2))

[-0.24  0.86 -0.29 -0.33]


In [1]:
# Both names are bound to the SAME list object, so the in-place
# extension through `i` is visible through `j` as well.
j = [3]
i = j
i += j
print(i, j)

[3, 3] [3, 3]
