In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression

In [20]:
df = pd.read_csv('boston_housing.csv', sep = ';', decimal = ',')

In [21]:
#crim, уровень преступности на душу населения по городам.
#zn, доля земли под жилую застройку, зонированная на участки свыше 25 000 кв.м.
#indus, доля акров неторгового бизнеса на город.
#chas, фиктивная переменная реки Чарльз (= 1, если участок ограничивает реку; 0 в противном случае).
#nox, концентрация оксидов азота (частей на 10 млн).
#rm, среднее количество комнат в жилом доме.
#age, доля домовладений, построенных до 1940 года.
#dis, средневзвешенное расстояние до пяти бостонских центров занятости.
#rad, индекс доступности к радиальным магистралям.
#tax, полная ставка налога на имущество за 10 000 долларов.
#ptratio, соотношение учеников и учителей по городам.
#black, 1000(Bk - 0,63)^2, где Bk — доля чернокожих по городам.
#lstat, более низкий статус населения (в процентах).
#medv, средняя стоимость домов, занимаемых владельцами, в 1000 долларов

In [22]:
df.tail(5)

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
501,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
502,0.04527,0.0,11.93,0,0.573,6.12,76.7,2.2875,1,273,21.0,396.9,9.08,20.6
503,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.9,5.64,23.9
504,0.10959,0.0,11.93,0,0.573,6.794,89.3,2.3889,1,273,21.0,393.45,6.48,22.0
505,0.04741,0.0,11.93,0,0.573,6.03,80.8,2.505,1,273,21.0,396.9,7.88,11.9


In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   crim     506 non-null    float64
 1   zn       506 non-null    float64
 2   indus    506 non-null    float64
 3   chas     506 non-null    int64  
 4   nox      506 non-null    float64
 5   rm       506 non-null    float64
 6   age      506 non-null    float64
 7   dis      506 non-null    float64
 8   rad      506 non-null    int64  
 9   tax      506 non-null    int64  
 10  ptratio  506 non-null    float64
 11  black    506 non-null    float64
 12  lstat    506 non-null    float64
 13  medv     506 non-null    float64
dtypes: float64(11), int64(3)
memory usage: 55.5 KB


In [24]:
X_train, X_test, y_train, y_test = train_test_split(df.drop('medv',axis=1), 
                                                    df['medv'], 
                                                    test_size=0.3, 
                                                    random_state=888)

In [25]:
print(X_train.shape)
print(X_test.shape)

(354, 13)
(152, 13)


In [26]:
min_model = LinearRegression()

In [27]:
%%time
min_model.fit(X_train, y_train)

CPU times: total: 109 ms
Wall time: 137 ms


In [28]:
pred = min_model.predict(X_test)

In [29]:
test_pred = X_test.copy()


In [30]:
X_test.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
212,0.21719,0.0,10.59,1,0.489,5.807,53.8,3.6526,4,277,18.6,390.94,16.03
274,0.05644,40.0,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53
12,0.09378,12.5,7.87,0,0.524,5.889,39.0,5.4509,5,311,15.2,390.5,15.71
494,0.27957,0.0,9.69,0,0.585,5.926,42.6,2.3817,6,391,19.2,396.9,13.59
363,4.22239,0.0,18.1,1,0.77,5.803,89.0,1.9047,24,666,20.2,353.04,14.64


In [31]:
test_pred['predict'] = pred
test_pred['real'] = y_test

In [32]:
test_pred.head()

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,predict,real
212,0.21719,0.0,10.59,1,0.489,5.807,53.8,3.6526,4,277,18.6,390.94,16.03,22.332769,22.4
274,0.05644,40.0,6.41,1,0.447,6.758,32.9,4.0776,4,254,17.6,396.9,3.53,35.816878,32.4
12,0.09378,12.5,7.87,0,0.524,5.889,39.0,5.4509,5,311,15.2,390.5,15.71,20.622689,21.7
494,0.27957,0.0,9.69,0,0.585,5.926,42.6,2.3817,6,391,19.2,396.9,13.59,20.22281,24.5
363,4.22239,0.0,18.1,1,0.77,5.803,89.0,1.9047,24,666,20.2,353.04,14.64,20.52832,16.8


In [34]:
from sklearn.metrics import mean_squared_error

In [35]:
pred_nan = min_model.predict(X_test)

In [36]:
print('Ошибка RMSE c random seed 888 =',mean_squared_error(y_test, pred))

Ошибка RMSE c random seed 888 = 20.59502512160336


In [38]:
# разные random seed
pr = []


for i in range(1000):
    X_train, X_test, y_train, y_test = train_test_split(df.drop('medv',axis=1), 
                                                    df['medv'], 
                                                    test_size=0.3, 
                                                    random_state=i)
    min_model = LinearRegression()
    min_model.fit(X_train, y_train)
    pred = min_model.predict(X_test)
    #print(f'Ошибка c random seed={i}, RMSE =',mean_squared_error(y_test, pred))
    pr.append(mean_squared_error(y_test, pred))

pd.Series(pr).mean()

24.324925435872444