In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, root_mean_squared_error, r2_score

In [2]:
# Read the training split from disk and show it as the cell output.
boston_train = pd.read_csv(
    filepath_or_buffer='../datasets/boston_housing/train.csv',
)
boston_train

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat,medv
0,1,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98,24.0
1,2,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14,21.6
2,4,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94,33.4
3,5,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33,36.2
4,7,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43,22.9
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,500,0.17783,0.0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10,17.5
329,502,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67,22.4
330,503,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08,20.6
331,504,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64,23.9


In [3]:
def preprocess(data: pd.DataFrame, id_column: str = "ID") -> tuple[pd.DataFrame, pd.Series]:
    """Separate the identifier column from the remaining columns of a frame.

    Parameters
    ----------
    data : pd.DataFrame
        Frame that contains the identifier column. It is never modified.
    id_column : str, default "ID"
        Name of the identifier column to split off.

    Returns
    -------
    features : pd.DataFrame
        Independent copy of ``data`` without ``id_column``.
    ids : pd.Series
        Copy of the identifier column, carrying the same index as ``data``.

    Raises
    ------
    KeyError
        If ``id_column`` is not a column of ``data``.
    """
    # Copy the identifiers so the returned Series does not alias the caller's frame.
    ids = data[id_column].copy()
    # Non-inplace drop; the explicit copy keeps the result independent of the
    # input regardless of whether pandas hands back a view or a copy here.
    features = data.drop(columns=[id_column]).copy()
    return features, ids

In [5]:
# Separate the training frame into the feature columns and the regression
# target ("medv"), the latter as a plain NumPy array.
x_train = boston_train.drop(columns=["medv"])
y_train = boston_train["medv"].to_numpy()

In [6]:
# Remove the "ID" column from the training features; the IDs come back
# separately as x_train_ids.
x_train_p, x_train_ids = preprocess(x_train)

In [7]:
# Show the preprocessed training features (the frame without the "ID" column).
x_train_p

Unnamed: 0,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,0.00632,18.0,2.31,0,0.538,6.575,65.2,4.0900,1,296,15.3,396.90,4.98
1,0.02731,0.0,7.07,0,0.469,6.421,78.9,4.9671,2,242,17.8,396.90,9.14
2,0.03237,0.0,2.18,0,0.458,6.998,45.8,6.0622,3,222,18.7,394.63,2.94
3,0.06905,0.0,2.18,0,0.458,7.147,54.2,6.0622,3,222,18.7,396.90,5.33
4,0.08829,12.5,7.87,0,0.524,6.012,66.6,5.5605,5,311,15.2,395.60,12.43
...,...,...,...,...,...,...,...,...,...,...,...,...,...
328,0.17783,0.0,9.69,0,0.585,5.569,73.5,2.3999,6,391,19.2,395.77,15.10
329,0.06263,0.0,11.93,0,0.573,6.593,69.1,2.4786,1,273,21.0,391.99,9.67
330,0.04527,0.0,11.93,0,0.573,6.120,76.7,2.2875,1,273,21.0,396.90,9.08
331,0.06076,0.0,11.93,0,0.573,6.976,91.0,2.1675,1,273,21.0,396.90,5.64


In [8]:
# Instantiate scikit-learn's LinearRegression with its default hyperparameters.
model = LinearRegression()

In [9]:
# Fit the regression on the preprocessed training features against the target.
model.fit(X=x_train_p, y=y_train)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


In [10]:
# Read the fitted intercept and the per-feature coefficient vector off the
# model, then display both as a tuple.
b_0 = model.intercept_
k_0 = model.coef_
(b_0, k_0)

(np.float64(34.0454377635062),
 array([-5.24893379e-02,  4.74448677e-02,  5.38552422e-02,  3.78486439e+00,
        -1.57396571e+01,  3.76883175e+00, -4.62660241e-03, -1.54882312e+00,
         3.28967093e-01, -1.28664959e-02, -8.56975746e-01,  1.16659048e-02,
        -6.00315456e-01]))

In [11]:
# Read the test split from disk and show it as the cell output.
boston_test = pd.read_csv(
    filepath_or_buffer='../datasets/boston_housing/test.csv',
)
boston_test

Unnamed: 0,ID,crim,zn,indus,chas,nox,rm,age,dis,rad,tax,ptratio,black,lstat
0,3,0.02729,0.0,7.07,0,0.469,7.185,61.1,4.9671,2,242,17.8,392.83,4.03
1,6,0.02985,0.0,2.18,0,0.458,6.430,58.7,6.0622,3,222,18.7,394.12,5.21
2,8,0.14455,12.5,7.87,0,0.524,6.172,96.1,5.9505,5,311,15.2,396.90,19.15
3,9,0.21124,12.5,7.87,0,0.524,5.631,100.0,6.0821,5,311,15.2,386.63,29.93
4,10,0.17004,12.5,7.87,0,0.524,6.004,85.9,6.5921,5,311,15.2,386.71,17.10
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
168,496,0.17899,0.0,9.69,0,0.585,5.670,28.8,2.7986,6,391,19.2,393.29,17.60
169,497,0.28960,0.0,9.69,0,0.585,5.390,72.9,2.7986,6,391,19.2,396.90,21.14
170,499,0.23912,0.0,9.69,0,0.585,6.019,65.3,2.4091,6,391,19.2,396.90,12.92
171,501,0.22438,0.0,9.69,0,0.585,6.027,79.7,2.4982,6,391,19.2,396.90,14.33


In [12]:
# Apply the same preprocessing used for the training data: drop "ID" from the
# test features and keep the IDs, which are later paired with the predictions.
boston_test_p, boston_test_ids = preprocess(boston_test)

In [13]:
# Predict the target for every row of the preprocessed test features and
# display the resulting array.
boston_preds = model.predict(X=boston_test_p)
boston_preds

array([30.59959405, 25.10045908, 18.15899329,  9.30347646, 17.68972915,
       16.65182092, 18.51882129, 15.23793478, 12.77539026, 15.09480043,
       19.32857545, 20.7603967 ,  7.25650407, 13.675262  , 23.76968476,
       22.16124319, 23.23698123, 28.24020488,  6.95192158, 27.57212726,
       20.94591814, 23.88416113, 20.51730788, 21.71863158, 20.92650021,
       22.50487235, 26.03767504, 27.13701546, 29.18599823, 28.34345301,
       35.62031213, 35.20970596, 32.03584105, 21.36328231, 18.15874459,
       20.68968026, 20.21639887, 20.06733566, 19.80862845, 22.80952871,
       14.24251311, 20.5551442 , 20.67918759, 17.47377758, 13.27568977,
       11.76625287,  8.09536665, 15.51847406, 18.62611878, 21.6956843 ,
       20.96229314, 33.66835399, 42.31009306, 26.23920334, 21.98840175,
       24.15677069, 32.20837756, 40.51664053, 35.35519512, 33.70726511,
       29.78892509, 36.76125057, 17.13313212, 16.93836757, 22.7582338 ,
       23.42171226, 24.61986453, 28.27753006, 25.34688276, 30.62

In [14]:
# Pair each test-row ID with its predicted "medv" value, keyed by column name.
id_preds = dict(ID=boston_test_ids, medv=boston_preds)

In [15]:
# Assemble the ID/prediction mapping into a two-column frame and display it.
result = pd.DataFrame(data=id_preds)
result

Unnamed: 0,ID,medv
0,3,30.599594
1,6,25.100459
2,8,18.158993
3,9,9.303476
4,10,17.689729
...,...,...
168,496,16.763294
169,497,13.415179
170,499,21.361446
171,501,20.341302


In [16]:
# Write the predictions to a CSV file, leaving out the frame's index column.
result.to_csv(
    path_or_buf='boston_housing_submission_two.csv',
    index=False,
)