# Preparation

In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
import pandas as pd
import numpy as np

# Toy dataset: three features plus the "Income" target, one row per person.
X = pd.DataFrame(dict(
    IDH_per_nation=[0.7, 0.87, 0.83, 0.9, 0.37],
    Labor=["Secretary", "Engineer", "Engineer", "Engineer", "Secretary"],
    Time_service=[1, 3, 5, 4, 4],
    Income=[1000, 2000, 3000, 4000, 500],
))

# Separate the features from the target; `y` is an independent copy of the
# "Income" column so later edits to X cannot alter the labels.
y = X.loc[:, "Income"].copy()
X_train = X.drop(columns=["Income"])

X_train

Unnamed: 0,IDH_per_nation,Labor,Time_service
0,0.7,Secretary,1
1,0.87,Engineer,3
2,0.83,Engineer,5
3,0.9,Engineer,4
4,0.37,Secretary,4


In [2]:
# Column groups routed to each transformer.
cat_attribs = ["Labor"]
num_attribs = ["Time_service", "IDH_per_nation"]

# Transformer order determines output column order: the one-hot columns for
# the categorical features come first, followed by the standardized numeric
# features. Columns not listed in either group are dropped.
preprocessing = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(handle_unknown="ignore"), cat_attribs),
        ("num", StandardScaler(), num_attribs),
    ],
    remainder="drop",
)

In [3]:
# Fit the encoder and scaler on the training features and transform them in
# one step; the fitted `preprocessing` object is reused later via transform().
X_train_prepared = preprocessing.fit_transform(X_train)
X_train_prepared

array([[ 0.        ,  1.        , -1.76930347, -0.17491415],
       [ 1.        ,  0.        , -0.29488391,  0.69965661],
       [ 1.        ,  0.        ,  1.17953565,  0.49387525],
       [ 1.        ,  0.        ,  0.44232587,  0.85399263],
       [ 0.        ,  1.        ,  0.44232587, -1.87261034]])

In [4]:
# Linear regression model created with scikit-learn's default settings.
lin_reg = LinearRegression()

# Train on the preprocessed features; the fitted estimator is the cell's
# last expression, so the notebook displays its parameters.
lin_reg.fit(X_train_prepared, y)

0,1,2
,"fit_intercept  fit_intercept: bool, default=True Whether to calculate the intercept for this model. If set to False, no intercept will be used in calculations (i.e. data is expected to be centered).",True
,"copy_X  copy_X: bool, default=True If True, X will be copied; else, it may be overwritten.",True
,"tol  tol: float, default=1e-6 The precision of the solution (`coef_`) is determined by `tol` which specifies a different convergence criterion for the `lsqr` solver. `tol` is set as `atol` and `btol` of :func:`scipy.sparse.linalg.lsqr` when fitting on sparse training data. This parameter has no effect when fitting on dense data. .. versionadded:: 1.7",1e-06
,"n_jobs  n_jobs: int, default=None The number of jobs to use for the computation. This will only provide speedup in case of sufficiently large problems, that is if firstly `n_targets > 1` and secondly `X` is sparse or if `positive` is set to `True`. ``None`` means 1 unless in a :obj:`joblib.parallel_backend` context. ``-1`` means using all processors. See :term:`Glossary ` for more details.",
,"positive  positive: bool, default=False When set to ``True``, forces the coefficients to be positive. This option is only supported for dense arrays. For a comparison between a linear regression model with positive constraints on the regression coefficients and a linear regression without such constraints, see :ref:`sphx_glr_auto_examples_linear_model_plot_nnls.py`. .. versionadded:: 0.24",False


## Time to see it working

In [5]:
# Quick sanity check on the first two rows. NOTE: these rows were also used
# for training, so this is NOT a held-out evaluation -- it only shows that the
# fitted pipeline and model run end to end.
#
# Slice X_train (features only) rather than X: X still contains the "Income"
# target column, and feeding it to the pipeline only worked because the
# ColumnTransformer was built with remainder='drop'.
X_test = X_train.iloc[:2]
test_labels = y.iloc[:2]

# Reuse the already-fitted preprocessing (transform, not fit_transform) so the
# same encoding and scaling learned on the training data are applied.
X_test_prepared = preprocessing.transform(X_test)

print("labels:", list(test_labels))
print("Predictions:", lin_reg.predict(X_test_prepared))

labels: [1000, 2000]
Predictions: [1121.41719745 2145.70063694]


In [6]:
# Predict on the same data the model was trained on, so the error computed
# below is a training error rather than a generalization estimate.
X_predictions = lin_reg.predict(X_train_prepared)

# Mean squared error between the true incomes and the model's predictions.
mse = mean_squared_error(y_true=y, y_pred=X_predictions)
mse


148128.9808917196

In [7]:
# Root mean squared error: the square root of the MSE, which puts the error
# back in the same units as the target.
rmse = np.sqrt(mse)
rmse

np.float64(384.8752796578648)

In [8]:
# multioutput="raw_values" returns one error per target column instead of an
# average. There is only a single target here ("Income"), so the result is a
# one-element array holding the same value as the scalar MSE computed earlier.
#
# The result is stored under its own name so the scalar `mse` is not replaced
# by an array, which would silently change `rmse` if that cell were re-run.
mse_raw_values = mean_squared_error(y, X_predictions, multioutput="raw_values")
mse_raw_values

array([148128.98089172])