# Regressão Linear em Python

In [1]:
import pandas as pd
import numpy as np

In [20]:
data = pd.read_csv('C:\Python\california_housing_train.csv')

In [21]:
data.head()

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income,median_house_value
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936,66900.0
1,-114.47,34.4,19.0,7650.0,1901.0,1129.0,463.0,1.82,80100.0
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509,85700.0
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917,73400.0
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.925,65500.0


## Statsmodels

In [15]:
import statsmodels.api as sm
y = 'median_house_value'

In [22]:
data.drop([y], axis=1)

Unnamed: 0,longitude,latitude,housing_median_age,total_rooms,total_bedrooms,population,households,median_income
0,-114.31,34.19,15.0,5612.0,1283.0,1015.0,472.0,1.4936
1,-114.47,34.40,19.0,7650.0,1901.0,1129.0,463.0,1.8200
2,-114.56,33.69,17.0,720.0,174.0,333.0,117.0,1.6509
3,-114.57,33.64,14.0,1501.0,337.0,515.0,226.0,3.1917
4,-114.57,33.57,20.0,1454.0,326.0,624.0,262.0,1.9250
...,...,...,...,...,...,...,...,...
16995,-124.26,40.58,52.0,2217.0,394.0,907.0,369.0,2.3571
16996,-124.27,40.69,36.0,2349.0,528.0,1194.0,465.0,2.5179
16997,-124.30,41.84,17.0,2677.0,531.0,1244.0,456.0,3.0313
16998,-124.30,41.80,19.0,2672.0,552.0,1298.0,478.0,1.9797


In [23]:
modelo = sm.OLS(data[y], data.drop([y], axis=1))
res = modelo.fit()
print(res.summary())

                                 OLS Regression Results                                
Dep. Variable:     median_house_value   R-squared (uncentered):                   0.901
Model:                            OLS   Adj. R-squared (uncentered):              0.901
Method:                 Least Squares   F-statistic:                          1.927e+04
Date:                Mon, 05 Aug 2024   Prob (F-statistic):                        0.00
Time:                        15:04:33   Log-Likelihood:                     -2.1492e+05
No. Observations:               17000   AIC:                                  4.298e+05
Df Residuals:                   16992   BIC:                                  4.299e+05
Df Model:                           8                                                  
Covariance Type:            nonrobust                                                  
                         coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------

In [18]:
from sklearn.preprocessing import StandardScaler

In [24]:
X = data.drop([y], axis=1)
scaler = StandardScaler()
normalized_X = scaler.fit_transform(X)
normalized_X = pd.DataFrame(normalized_X, columns=X.columns)
normalized_X['intercept'] = 1

modelo = sm.OLS(data[y], normalized_X)
res = modelo.fit()
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:     median_house_value   R-squared:                       0.641
Model:                            OLS   Adj. R-squared:                  0.641
Method:                 Least Squares   F-statistic:                     3798.
Date:                Mon, 05 Aug 2024   Prob (F-statistic):               0.00
Time:                        15:04:38   Log-Likelihood:            -2.1365e+05
No. Observations:               17000   AIC:                         4.273e+05
Df Residuals:                   16991   BIC:                         4.274e+05
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                         coef    std err          t      P>|t|      [0.025      0.975]
--------------------------------------------------------------------------------------
longitude           -8.65e+04   1583

## Scikit_learn

In [25]:
data.shape

(17000, 9)

In [28]:
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

# Dividir os dados em conjunto de treino e validação
Xtrain, Xval, ytrain, yval = train_test_split(X, data[y], test_size=0.3, random_state=0)
print(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

# Criar um pipeline com normalização e regressão linear
modelo = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Treinar o modelo
modelo.fit(Xtrain, ytrain)

# Fazer previsões
p = modelo.predict(Xval)


(11900, 8) (5100, 8) (11900,) (5100,)


In [29]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(yval, p))

69057.71585050465

In [31]:
coeficientes = modelo.named_steps['regressor'].coef_
print(coeficientes)

[-86347.90780568 -91686.55531605  14820.25508291 -18738.10696631
  50487.93188416 -42342.52251271  14973.15005562  77778.8185199 ]


## Scikit-learn standardizado

In [33]:
Xtrain, Xval, ytrain, yval = train_test_split(X, data[y], test_size=0.3, random_state=0)
print(Xtrain.shape, Xval.shape, ytrain.shape, yval.shape)

# Criar um pipeline com normalização e regressão linear
modelo = Pipeline([
    ('scaler', StandardScaler()),
    ('regressor', LinearRegression())
])

# Treinar o modelo
modelo.fit(Xtrain, ytrain)

# Fazer previsões
p = modelo.predict(Xval_scaled)

(11900, 8) (5100, 8) (11900,) (5100,)




In [34]:
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(yval, p))

3818671.710461007

In [35]:
print(coeficientes)

[-86347.90780568 -91686.55531605  14820.25508291 -18738.10696631
  50487.93188416 -42342.52251271  14973.15005562  77778.8185199 ]


# Fim.