In [1]:
import pandas as pd 
import numpy as np

import matplotlib.pyplot as plt 
import seaborn as sns 

from sklearn import datasets

import statsmodels.api as sm
import statsmodels.formula.api as smf

from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import MinMaxScaler

Feature descriptions (California housing dataset):

- MedInc        median income in block group
- HouseAge      median house age in block group
- AveRooms      average number of rooms per household
- AveBedrms     average number of bedrooms per household
- Population    block group population
- AveOccup      average number of household members
- Latitude      block group latitude
- Longitude     block group longitude

In [2]:
# Fetch the California housing data as pandas objects: `df` receives the
# features and `y` the target values.
df, y = datasets.fetch_california_housing(return_X_y=True, as_frame=True)

# Portuguese names for six of the feature columns; Latitude, Longitude and the
# target column keep their original names.
# NOTE(review): 'AveBedrms' is the average number of *bedrooms*, yet it is
# mapped to 'media_banheiros' ("bathrooms"). The name is kept here because the
# later cells refer to it.
colunas = {
    'MedInc': 'receita_media_bloco',
    'HouseAge': 'idade_casa',
    'AveRooms': 'media_quartos',
    'AveBedrms': 'media_banheiros',
    'Population': 'populacao_bloco',
    'AveOccup': 'media_moradores',
}

# Append the target as the last column and apply the renaming in one chain.
df = df.assign(MedHouseVal=y).rename(columns=colunas)

In [3]:
# Show the renamed frame; the rendered output elides the middle rows with "...".
df

Unnamed: 0,receita_media_bloco,idade_casa,media_quartos,media_banheiros,populacao_bloco,media_moradores,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [5]:
# Rows whose 'receita_media_bloco' lies within three standard deviations of the
# column mean. The selection is only displayed: it is not assigned to anything,
# so `df` itself is left unchanged by this cell.
df[
    df['receita_media_bloco']
    .sub(df['receita_media_bloco'].mean())
    .abs()
    .le(3 * df['receita_media_bloco'].std())
]

Unnamed: 0,receita_media_bloco,idade_casa,media_quartos,media_banheiros,populacao_bloco,media_moradores,Latitude,Longitude,MedHouseVal
0,8.3252,41.0,6.984127,1.023810,322.0,2.555556,37.88,-122.23,4.526
1,8.3014,21.0,6.238137,0.971880,2401.0,2.109842,37.86,-122.22,3.585
2,7.2574,52.0,8.288136,1.073446,496.0,2.802260,37.85,-122.24,3.521
3,5.6431,52.0,5.817352,1.073059,558.0,2.547945,37.85,-122.25,3.413
4,3.8462,52.0,6.281853,1.081081,565.0,2.181467,37.85,-122.25,3.422
...,...,...,...,...,...,...,...,...,...
20635,1.5603,25.0,5.045455,1.133333,845.0,2.560606,39.48,-121.09,0.781
20636,2.5568,18.0,6.114035,1.315789,356.0,3.122807,39.49,-121.21,0.771
20637,1.7000,17.0,5.205543,1.120092,1007.0,2.325635,39.43,-121.22,0.923
20638,1.8672,18.0,5.329513,1.171920,741.0,2.123209,39.43,-121.32,0.847


In [None]:
# Index labels of the block groups to discard: more than 5 occupants, more than
# 10 rooms or more than 2 in 'media_banheiros' per household on average, or a
# population outside the 300-4000 range.
indexnmes = df.query(
    'media_moradores > 5 or media_quartos > 10 or media_banheiros > 2 or populacao_bloco < 300 or populacao_bloco > 4000'
).index

# Remove those rows from `df` itself, so every later cell works on the
# filtered frame. Re-running this cell is harmless: the query then matches
# nothing.
df.drop(index=indexnmes, inplace=True)


In [None]:
# Work on a copy so the unscaled `df` stays available to the later cells.
df_scalado = df.copy()

# Fit a StandardScaler on six predictors plus the target and transform them in
# one step. Latitude and Longitude are not part of the selection, so they are
# not scaled. `features` ends up as the transformed array.
scaler = StandardScaler()
features = scaler.fit_transform(
    df_scalado[[
        'receita_media_bloco',
        'idade_casa',
        'media_quartos',
        'media_banheiros',
        'populacao_bloco',
        'media_moradores',
        'MedHouseVal',
    ]].values
)

In [None]:
# Columns that were passed to the scaler, in the same order as the columns of
# the `features` array.
colunas_padronizadas = [
    'receita_media_bloco',
    'idade_casa',
    'media_quartos',
    'media_banheiros',
    'populacao_bloco',
    'media_moradores',
    'MedHouseVal',
]

# Overwrite those columns with their scaled values, then display the frame.
df_scalado[colunas_padronizadas] = features
df_scalado

In [None]:
# Right-hand-side terms of the regression, in the order they appear in the formula.
preditores = [
    'receita_media_bloco',
    'idade_casa',
    'media_quartos',
    'media_banheiros',
    'populacao_bloco',
    'media_moradores',
    'Latitude',
    'Longitude',
]

# Formula string: the target on the left of '~', the predictors joined by ' + '.
modelo = 'MedHouseVal ~ ' + ' + '.join(preditores)

In [None]:
# Fit the formula `modelo` by ordinary least squares on the standardised frame
# and show the summary of the fitted model.
reg = smf.ols(formula=modelo, data=df_scalado).fit()
reg.summary()

In [None]:
# Keep the residuals of `reg` next to the predictors so they can be plotted.
# NOTE(review): the column is called 'resid_log' although no log transform is
# applied to the target anywhere in the visible cells.
df_scalado['resid_log'] = reg.resid

# Residuals against house age, with a dashed red reference line at zero.
ax = sns.scatterplot(data=df_scalado, x='idade_casa', y='resid_log', alpha=.75)
ax.axhline(y=0, color='r', linestyle='--')

In [None]:
# Residuals of `reg` against the income column, with a dashed red reference
# line at zero.
sns.scatterplot(
    data=df_scalado,
    x='receita_media_bloco',
    y='resid_log',
    alpha=.75,
)
plt.axhline(y=0, linestyle='--', color='r')

In [None]:
# One 30-bin histogram per column of the filtered, unscaled frame.
df.hist(bins=30, edgecolor="black", figsize=(12, 10))
# Widen the gaps between subplots.
plt.subplots_adjust(wspace=0.4, hspace=0.7)

In [None]:
# Second scaled copy of `df`, this time using MinMaxScaler.
df_min = df.copy()

# Columns to rescale: six predictors plus the target. Latitude and Longitude
# are left out and keep their original values.
colunas_minmax = [
    'receita_media_bloco',
    'idade_casa',
    'media_quartos',
    'media_banheiros',
    'populacao_bloco',
    'media_moradores',
    'MedHouseVal',
]

# Fit and transform in one step. Note that `scaler` and `features` replace the
# objects of the same name created by the StandardScaler cell.
scaler = MinMaxScaler()
features = scaler.fit_transform(df_min[colunas_minmax].values)

df_min[colunas_minmax] = features


In [None]:
# Same specification as the model fitted on the standardised data, now
# estimated on the min-max scaled frame.
modelo = (
    'MedHouseVal ~ receita_media_bloco + idade_casa + media_quartos'
    ' + media_banheiros + populacao_bloco + media_moradores'
    ' + Latitude + Longitude'
)

reg1 = smf.ols(formula=modelo, data=df_min).fit()
reg1.summary()

In [None]:
# Store the residuals of the model fitted on `df_min` (`reg1`).
# The earlier version stored `reg.resid`, i.e. the residuals of the model
# fitted on the StandardScaler frame, so the plot below diagnosed the wrong
# model against the min-max scaled predictors.
# NOTE(review): the column is called 'resid_log' although no log transform is
# applied to the target anywhere in the visible cells.
df_min['resid_log'] = reg1.resid

# Residuals against house age, with a dashed red reference line at zero.
sns.scatterplot(x = 'idade_casa', y = 'resid_log', data = df_min, alpha = .75)
plt.axhline(y=0, color='r', linestyle='--')

In [None]:
# Residuals stored in df_min['resid_log'] against the min-max scaled income
# column, with a dashed red reference line at zero.
# The residuals are plotted as they are: the earlier version wrapped them in
# np.log, which is undefined for the negative residuals an OLS fit produces, so
# those points became NaN (with a RuntimeWarning) and disappeared from the
# plot. The `+ .1` shift on the x values was dropped as well; without a log it
# only offset the axis. The cell now mirrors the equivalent plot for the
# standardised data.
sns.scatterplot(x = df_min['receita_media_bloco'], y = df_min['resid_log'], alpha = .75)
plt.axhline(y=0, color='r', linestyle='--')