In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import scipy.stats as stats

In [35]:
dataset = pd.read_csv("train_HP.csv")

In [22]:
def getNanColNames(df):
    colnames = df.columns
    cols_con_na = []
    for col in colnames:
        if(df[col].isnull().sum() > 0):
            cols_con_na.append(col)
    return cols_con_na

In [23]:
def plot_density_variable(df, variable):
    
    plt.figure(figsize = (15,6))
    plt.subplot(121)
    df[variable].hist(bins=30)
    plt.title(variable)
    
    plt.subplot(122)
    stats.probplot(df[variable], dist="norm", plot=plt)
    plt.show()

In [24]:
def getContinuesCols(df):
    colnames = df.columns
    numeric_continues_vars = []
    for col in colnames:
        unique_values =len (df[col].unique())
        if((df[col].dtype != 'object') and (unique_values > 30)):
            numeric_continues_vars.append(col)
    return numeric_continues_vars

### Verificación de NaN

In [26]:
#Columnas con NaN

cols_con_nan = getNanColNames(dataset)
cols_con_nan

['LotFrontage',
 'Alley',
 'MasVnrType',
 'MasVnrArea',
 'BsmtQual',
 'BsmtCond',
 'BsmtExposure',
 'BsmtFinType1',
 'BsmtFinType2',
 'Electrical',
 'FireplaceQu',
 'GarageType',
 'GarageYrBlt',
 'GarageFinish',
 'GarageQual',
 'GarageCond',
 'PoolQC',
 'Fence',
 'MiscFeature']

In [27]:
# Tipo de columnas con NaN

dataset[cols_con_nan].dtypes

LotFrontage     float64
Alley            object
MasVnrType       object
MasVnrArea      float64
BsmtQual         object
BsmtCond         object
BsmtExposure     object
BsmtFinType1     object
BsmtFinType2     object
Electrical       object
FireplaceQu      object
GarageType       object
GarageYrBlt     float64
GarageFinish     object
GarageQual       object
GarageCond       object
PoolQC           object
Fence            object
MiscFeature      object
dtype: object

In [28]:
# Proporción de NaN

dataset[cols_con_nan].isnull().mean()

LotFrontage     0.177397
Alley           0.937671
MasVnrType      0.005479
MasVnrArea      0.005479
BsmtQual        0.025342
BsmtCond        0.025342
BsmtExposure    0.026027
BsmtFinType1    0.025342
BsmtFinType2    0.026027
Electrical      0.000685
FireplaceQu     0.472603
GarageType      0.055479
GarageYrBlt     0.055479
GarageFinish    0.055479
GarageQual      0.055479
GarageCond      0.055479
PoolQC          0.995205
Fence           0.807534
MiscFeature     0.963014
dtype: float64

In [36]:
#Comprobación de datos nulos antes de imputar media o mediana:

dataset['LotFrontage'].isnull().sum()

259

### Imputación de Media y Mediana

In [38]:
#Se imputa la media o media según la mejor correlación.

meanValue = np.round(dataset['LotFrontage'].mean(), 0)
medianValue = np.round(dataset['LotFrontage'].median(), 0)

dataset_FL_meanImp = dataset['LotFrontage'].fillna(meanValue)
dataset_FL_medianImp = dataset['LotFrontage'].fillna(medianValue)

valor_media = np.corrcoef(dataset_FL_meanImp, dataset['LotFrontage'])[0,1]
valor_mediana = np.corrcoef(dataset_FL_medianImp, dataset['LotFrontage'])[0,1]

if (valor_media > valor_mediana):
    dataset['LotFrontage'] = dataset['LotFrontage'].fillna(meanValue)
else:
    dataset['LotFrontage'] = dataset['LotFrontage'].fillna(medianValue)


In [39]:
#Comprobación de datos nulos luego de imputar la media o mediana:

dataset['LotFrontage'].isnull().sum()

0

In [40]:
dataset_temp = dataset

### Transformaciones

In [43]:
# Transformación Logaritmica

dataset_temp['LotArea_log'] = np.log(dataset['LotArea'])
np.corrcoef(dataset_temp['LotArea_log'], dataset_temp['SalePrice'])[0, 1]

0.3885202679345965

In [44]:
# Tranformación Inversa

dataset_temp['LotArea_inv'] = (1 / dataset['LotArea'])
np.corrcoef(dataset_temp['LotArea_inv'], dataset_temp['SalePrice'])[0, 1]

-0.2950744036106421

In [45]:
#Tranformación Polinomial de Orden 2

dataset_temp['LotArea_quadratic'] = (dataset['LotArea']**2)
np.corrcoef(dataset_temp['LotArea_quadratic'], dataset_temp['SalePrice'])[0, 1]

0.11446844700039834

In [47]:
#Transformación de Box Cox:

dataset_temp['LotArea_boxCox'], lambdaX = stats.boxcox(dataset_temp['LotArea'])
np.corrcoef(dataset_temp['LotArea_boxCox'], dataset_temp['SalePrice'])[0, 1]


0.38951441164829037

In [48]:
#Transformación de YeoJohnson

dataset_temp['LotArea_YJ'], lambdaX = stats.yeojohnson(dataset_temp['LotArea'])
np.corrcoef(dataset_temp['LotArea_YJ'], dataset_temp['SalePrice'])[0, 1]

0.3895178427128208

In [49]:
# La correlación mas alta es la de YeoJohnson con 0.389517, entonces:

dataset['LotArea_YJ'], lambdaX = stats.yeojohnson(dataset['LotArea'])
dataset

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,MoSold,YrSold,SaleType,SaleCondition,SalePrice,LotArea_log,LotArea_inv,LotArea_quadratic,LotArea_boxCox,LotArea_YJ
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,2,2008,WD,Normal,208500,9.041922,0.000118,71402500,10.433680,10.427531
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,5,2007,WD,Normal,181500,9.169518,0.000104,92160000,10.602809,10.596445
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,9,2008,WD,Normal,223500,9.328123,0.000089,126562500,10.813973,10.807340
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,2,2006,WD,Abnorml,140000,9.164296,0.000105,91202500,10.595874,10.589519
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,12,2008,WD,Normal,250000,9.565214,0.000070,203347600,11.131570,11.124530
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,...,8,2007,WD,Normal,175000,8.976768,0.000126,62678889,10.347575,10.341536
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,...,2,2010,WD,Normal,210000,9.486076,0.000076,173580625,11.025301,11.018398
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,...,5,2010,WD,Normal,266500,9.109636,0.000111,81757764,10.523351,10.517089
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,...,4,2010,WD,Normal,142125,9.181632,0.000103,94420089,10.618900,10.612516


### Exportar archivo a CSV

In [50]:
dataset.to_csv('Dataset_final.csv')