In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [6]:
train = pd.read_csv("./house-prices-advanced-regression-techniques/train.csv")
test = pd.read_csv("./house-prices-advanced-regression-techniques/test.csv")#archivo a modificar para obtener 
#predicción usando todas las variables

In [7]:
##vista de los datos de entrenamiento
train.head(2)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500


In [8]:
##Revisión de variables
data = pd.concat([train.drop('SalePrice', axis=1), test], keys=['train', 'test'])
data.drop(['Id'], axis=1, inplace=True)

years = ['YearBuilt', 'YearRemodAdd', 'GarageYrBlt', 'YrSold']
metrics = ['LotFrontage', 'LotArea', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
         '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'GarageArea', 'WoodDeckSF', 
         'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal']
mask = (data[years] > 2018).any(axis=1) 
data.loc[mask, 'GarageYrBlt'] = data[mask]['YearBuilt']

###separación variables numericas y categoricas
num_feats = ['LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 
             'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual', 'ExterCond', 
             'BsmtQual', 'BsmtCond', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', 
             'HeatingQC', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
             'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
             'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
             'Fireplaces', 'FireplaceQu', 'GarageYrBlt',
             'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond',
             'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
             'ScreenPorch', 'PoolArea', 'PoolQC', 'MiscVal',
             'YrSold']    

grades = ['OverallQual', 'OverallCond', 'ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond',
          'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']

literal = ['Ex', 'Gd', 'TA', 'Fa', 'Po']

num = [9, 7, 5, 3, 2]

G = dict(zip(literal, num))

data[grades] = data[grades].replace(G)

cat_feats = data.drop(num_feats, axis=1).columns

In [9]:
##Transformación logarítmica
price = np.log1p(train['SalePrice'])
#asimetría
skewed_feats = data.loc['train'][metrics].apply(lambda x: x.skew(skipna=True)) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index

data[skewed_feats] = np.log1p(data[skewed_feats])

In [10]:
#Revisió y manejo de datos faltantes
data.isnull().sum()[data.isnull().sum() > 0]
feats = ['MSZoning', 'Utilities', 'Exterior1st', 'Exterior2nd', 'Electrical', 'Functional',
         'SaleType']
model = data.loc['train'].groupby('Neighborhood')[feats].apply(lambda x: x.mode().iloc[0])

for f in feats:
    data[f].fillna(data['Neighborhood'].map(model[f]), inplace=True)
data['LotFrontage'] = data['LotFrontage'].fillna(data.loc['train', 'LotFrontage'].median()) #mediana
data['KitchenQual'].fillna(data['OverallQual'], inplace=True)#reemplazamos con otra variable

In [11]:
bsmt = ['BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 
        'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'BsmtFullBath',
        'BsmtHalfBath', 
        'TotalBsmtSF']
fire = ['Fireplaces', 'FireplaceQu']
garage = ['GarageQual', 'GarageCond', 'GarageType', 'GarageFinish', 'GarageCars', 
          'GarageArea', 'GarageYrBlt']
masn = ['MasVnrType', 'MasVnrArea']
others = ['Alley', 'Fence', 'PoolQC', 'MiscFeature']

cats = data.columns[data.dtypes == 'object']
nums = list(set(data.columns) - set(cats))

data['MasVnrType'].replace({'None': np.nan}, inplace=True)

data[cats] = data[cats].fillna('0')
data[nums] = data[nums].fillna(0)

In [12]:
data.isnull().sum().sum()#revisión final nulos 

0

In [13]:
#cambio de tipo de dato
data['MSSubClass'] = data['MSSubClass'].astype('object', copy=False)
data['MoSold'] = data['MoSold'].astype('object', copy=False)#objeto
data['BsmtFullBath'] = data['BsmtFullBath'].astype('int64', copy=False)#entero
data['BsmtHalfBath'] = data['BsmtHalfBath'].astype('int64', copy=False)
data['GarageCars'] = data['GarageCars'].astype('int64', copy=False)
data[years] = data[years].astype('int64', copy=False)

In [14]:
categorical_data = pd.concat((data.loc['train'][cat_feats], price), axis=1)#agrupar variables categoricas

low = 0.05 * data.loc['train'].shape[0] 

for feat in cat_feats:        

    order = ((categorical_data.groupby(feat).mean()).sort_values(by='SalePrice', 
                                                      ascending=False).index.values.tolist())
    for i in range(0, len(order)):
        N = (categorical_data[categorical_data[feat] == order[i]]
             .count().max())
        j = i
        while (N < low) & (N != 0):
            j += 1

            if (j > len(order) - 1):
                j = i - 1
                break
            else: 
                N += (categorical_data[categorical_data[feat] == order[j]]
                      .count().max())
        if j < i:
            lim = len(order)
        else:
            lim = j

        for k in range(i, lim):
            categorical_data.replace({feat: {order[k]: order[j]}},
                                 inplace=True)
            data.replace({feat: {order[k]: order[j]}},
                                     inplace=True)            
    uniD = data[feat].unique()
    order = categorical_data[feat].unique()

    for i in uniD:
        if i not in order:
            ind = np.argsort(order - i)[0]
            data.replace({feat: {i: order[ind]}}, inplace=True)


In [15]:
#generación dummys
for feat in categorical_data.columns[:-1]:    
    uni = categorical_data.groupby(feat).mean().sort_values(by='SalePrice').index
    if (len(uni) < 2):
            data.drop(feat, axis=1, inplace=True)
    elif len(uni) < 3:
        print("{}: {}".format(feat, uni))
        data[feat].replace({uni[0]: 0, uni[1]: 1}, inplace=True)
        data[feat] = data[feat].astype('int8')
    else:
        data[feat] = data[feat].astype('category')
finaldata = pd.get_dummies(data)
black_list = bsmt + fire + garage + masn + others
for feat in finaldata.columns:
    if ('_0' in feat) and (feat.split("_")[0] in black_list):
        finaldata.drop(feat, axis=1, inplace=True)

MSZoning: Index(['RM', 'RL'], dtype='object', name='MSZoning')
Alley: Index(['Grvl', '0'], dtype='object', name='Alley')
LotShape: Index(['Reg', 'IR1'], dtype='object', name='LotShape')
LandContour: Index(['Lvl', 'Low'], dtype='object', name='LandContour')
LandSlope: Index(['Gtl', 'Mod'], dtype='object', name='LandSlope')
Condition1: Index(['Feedr', 'Norm'], dtype='object', name='Condition1')
RoofStyle: Index(['Gable', 'Hip'], dtype='object', name='RoofStyle')
BsmtFinType2: Index(['Rec', 'Unf'], dtype='object', name='BsmtFinType2')
CentralAir: Index(['N', 'Y'], dtype='object', name='CentralAir')
Electrical: Index(['FuseA', 'SBrkr'], dtype='object', name='Electrical')
Functional: Index(['Min2', 'Typ'], dtype='object', name='Functional')
PavedDrive: Index(['N', 'Y'], dtype='object', name='PavedDrive')
Fence: Index(['MnPrv', '0'], dtype='object', name='Fence')
SaleType: Index(['WD', 'New'], dtype='object', name='SaleType')


In [16]:
X_test = finaldata.loc['test']
X_train = finaldata.loc['train']
y_train = price
m = X_train.mean()
std = X_train.std()

X_train = (X_train - m) / std
X_test = (X_test - m) / std

In [18]:
#### Modelo Regresión Lasso
from sklearn.linear_model import LassoCV
Ls = LassoCV()
Ls.fit(X_train, y_train)
y_prediccion = Ls.predict(X_test)
#Revisión de variables más importantes en la regresión para usar en modelo sencillo
maxcoef = np.argsort(-np.abs(Ls.coef_))
coef = Ls.coef_[maxcoef]
for i in range(0, 10):
    print("{:.<005} {:< 010.4e}".format(finaldata.columns[maxcoef[i]], coef[i]))

GrLivArea  1.1707e-01
OverallQual  8.0499e-02
OverallCond  4.0788e-02
GarageCars  3.7414e-02
YearBuilt  3.4131e-02
1stFlrSF  3.1023e-02
Neighborhood_NridgHt  2.9418e-02
LotArea  2.7743e-02
BsmtFinSF1  2.6273e-02
MSZoning  2.0974e-02


In [26]:

######### Modificar por usuario ###########


línea_a_predecir=1 #Poner el número de línea a predecir, 0 está reservado al encabezado


print("El precio de la casa en la línea:",línea_a_predecir ," es ",round(y_prediccion[línea_a_predecir-1]*10000,2))

El precio de la casa en la línea: 1  es  116046.01


In [38]:
#### Modelo sencillo con top 5 variables importantes #######
Ls2 = LassoCV()
Ls2.fit(train[["GrLivArea","OverallQual","OverallCond","GarageCars","YearBuilt"]], train[["SalePrice"]])


######### Modificar por usuario ###########

### Digitar las variables 
GrLivArea=896 #Above grade (ground) living area square feet
OverallQual=5 #Rates the overall material and finish of the house 1-10, 10 Very excellent 
OverallCond=6 #Rates the overall condition of the house 1-10, 10 Very excellent 
GarageCars=1 #Size of garage in car capacity
YearBuilt=1961 #Original construction date

y_est = Ls2.predict(np.asarray([[GrLivArea,OverallQual,OverallCond,GarageCars,YearBuilt]]))

print("El precio de la casa con los datos proporcionados es",y_est[0])

El precio de la casa con los datos proporcionados es 111402.6052366842


  y = column_or_1d(y, warn=True)
