In [1]:
import os
import sys
import re
import random
import math
import scipy
import numpy as np
import pandas as pd
from time import time
%matplotlib inline
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import matplotlib.gridspec as gridspec
import matplotlib.colors as colors
import matplotlib.cm as cm

from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import LabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.model_selection import KFold # import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import export_graphviz
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel
from sklearn.pipeline import Pipeline


from mpl_toolkits.mplot3d import Axes3D
# Colormaps kept as module-level globals: the 'RdYlGn' map looked up by name,
# and matplotlib's coolwarm map.
cmapRdYlGn = cm.get_cmap('RdYlGn')
cmap = cm.coolwarm


# Folder prefix for the input files; the cells below build paths by appending
# "/<file name>" to it.
challenge_data_folder = "./challenge_data"

from minisom import MiniSom



## Cleaning

In [15]:
# Load the training data. keep_default_na=False disables pandas' default NA
# filter, so the literal string "NA" stays in the frame instead of becoming NaN.
# The "_Categ" suffix marks that categorical features are still untouched
# (no label encoding has been applied).
pdAllData_Categ = pd.read_csv(
    challenge_data_folder+"/train.csv",
    keep_default_na=False,
)

# Part of the data schema as inferred by pandas: column name -> dtype.
columnTypesDict = pdAllData_Categ.dtypes.to_dict()
# Names of all features, in the order of the inferred schema.
columnNames = list(columnTypesDict)

At first, the only information we have about the dataset is the description file, so we matched the data against it to make sure the data is clean and unambiguous. We first match the feature names, and then the values listed for each feature in the description file against the values actually found in the dataset.

In [35]:
##Data cleaning
def dataCleaning(description_file_Path):
    """Check the data for values that the given description file does not mention.

    Relies on the notebook globals ``challenge_data_folder``, ``columnNames``,
    ``columnTypesDict`` and ``pdAllData_Categ``.

    Parameters
    ----------
    description_file_Path : str
        Path of the description file; it is appended as-is to
        ``challenge_data_folder``.

    Returns
    -------
    SpecificVal : dict
        Feature name -> list of the values listed for it in the description
        file (first tab-separated field of each value line, stripped).
    NoSpecificValfeatures : list
        Features found in the description file with no listed values.
    dicBadValues : list
        One ``[feature, {value: count}]`` entry per feature that has suspicious
        values: values absent from the description for features in
        ``SpecificVal``, and the string ``'NA'`` for the other features.
    """
    pathDescr = challenge_data_folder + description_file_Path
    # Context manager: the original left the file handle open.
    with open(pathDescr) as descriptionFile:
        txtDescripData = descriptionFile.readlines()
    nbHeader = 7  # number of leading lines of the description file to skip
    NoHeaderTxtDescripData = txtDescripData[nbHeader:]

    # Find, for every feature, the description lines that start with "<feature>:".
    # startswith() is the same test as the former split()-based one, but it cannot
    # raise IndexError when nothing follows the feature name on the line.
    columnIndex = []
    for feature in columnNames:
        if feature == 'Id':
            continue
        for index, line in enumerate(NoHeaderTxtDescripData):
            if line.startswith(feature + ':'):
                columnIndex.append((feature, index))

    NoSpecificValfeatures = []
    SpecificVal = {}  # feature name -> values specified in the description file

    # Collect the value lines located between one feature line and the next.
    # NOTE(review): this assumes the description file lists the features in the
    # same order as the data columns -- confirm for new description files.
    for position, (feature, featureIndex) in enumerate(columnIndex):
        startLine = featureIndex + 2  # values start two lines below the feature line (RTF layout)
        if position == len(columnIndex) - 1:
            endLine = len(NoHeaderTxtDescripData) - 1
        else:
            endLine = columnIndex[position + 1][1] - 1
        if endLine <= startLine:  # no specified value in the description file
            NoSpecificValfeatures.append(feature)
            continue
        SpecificVal[feature] = [
            NoHeaderTxtDescripData[i].split('\t')[0].strip()
            for i in range(startLine, endLine)
        ]

    dicBadValues = []

    # Features with specified values: flag every value missing from the description.
    for factor in SpecificVal:
        # Listed values are parsed as text; convert them when pandas inferred integers.
        if columnTypesDict[factor] == 'int64':
            valSpec = [int(i) for i in SpecificVal[factor]]
        else:
            valSpec = SpecificVal[factor]
        # Column-wise test, computed once (it used to be a row-wise apply run twice).
        notListed = pdAllData_Categ[factor].map(lambda value: value not in valSpec).astype(bool)
        potentialBadValues = pdAllData_Categ[notListed]
        if len(potentialBadValues) > 0:
            dicBadValues.append(
                [factor, potentialBadValues.groupby(factor)[factor].count().to_dict()]
            )

    # Features without specified values (areas, years, ...): flag the 'NA' strings.
    # The previous dtype-dependent conversion never influenced the result: a
    # numeric value never equals 'NA', and for every other dtype the extra
    # check was always true.
    for feature in NoSpecificValfeatures:
        isNA = pdAllData_Categ[feature].map(lambda value: value == 'NA').astype(bool)
        pdInter = pdAllData_Categ[isNA]
        if len(pdInter) > 0:
            dicBadValues.append(
                [feature, pdInter.groupby(feature)[feature].count().to_dict()]
            )
    return SpecificVal,NoSpecificValfeatures,dicBadValues

In [37]:
# Run the consistency check against the description file "Bad_Description.rtf".
bad_description_file_Path = "/Bad_Description.rtf"
bad_specificVal,bad_noSpecificValfeatures,bad_dicBadValues = dataCleaning(bad_description_file_Path)
# Display the [feature, {value: count}] entries returned as the third element.
bad_dicBadValues

[['MSZoning', {'C (all)': 9}],
 ['Neighborhood', {'NAmes': 179}],
 ['BldgType', {'2fmCon': 28, 'Duplex': 41, 'Twnhs': 37}],
 ['Exterior2nd', {'Brk Cmn': 7, 'CmentBd': 49, 'Wd Shng': 29}],
 ['MasVnrType', {'NA': 6}],
 ['LotFrontage', {'NA': 210}],
 ['MasVnrArea', {'NA': 6}],
 ['GarageYrBlt', {'NA': 67}]]

In [36]:
# Run the same check against "Data description.rtf".
# NOTE: this binds the global SpecificVal, which encodeCategoricalFeatures reads
# to decide which object columns get one-hot encoded.
description_file_Path = "/Data description.rtf"
SpecificVal,NoSpecificValfeatures,dicBadValues = dataCleaning(description_file_Path)
# Features for which the description file lists no values.
print(NoSpecificValfeatures)
dicBadValues

['LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'FullBath', 'HalfBath', 'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 'YrSold']


[['MSZoning', {'C (all)': 9}],
 ['BldgType', {'Twnhs': 37}],
 ['MasVnrType', {'NA': 6}],
 ['LotFrontage', {'NA': 210}],
 ['MasVnrArea', {'NA': 6}],
 ['GarageYrBlt', {'NA': 67}]]

In [20]:
# Cross-check the NA values: which GarageType / MasVnrType values occur on the rows
# where GarageYrBlt / MasVnrArea is the string 'NA'.
# Rows are selected with a vectorised mask instead of a row-wise apply, and the second
# label now names the column that is actually printed (MasVnrType).
print('When GarageYrBlt = NA, GarageType has values ',set(pdAllData_Categ.loc[pdAllData_Categ.GarageYrBlt=='NA','GarageType'].values))
print('When MasVnrArea = NA, MasVnrType has values',set(pdAllData_Categ.loc[pdAllData_Categ.MasVnrArea=='NA','MasVnrType'].values))

When GarageYrBlt = NA, GarageType has values  {'NA'}
When MasVnrArea = NA, MasVnrArea has values {'NA'}


<div class='alert alert-warning'>
First of all, some column names in the description file were wrong. By deducing from the data, we changed 
Bedroom to BedroomAbvGr and 
Kitchen to KitchenAbvGr.<br>

There are also errors in the values described.<br>
Most errors in the initial description are easily corrected by slightly modifying the description file, but some are more complex and persist, such as:<br>
<ul>
<li>C (all)</li>
<li>Twnhs</li>
</ul>
These are not simple typos or alternative spellings of a listed value, and the other values specified for these features are all present in the data.<br>
Although those values are not explicit in the description file, they carry information, so we decided to keep them even though no direct explanation of those categories is available.<br>
<br>
Otherwise, the remaining NA values can carry information or be explained by other specified values. The two checks above show that when a property has no garage, the year the garage was built is not available, and that no MasVnrType implies no MasVnrArea, which is reasonable but important to take into account.<br>

To conclude, no purely invalid values have been spotted in the data. In addition, it is hard to tell whether an NA value means "not measured" or simply that the property lacks the corresponding feature, which is why we decided to keep them.
</div>


## Encoding Categorical Features 

<div class='alert alert-warning'>
        &emsp; The data may be clean, but it contains inconsistent data types: strings, ints and floats. However, as mentioned before, we decided to use a tree model to solve this problem, and tree models give excellent results on datasets with numerical features. Therefore the first step of our pipeline is to transform string values into numbers, that is, to encode the categorical features.<br>
        &emsp;Encoding categorical features consists in transforming discrete string values into numerical values. Many techniques exist to do so; we decided to study two approaches: label encoding and one-hot encoding.<br><br>
        &emsp;Label encoding consists in simply associating each string value of a categorical feature with an int. For example, the Alley feature has possible values Grvl, Pave and NA. A possible encoding would replace all Grvl values by 0, Pave values by 1 and NA values by 2. The main disadvantage of this simple technique is that it introduces an order between the numerical values, which can lead the model to misinterpret this feature.<br><br>
        &emsp;One-hot encoding tries to cope with this problem by introducing dummy variables. In short, for each possible value, we introduce a new column whose value is 1 where the original feature had this value and 0 otherwise. In the case of the alley, we would add three columns: Alley_Pave, Alley_Grvl and Alley_NA. If a house has no alley, we would have 0 in Alley_Pave and Alley_Grvl and 1 in the Alley_NA column. 
</div>

In [39]:
def encodeCategoricalFeatures(Data,labelEncode=True,oneHotEncode=False):
    """Encode the object-dtype columns of ``Data`` with label or one-hot encoding.

    Parameters
    ----------
    Data : pandas.DataFrame
        Frame to encode. With label encoding its object columns are replaced
        IN PLACE and the same frame is returned.
    labelEncode : bool, default True
        Replace every object column by integer codes (``LabelEncoder``).
        Takes precedence over ``oneHotEncode``, so one-hot encoding must be
        requested with ``labelEncode=False, oneHotEncode=True``.
    oneHotEncode : bool, default False
        Build a new frame in which object columns listed in the global
        ``SpecificVal`` are binarised with ``LabelBinarizer`` (columns named
        ``<feature>_<class>``, or a single ``<feature>_<feature>`` column when
        there are exactly two classes), the other object columns are label
        encoded, and the non-object columns are kept unchanged.

    Returns
    -------
    pandas.DataFrame
        ``Data`` itself (label encoding) or a new frame (one-hot encoding).

    Raises
    ------
    ValueError
        If neither encoding technique is selected.
    """
    # Validate up front. This check used to sit in an ``else`` attached to the
    # ``for`` loop, so it fired after every one-hot run and never when both
    # flags were False.
    if not labelEncode and not oneHotEncode:
        raise ValueError('Choose an encoding technique labelEncode=True or oneHotEncode=True')

    if labelEncode: #Label Encoding technique
        for featureCol in Data:
            if Data[featureCol].dtype=='object':
                Data[featureCol] = LabelEncoder().fit_transform(Data[featureCol])
        return Data

    # One-hot encoding technique
    data_frames = []
    for featureCol in Data:
        column = Data[featureCol]
        if column.dtype != 'object':
            data_frames.append(column)
        elif featureCol in SpecificVal:
            lb_style = LabelBinarizer()
            lb_results = lb_style.fit_transform(column)
            if len(lb_style.classes_)==2:
                columnsNames = [featureCol] #Simple binarisation -> one resulting column for two classes
            else:
                columnsNames = lb_style.classes_ #One column for each possible value
            # Reuse Data's index so that concat(axis=1) aligns row by row even
            # when Data does not carry a default RangeIndex.
            data_frames.append(pd.DataFrame(
                lb_results,
                columns=[featureCol+"_"+str(name) for name in columnsNames],
                index=Data.index,
            ))
        else: #NA values in a number column
            data_frames.append(pd.DataFrame(
                LabelEncoder().fit_transform(column),
                columns=[featureCol],
                index=Data.index,
            ))
    return pd.concat(data_frames,axis=1)