# DataProcessing Notebook - AAAG

In [1]:
# Basics Importation 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

# For Detailed Stats Output
import statsmodels.api as sm
# The linear regression models 
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LassoCV, ElasticNetCV
# Instantiating the linear regression models
ols = LinearRegression()
ridge = Ridge()
lasso  = Lasso()
lassocv = LassoCV()
elasticnet = ElasticNet()
elasticnetcv = ElasticNetCV()
# The tree model
from sklearn import tree
# Instantiating the tree model (regression type)
regressor = tree.DecisionTreeRegressor()
# The ensemble model for random forest and bagging
from sklearn import ensemble
# Instantiating the ensemble models
randomForest = ensemble.RandomForestRegressor()
bagging      = ensemble.BaggingRegressor()
# Instantiating the boost models
gbm          = ensemble.GradientBoostingRegressor()
abr          = ensemble.AdaBoostRegressor()
# xg boost
import xgboost as xgb
# lg boost
import lightgbm as lgb
# K mean clustering
from sklearn.cluster import KMeans
kmeans = KMeans()
# The model selection for cross validation, k fold splits, train_test_split, grid search etc. 
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Some automatic feature selection functions (recursive finding, best feature selection etc.)
import sklearn.feature_selection as fs
# Importing the different error evaluation/metrics
from sklearn import metrics
from sklearn.metrics import silhouette_score

# Making it so that we can see all columns of the dataframe
pd.set_option('display.max_columns', None)

# Import data
df_train=pd.read_csv('df_train_cleaned.csv',index_col='Id')
df_test=pd.read_csv('df_test_cleaned.csv',index_col='Id')
df_total=pd.read_csv('df_total_cleaned.csv',index_col='Id')

In [2]:
nominal_var_processed=['MSZoning','Street','Alley','LotShape','LandContour','LotConfig','LandSlope',\
             'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl',\
             'Exterior1st','Exterior2nd','MasVnrType','Foundation',\
             'BsmtFinType1','Heating','CentralAir',\
             'Electrical','Functional','GarageType','GarageFinish',\
             'PavedDrive','Fence','MiscFeature','SaleType','SaleCondition','MSSubClass',\
             'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MoSold']
ordinal_var_processed=['OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath',\
             'HalfBath','BedroomAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars',\
             'YrSold','HeatingQC','KitchenQual','ExterQual','ExterCond','BsmtQual',\
             'BsmtCond','BsmtExposure','FireplaceQu','GarageQual','GarageCond']
cont_var_processed=['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF',\
          '1stFlrSF','2ndFlrSF','GrLivArea','GarageYrBlt','GarageArea']
cont_var_for_tuning=ordinal_var_processed+cont_var_processed

In [3]:
def missingValuesInfo(df):
    total = df.isnull().sum().sort_values(ascending = False)
    percent = round(df.isnull().sum().sort_values(ascending = False)/len(df)*100, 2)
    temp = pd.concat([total, percent], axis = 1,keys= ['Total', 'Percent'])
    return temp.loc[(temp['Total'] > 0)]

In [4]:
def feature_standardize(data,scaleType='standardize'):
    '''
    - Accepts a dataframe column
    '''
    if scaleType not in ['standardize', 'normalize']: 
        raise ValueError('%s is not a valid choice' %(scaleType))
    mean_value=np.mean(data)
    standard_dev=np.std(data)
    min_value=np.min(data)
    max_value=np.max(data)
    if scaleType == 'standardize':
        return((data-mean_value)/standard_dev) 
    elif scaleType == 'normalize':
        return((data-min_value)/(max_value-min_value))

In [5]:
def dummify_column(dataframe,column_name):
    '''
    ### NOTE!!! ### vector operation not working yet!!!! ###
    - dataframe takes the entire dataframe you are working on
    - column_name takes a list of strings, where the strings are the column names
    '''
    for feature in column_name:
        dummified_feature = pd.get_dummies(dataframe.loc[:,feature], prefix=feature, prefix_sep='__',drop_first=True)
        dummified_df = pd.concat([dataframe.drop(feature,axis=1),dummified_feature],axis=1,sort='False')
    return dummified_df

In [6]:
def delete_outliers(dataframe,column_name,threshold=5):
    '''
    - Accepts a full dataframe
    - Specify column_name to indicate which columns to check for outliers
    - Column name accepts list of strings, where the strings are the column names
    - threshold is the number of deviations that will be used for removing outliers
    '''
    for feature in column_name:
        standard_dev=np.std(dataframe.loc[:,feature])
        dataframe=dataframe.drop(dataframe[dataframe.loc[:,feature]>standard_dev*threshold].index,axis=0)
    return dataframe


In [7]:
# Check to see if nominal_var is contained in df
nominal_var_processed=['MSZoning','Street','Alley','LotShape','LandContour','LotConfig','LandSlope',\
             'Neighborhood','Condition1','Condition2','BldgType','HouseStyle','RoofStyle','RoofMatl',\
             'Exterior1st','Exterior2nd','MasVnrType','Foundation',\
             'BsmtFinType1','Heating','CentralAir',\
             'Electrical','Functional','GarageType','GarageFinish',\
             'PavedDrive','Fence','MiscFeature','SaleType','SaleCondition','MSSubClass',\
             'WoodDeckSF','OpenPorchSF','EnclosedPorch','3SsnPorch','ScreenPorch','PoolArea','MoSold']

######## check for df_train ############################
i=0
for feature in nominal_var_processed:
    if not df_train.loc[:,feature].empty:
        i+=1
print(np.array(nominal_var_processed).size==i)

######## check for df_test ############################
i=0
for feature in nominal_var_processed:
    if not df_test.loc[:,feature].empty:
        i+=1
print(np.array(nominal_var_processed).size==i)

######## check for df_total ############################
i=0
for feature in nominal_var_processed:
    if not df_total.loc[:,feature].empty:
        i+=1
print(np.array(nominal_var_processed).size==i)

True
True
True


In [8]:
# dummify the nominal categorical columns in df_train ############################
df_train_copy=df_train.copy()
for feature in nominal_var_processed:
    df_train_copy=dummify_column(df_train_copy,[feature])
dummified_df_train=df_train_copy

# dummify the nominal categorical columns in df_test ############################
df_test_copy=df_test.copy()
for feature in nominal_var_processed:
    df_test_copy=dummify_column(df_test_copy,[feature])
dummified_df_test=df_test_copy

# dummify the nominal categorical columns in df_total ############################
df_total_copy=df_total.copy()
for feature in nominal_var_processed:
    df_total_copy=dummify_column(df_total_copy,[feature])
dummified_df_total=df_total_copy

# Creating an undummified dataframe for reference, for df_train ##################
undummified_df_train=df_train.copy()

# Creating an undummified dataframe for reference, for df_test ##################
undummified_df_test=df_test.copy()

# Creating an undummified dataframe for reference, for df_total ##################
undummified_df_total=df_total.copy()

print(dummified_df_train.shape)
print(dummified_df_test.shape)
print(dummified_df_total.shape)

print(undummified_df_train.shape)
print(undummified_df_test.shape)
print(undummified_df_total.shape)

print(dummified_df_train.isnull().sum().sum())
print(dummified_df_test.isnull().sum().sum())
print(dummified_df_total.isnull().sum().sum())
print('1459 nan for df_total because of SalePrice absence')

print(undummified_df_train.isnull().sum().sum())
print(undummified_df_test.isnull().sum().sum())
print(undummified_df_total.isnull().sum().sum())
print('1459 nan for df_total because of SalePrice absence')

(1460, 234)
(1459, 222)
(2919, 235)
(1460, 71)
(1459, 70)
(2919, 71)
0
0
1459
1459 nan for df_total because of SalePrice absence
0
0
1459
1459 nan for df_total because of SalePrice absence


In [9]:
#########################################################################
# FOR df_train or df_total ONLY!!!! Not for df_test!!!
#########################################################################
# For chosen continuous features, remove outliers where outlier >= 5 std
dummified_df_train_copy=dummified_df_train.copy()
for feature in ['LotFrontage','LotArea','MasVnrArea','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','GarageArea']:
    dummified_df_train_copy=delete_outliers(dummified_df_train_copy,[feature],threshold=5)
no_outlier_dummified_df_train=dummified_df_train_copy 

dummified_df_total_copy=dummified_df_total.copy()
for feature in ['LotFrontage','LotArea','MasVnrArea','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','GarageArea']:
    dummified_df_total_copy=delete_outliers(dummified_df_total_copy,[feature],threshold=5)
no_outlier_dummified_df_total=dummified_df_total_copy 

# For the Undummified dataframe and for all continuous features, remove outliers where outlier >= 5 std
undummified_df_train_copy=undummified_df_train.copy()
for feature in ['LotFrontage','LotArea','MasVnrArea','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','GarageArea']:
    undummified_df_train_copy=delete_outliers(undummified_df_train_copy,[feature],threshold=5)
no_outlier_undummified_df_train=undummified_df_train_copy 

undummified_df_total_copy=undummified_df_total.copy()
for feature in ['LotFrontage','LotArea','MasVnrArea','TotalBsmtSF','1stFlrSF','2ndFlrSF','GrLivArea','GarageArea']:
    undummified_df_total_copy=delete_outliers(undummified_df_total_copy,[feature],threshold=5)
no_outlier_undummified_df_total=undummified_df_total_copy 
#########################################################################
#########################################################################
#########################################################################
# Creating dataframes with no outlier taken out
# This is for maintaining integrity of df_test
with_outlier_dummified_df_train=dummified_df_train.copy()
with_outlier_dummified_df_test=dummified_df_test.copy()
with_outlier_dummified_df_total=dummified_df_total.copy()
with_outlier_undummified_df_train=undummified_df_train.copy()
with_outlier_undummified_df_test=undummified_df_test.copy()
with_outlier_undummified_df_total=undummified_df_total.copy()

# check for Nan's
print('Check for Nan. 1459 for df_total')
print(with_outlier_dummified_df_train.isnull().sum().sum())
print(with_outlier_dummified_df_test.isnull().sum().sum())
print(with_outlier_dummified_df_total.isnull().sum().sum())
print(with_outlier_undummified_df_train.isnull().sum().sum())
print(with_outlier_undummified_df_test.isnull().sum().sum())
print(with_outlier_undummified_df_total.isnull().sum().sum())
print('\n')
print('No Outlier dummified and undummified df_train and df_total:')
print(no_outlier_dummified_df_train.shape)
print(no_outlier_dummified_df_total.shape)
print(no_outlier_undummified_df_train.shape)
print(no_outlier_undummified_df_total.shape)
print('\n')
print('With Outlier dummified and undummified df_train, df_test and df_total:')
print(with_outlier_dummified_df_train.shape)
print(with_outlier_dummified_df_test.shape)
print(with_outlier_dummified_df_total.shape)
print(with_outlier_undummified_df_train.shape)
print(with_outlier_undummified_df_test.shape)
print(with_outlier_undummified_df_total.shape)

Check for Nan. 1459 for df_total
0
0
1459
0
0
1459


No Outlier dummified and undummified df_train and df_total:
(1215, 234)
(2399, 235)
(1215, 71)
(2399, 71)


With Outlier dummified and undummified df_train, df_test and df_total:
(1460, 234)
(1459, 222)
(2919, 235)
(1460, 71)
(1459, 70)
(2919, 71)


In [10]:
# Check if ordinal variables are still there:
ordinal_var_processed=['OverallQual','OverallCond','BsmtFullBath','BsmtHalfBath','FullBath',\
             'HalfBath','BedroomAbvGr','TotRmsAbvGrd','Fireplaces','GarageCars',\
             'YrSold','HeatingQC','KitchenQual','ExterQual','ExterCond','BsmtQual',\
             'BsmtCond','BsmtExposure','FireplaceQu','GarageQual','GarageCond']

######## check for no_outlier_dummified_df_train ############################
i=0
for feature in ordinal_var_processed:
    if not no_outlier_dummified_df_train.loc[:,feature].empty:
        i+=1
print(np.array(ordinal_var_processed).size==i)
######## check for with_outlier_dummified_df_test ############################
i=0
for feature in ordinal_var_processed:
    if not with_outlier_dummified_df_test.loc[:,feature].empty:
        i+=1
print(np.array(ordinal_var_processed).size==i)
######## check for no_outlier_dummified_df_total ############################
i=0
for feature in ordinal_var_processed:
    if not no_outlier_dummified_df_total.loc[:,feature].empty:
        i+=1
print(np.array(ordinal_var_processed).size==i)

True
True
True


In [11]:
# Check if continuous variables are still there:
cont_var_processed=['LotFrontage','LotArea','YearBuilt','YearRemodAdd','MasVnrArea','TotalBsmtSF',\
          '1stFlrSF','2ndFlrSF','GrLivArea','GarageYrBlt','GarageArea']
######## check for no_outlier_dummified_df_train ############################
i=0
for feature in cont_var_processed:
    if not no_outlier_dummified_df_train.loc[:,feature].empty:
        i+=1
print(np.array(cont_var_processed).size==i)
######## check for with_outlier_dummified_df_test ############################
i=0
for feature in cont_var_processed:
    if not with_outlier_dummified_df_test.loc[:,feature].empty:
        i+=1
print(np.array(cont_var_processed).size==i)
######## check for no_outlier_dummified_df_total ############################
i=0
for feature in cont_var_processed:
    if not no_outlier_dummified_df_total.loc[:,feature].empty:
        i+=1
print(np.array(cont_var_processed).size==i)

True
True
True


In [12]:
# Dataframes to choose from:
# no_outlier_dummified_df_train
# no_outlier_dummified_df_total
# no_outlier_undummified_df_train
# no_outlier_undummified_df_total

# with_outlier_dummified_df_train
# with_outlier_dummified_df_test
# with_outlier_dummified_df_total
# with_outlier_undummified_df_train
# with_outlier_undummified_df_test
# with_outlier_undummified_df_total
########################################################################
# Standardizing/normalizing continuous and ordinal variables only
cont_var_for_tuning=ordinal_var_processed+cont_var_processed
########################################################################
standardized_no_outlier_dummified_df_train=no_outlier_dummified_df_train.copy()
standardized_no_outlier_dummified_df_train[cont_var_for_tuning]=standardized_no_outlier_dummified_df_train[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

standardized_no_outlier_dummified_df_total=no_outlier_dummified_df_total.copy()
standardized_no_outlier_dummified_df_total[cont_var_for_tuning]=standardized_no_outlier_dummified_df_total[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

standardized_with_outlier_dummified_df_total=with_outlier_dummified_df_total.copy()
standardized_with_outlier_dummified_df_total[cont_var_for_tuning]=standardized_with_outlier_dummified_df_total[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

normalized_no_outlier_dummified_df_total=no_outlier_dummified_df_total.copy()
normalized_no_outlier_dummified_df_total[cont_var_for_tuning]=normalized_no_outlier_dummified_df_total[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='normalize'),axis=0)

standardized_with_outlier_dummified_df_test=with_outlier_dummified_df_test.copy()
standardized_with_outlier_dummified_df_test[cont_var_for_tuning]=standardized_with_outlier_dummified_df_test[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)
##########################################
# Undummified versions for categorical grouping
##########################################
standardized_no_outlier_undummified_df_train=no_outlier_undummified_df_train.copy()
standardized_no_outlier_undummified_df_train[cont_var_for_tuning]=standardized_no_outlier_undummified_df_train[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)
standardized_no_outlier_undummified_df_total=no_outlier_undummified_df_total.copy()
standardized_no_outlier_undummified_df_total[cont_var_for_tuning]=standardized_no_outlier_undummified_df_total[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)
standardized_with_outlier_undummified_df_test=with_outlier_undummified_df_test.copy()
standardized_with_outlier_undummified_df_test[cont_var_for_tuning]=standardized_with_outlier_undummified_df_test[cont_var_for_tuning].apply(lambda x:feature_standardize(x,scaleType='standardize'),axis=0)

###############################################
print('df_total: 1203 columns, df_train: 922 columns, df_test: 902 columns')
print(standardized_no_outlier_dummified_df_train.shape)
print(standardized_no_outlier_dummified_df_total.shape)
print(standardized_with_outlier_dummified_df_total.shape)
print(normalized_no_outlier_dummified_df_total.shape)
print(standardized_with_outlier_dummified_df_test.shape)
print('\n')
# # Note that the all standardized_baseline_df has Nan's for some categoricals due to outlier removal and standardizing. These columns are removed.
# all_standardized_baseline_df.dropna(axis=1,inplace=True)

# check for Nan's
print('df_total has 1209 or 1459 nans because of SalePrice')
print(standardized_no_outlier_dummified_df_train.isnull().sum().sum())
print(standardized_no_outlier_dummified_df_total.isnull().sum().sum())
print(standardized_with_outlier_dummified_df_total.isnull().sum().sum())
print(normalized_no_outlier_dummified_df_total.isnull().sum().sum())
print(standardized_with_outlier_dummified_df_test.isnull().sum().sum())
missingValuesInfo(standardized_no_outlier_dummified_df_total)

df_total: 1203 columns, df_train: 922 columns, df_test: 902 columns
(1215, 234)
(2399, 235)
(2919, 235)
(2399, 235)
(1459, 222)


df_total has 1209 or 1459 nans because of SalePrice
0
1209
1459
1209
0


Unnamed: 0,Total,Percent
SalePrice,1209,50.4


In [13]:
# Exporting 
standardized_no_outlier_dummified_df_train.to_csv('standardized_no_outlier_dummified_df_train.csv')
standardized_no_outlier_dummified_df_total.to_csv('standardized_no_outlier_dummified_df_total.csv')
standardized_with_outlier_dummified_df_total.to_csv('standardized_with_outlier_dummified_df_total.csv')
normalized_no_outlier_dummified_df_total.to_csv('normalized_no_outlier_dummified_df_total.csv')
standardized_with_outlier_dummified_df_test.to_csv('standardized_with_outlier_dummified_df_test.csv')

standardized_no_outlier_undummified_df_train.to_csv('standardized_no_outlier_undummified_df_train.csv')
standardized_no_outlier_undummified_df_total.to_csv('standardized_no_outlier_undummified_df_total.csv')
standardized_with_outlier_undummified_df_test.to_csv('standardized_with_outlier_undummified_df_test.csv')