# DataProcessing Notebook - AAAG

In [1]:
# Basics Importation 
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
plt.style.use('ggplot')

# statsmodels: detailed regression summaries
import statsmodels.api as sm
# Linear regression model classes (plain, regularized, and cross-validated variants)
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LassoCV, ElasticNetCV
# Instantiate the linear models with their default hyperparameters.
# NOTE(review): none of the estimators instantiated in this cell is referenced
# by the other cells visible in this section -- confirm they are needed here.
ols = LinearRegression()
ridge = Ridge()
lasso  = Lasso()
lassocv = LassoCV()
elasticnet = ElasticNet()
elasticnetcv = ElasticNetCV()
# Decision tree models
from sklearn import tree
# Instantiate a regression tree with default hyperparameters
regressor = tree.DecisionTreeRegressor()
# Ensemble models (random forest, bagging, boosting)
from sklearn import ensemble
# Instantiate the random forest and bagging regressors
randomForest = ensemble.RandomForestRegressor()
bagging      = ensemble.BaggingRegressor()
# Instantiate the gradient boosting and AdaBoost regressors
gbm          = ensemble.GradientBoostingRegressor()
abr          = ensemble.AdaBoostRegressor()
# XGBoost
import xgboost as xgb
# LightGBM
import lightgbm as lgb
# K-means clustering (instantiated with default hyperparameters)
from sklearn.cluster import KMeans
kmeans = KMeans()
# Model selection utilities: cross validation, k-fold splits, train_test_split, grid search, etc.
import sklearn.model_selection as ms
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
# Automatic feature selection helpers (recursive elimination, best-feature selection, etc.)
import sklearn.feature_selection as fs
# Error metrics / evaluation scores
from sklearn import metrics
from sklearn.metrics import silhouette_score

# Show every column when a dataframe is displayed (no column truncation)
pd.set_option('display.max_columns', None)

# Load the three '*_GrLivArea_clean.csv' files from the working directory,
# using their 'Id' column as the index.
df_train=pd.read_csv('df_train_GrLivArea_clean.csv',index_col='Id')
df_test=pd.read_csv('df_test_GrLivArea_clean.csv',index_col='Id')
df_total=pd.read_csv('df_total_GrLivArea_clean.csv',index_col='Id')

In [2]:
# Columns treated as nominal categoricals (one-hot encoded further down).
# NOTE(review): the porch / deck / pool area columns and MoSold are listed here
# as nominal -- presumably they were binned or binarised upstream; confirm.
nominal_var_processed = [
    'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'LotConfig', 'LandSlope',
    'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
    'Exterior1st', 'Exterior2nd', 'MasVnrType', 'Foundation',
    'BsmtFinType1', 'Heating', 'CentralAir',
    'Electrical', 'Functional', 'GarageType', 'GarageFinish',
    'PavedDrive', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition', 'MSSubClass',
    'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MoSold',
]

# Columns treated as ordinal (kept as single columns and scaled with the
# continuous ones).
ordinal_var_processed = [
    'OverallQual', 'OverallCond', 'BedroomAbvGr', 'Fireplaces',
    'YrSold', 'HeatingQC', 'KitchenQual', 'ExterQual', 'ExterCond', 'BsmtQual',
    'BsmtCond', 'BsmtExposure', 'FireplaceQu', 'GarageQual', 'GarageCond', 'TotalBaths',
]

# Continuous columns.
cont_var_processed = [
    'LotFrontage', 'LotArea', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea',
    'GarageYrBlt', 'TotalSF', 'CarCapacity',
]

# Everything that gets rescaled: ordinal columns first, then continuous ones.
cont_var_for_tuning = ordinal_var_processed + cont_var_processed

In [3]:
def missingValuesInfo(df):
    '''
    Summarise the columns of ``df`` that contain missing values.

    - df: the dataframe to inspect
    - Returns a dataframe indexed by column name with two columns:
      'Total' (number of nulls) and 'Percent' (nulls as a percentage of
      len(df), rounded to 2 decimals), ordered by descending null count.
      Columns without any null are left out.
    '''
    null_counts = df.isnull().sum().sort_values(ascending=False)
    summary = null_counts.to_frame(name='Total')
    summary['Percent'] = (null_counts / len(df) * 100).round(2)
    has_nulls = summary['Total'] > 0
    return summary[has_nulls]

In [4]:
def feature_standardize(data,scaleType='standardize'):
    '''
    Rescale a numeric column.

    - data: a dataframe column (pandas Series)
    - scaleType:
        'standardize' -> (data - mean) / std, with std as returned by np.std
                         (ddof=0 unless numpy is told otherwise)
        'normalize'   -> (data - min) / (max - min)
    - Returns the rescaled column.
    - A constant column (std, or max - min, exactly 0) comes back as all
      zeros instead of the NaN a 0/0 division would produce.
    - Raises ValueError when scaleType is neither of the two choices.
    '''
    if scaleType not in ['standardize', 'normalize']:
        raise ValueError('%s is not a valid choice' %(scaleType))
    # Only compute the statistics the selected mode actually needs.
    if scaleType == 'standardize':
        offset = np.mean(data)
        scale = np.std(data)
    else:
        offset = np.min(data)
        scale = np.max(data) - offset
    # Guard against division by zero for constant input. Only scalar scales
    # are touched, so non-Series inputs keep their previous behaviour.
    if np.ndim(scale) == 0 and scale == 0:
        scale = 1
    return (data - offset) / scale

In [5]:
def dummify_column(dataframe,column_name):
    '''
    One-hot encode the given columns of a dataframe.

    - dataframe takes the entire dataframe you are working on (it is not
      modified; a new dataframe is returned)
    - column_name takes a list of strings, where the strings are the column
      names; a single column name given as a plain string is accepted too
    - Returns the dataframe without the listed columns, followed by their
      dummy columns in the order listed. Dummy columns are named
      '<column>__<level>' and the first level of each column is dropped.
    - An empty list returns an unchanged copy of the dataframe.
    - Raises KeyError if a listed column is not in the dataframe.
    '''
    if isinstance(column_name, str):
        column_name = [column_name]
    features = list(column_name)
    # Every column is encoded from the caller's frame and all results are
    # concatenated once, so each listed column ends up encoded (not only the
    # last one) and the frame is not rebuilt on every iteration.
    dummy_frames = [
        pd.get_dummies(dataframe.loc[:, feature], prefix=feature,
                       prefix_sep='__', drop_first=True)
        for feature in features
    ]
    remaining = dataframe.drop(features, axis=1)
    # sort must be a real boolean: the string 'False' is truthy.
    return pd.concat([remaining] + dummy_frames, axis=1, sort=False)

In [6]:
def delete_outliers(dataframe,column_name,threshold=5):
    '''
    Drop the rows whose value in any of the given columns is too large.

    - dataframe: the full dataframe (the caller's object is not modified)
    - column_name: list of strings naming the columns to check, processed in
      the order given
    - threshold: multiplier applied to the column's standard deviation
      (np.std) to get the cutoff
    - Returns the dataframe without the rows whose value exceeds the cutoff.

    NOTE(review): the raw value is compared against std * threshold; the
    column mean is not subtracted and only the upper side is checked. That
    matches "number of deviations" only for data centred on zero -- confirm
    this is intended. The deviation of each column is measured on the rows
    that survived the previous columns.
    '''
    for feature in column_name:
        values = dataframe.loc[:, feature]
        cutoff = np.std(values) * threshold
        rows_to_drop = dataframe[values > cutoff].index
        dataframe = dataframe.drop(rows_to_drop, axis=0)
    return dataframe


In [7]:
# Sanity check on the loaded frames: every nominal column must be selectable
# and non-empty. One boolean is printed per frame, in the order
# df_train, df_test, df_total. A column that is missing altogether makes
# .loc raise KeyError instead of printing False.
for frame in (df_train, df_test, df_total):
    non_empty_count = sum(
        1 for feature in nominal_var_processed
        if not frame.loc[:, feature].empty
    )
    print(len(nominal_var_processed) == non_empty_count)

True
True
True


In [8]:
def _dummify_nominal_columns(frame):
    '''
    Return a copy of frame with every column of nominal_var_processed
    one-hot encoded. dummify_column is called once per column, each time with
    a single-element list.
    '''
    encoded = frame.copy()
    for feature in nominal_var_processed:
        encoded = dummify_column(encoded, [feature])
    return encoded


# Dummified versions of the three frames (the *_copy names refer to the same
# objects as the dummified_* names).
dummified_df_train = df_train_copy = _dummify_nominal_columns(df_train)
dummified_df_test = df_test_copy = _dummify_nominal_columns(df_test)
dummified_df_total = df_total_copy = _dummify_nominal_columns(df_total)

# Undummified copies of the three frames, kept for reference.
undummified_df_train = df_train.copy()
undummified_df_test = df_test.copy()
undummified_df_total = df_total.copy()

dummified_frames = (dummified_df_train, dummified_df_test, dummified_df_total)
undummified_frames = (undummified_df_train, undummified_df_test, undummified_df_total)

# Shapes: dummified train / test / total, then undummified train / test / total.
for frame in dummified_frames + undummified_frames:
    print(frame.shape)

# Total number of null cells per frame, one group of three per variant,
# each group followed by the explanatory message.
for frame_group in (dummified_frames, undummified_frames):
    for frame in frame_group:
        print(frame.isnull().sum().sum())
    print('1459 nan for df_total because of SalePrice absence')

(1457, 225)
(1459, 214)
(2916, 226)
(1457, 63)
(1459, 62)
(2916, 63)
0
0
1459
1459 nan for df_total because of SalePrice absence
0
0
1459
1459 nan for df_total because of SalePrice absence


In [9]:
# Sanity check after dummification: every ordinal column must still be
# selectable and non-empty. One boolean is printed per frame, in the order
# dummified_df_train, dummified_df_test, dummified_df_total. A column that is
# missing altogether makes .loc raise KeyError instead of printing False.
for frame in (dummified_df_train, dummified_df_test, dummified_df_total):
    non_empty_count = sum(
        1 for feature in ordinal_var_processed
        if not frame.loc[:, feature].empty
    )
    print(len(ordinal_var_processed) == non_empty_count)

True
True
True


In [10]:
# Sanity check after dummification: every continuous column must still be
# selectable and non-empty. One boolean is printed per frame, in the order
# dummified_df_train, dummified_df_test, dummified_df_total. A column that is
# missing altogether makes .loc raise KeyError instead of printing False.
for frame in (dummified_df_train, dummified_df_test, dummified_df_total):
    non_empty_count = sum(
        1 for feature in cont_var_processed
        if not frame.loc[:, feature].empty
    )
    print(len(cont_var_processed) == non_empty_count)

True
True
True


In [11]:
def _standardized_copy(frame):
    '''
    Return a copy of frame in which every column listed in
    cont_var_for_tuning has been passed through
    feature_standardize(scaleType='standardize'). Other columns are untouched.
    '''
    scaled = frame.copy()
    scaled[cont_var_for_tuning] = scaled[cont_var_for_tuning].apply(
        lambda column: feature_standardize(column, scaleType='standardize'),
        axis=0,
    )
    return scaled


# NOTE(review): each frame is rescaled with its own statistics, so the same
# raw value maps to different scaled values in train, test and total --
# confirm this is intended.

# Dummified versions
standardized_dummified_df_train = _standardized_copy(dummified_df_train)
standardized_dummified_df_total = _standardized_copy(dummified_df_total)
standardized_dummified_df_test = _standardized_copy(dummified_df_test)

# Undummified versions, for categorical grouping
standardized_undummified_df_train = _standardized_copy(undummified_df_train)
standardized_undummified_df_total = _standardized_copy(undummified_df_total)
standardized_undummified_df_test = _standardized_copy(undummified_df_test)

standardized_dummified_frames = (
    standardized_dummified_df_train,
    standardized_dummified_df_total,
    standardized_dummified_df_test,
)

# Shapes of the dummified frames: train, total, test
for frame in standardized_dummified_frames:
    print(frame.shape)
print('\n')

# Total number of null cells per frame: train, total, test
for frame in standardized_dummified_frames:
    print(frame.isnull().sum().sum())


(1457, 225)
(2916, 226)
(1459, 214)


0
1459
0


In [12]:
# Columns common to the three standardized, dummified frames. The frames do
# not have the same number of columns after dummification (see the shapes
# printed above), so only the shared subset is kept for export.
# Despite its name, intersection_total_train is the total/test intersection.
intersection_total_train=set(standardized_dummified_df_total.columns)&set(standardized_dummified_df_test.columns)
intersection_cols=set(standardized_dummified_df_train.columns)&intersection_total_train
# Build the export list in the train frame's column order rather than from
# list(set): set iteration order is arbitrary and can change between kernel
# restarts, which made the exported column order non-reproducible.
# 'SalePrice' is appended exactly once, even if it were part of the intersection.
intersection_cols_with_Sales=[col for col in standardized_dummified_df_train.columns
                              if col in intersection_cols and col!='SalePrice']+['SalePrice']

In [13]:
# Export the dummified frames restricted to the shared columns. Total and
# train keep 'SalePrice'; the test file gets the shared columns only.
# .loc is given a list, not the intersection_cols set: indexing with a set is
# deprecated in pandas 1.5 and raises TypeError from pandas 2.0. The list
# follows the order of intersection_cols_with_Sales so that all three files
# share one column order; dict.fromkeys drops any repeated name.
test_export_cols=list(dict.fromkeys(col for col in intersection_cols_with_Sales if col in intersection_cols))
standardized_dummified_df_total.loc[:,intersection_cols_with_Sales].to_csv('standardized_dummified_df_total_GrLivArea_clean.csv')
standardized_dummified_df_test.loc[:,test_export_cols].to_csv('standardized_dummified_df_test_GrLivArea_clean.csv')
standardized_dummified_df_train.loc[:,intersection_cols_with_Sales].to_csv('standardized_dummified_df_train_GrLivArea_clean.csv')

# Export the undummified frames with all of their columns.
standardized_undummified_df_total.to_csv('standardized_undummified_df_total_GrLivArea_clean.csv')
standardized_undummified_df_test.to_csv('standardized_undummified_df_test_GrLivArea_clean.csv')
standardized_undummified_df_train.to_csv('standardized_undummified_df_train_GrLivArea_clean.csv')