# Feature Engineering

Steps:
1. Missing Values
2. Temporal Variables
3. Categorical Variables: remove rare labels
4. Standardise the values of the variables to the same range


In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
# Show every column when a DataFrame is displayed instead of truncating wide frames.
# `pd.set_option` is the documented entry point; `pd.pandas` merely resolves
# back to the same package and is not part of the documented API.
pd.set_option('display.max_columns', None)

In [None]:
# Load the raw training data from train.csv (path is relative to the working directory).
dataset = pd.read_csv('train.csv')
# Preview the first rows to check that the file was parsed as expected.
dataset.head()

In [None]:
# Hold out a test partition before any statistics are learned from the data,
# so that nothing computed later can leak information from the held-out rows.
# NOTE(review): the full frame (including 'SalePrice') is passed as the feature
# matrix, and the cells below keep transforming `dataset` rather than
# X_train / X_test -- confirm whether that is intended.

from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    dataset,
    dataset['SalePrice'],
    test_size=0.1,
    random_state=0,
)

In [None]:
# Display the shapes of the train and test partitions produced by the split.
X_train.shape,X_test.shape

## Missing Values

In [None]:
## Capture the categorical (object-dtype) columns that contain missing values.
## The threshold is `> 0` so that a column with exactly one NaN is also
## picked up; with `> 1` such a column would never be imputed.
features_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes == 'O'
]

## Report the share of missing rows per column. The mean of the NaN mask is a
## fraction, so it is multiplied by 100 to match the "%" in the message.
for feature in features_nan:
    print("{} : {}% missing valeues".format(feature, np.round(100 * dataset[feature].isnull().mean(), 2)))

In [None]:
def replace_cat_feature(dataset,features_nan):
    """Return a copy of `dataset` with NaNs in `features_nan` replaced by 'Missing'.

    'Missing' acts as an explicit extra label for the absent values.
    The frame passed in is left untouched; only the returned copy is modified.
    """
    result = dataset.copy()
    filled_columns = result[features_nan].fillna('Missing')
    result[features_nan] = filled_columns
    return result

# Rebind `dataset` to the copy in which the categorical NaNs are labelled 'Missing'.
dataset = replace_cat_feature(dataset,features_nan)
# Remaining NaN count per categorical column in features_nan.
dataset[features_nan].isnull().sum()

Here we can now observe that there are no NaN values left in the categorical features of the dataset.

In [None]:
# Preview the first rows after the categorical NaNs were replaced.
dataset.head()

In [None]:
# Numerical (non-object) columns that contain missing values.
# The threshold is `> 0` so that a column with exactly one NaN is also
# picked up; with `> 1` such a column would never be imputed.
numerical_with_nan = [
    feature
    for feature in dataset.columns
    if dataset[feature].isnull().sum() > 0 and dataset[feature].dtypes != 'O'
]

# Print each column with its percentage of missing values. The mean of the
# NaN mask is a fraction, so it is multiplied by 100 to match the "%" label.
for feature in numerical_with_nan:
    print("{}: {}% missing value".format(feature, np.around(100 * dataset[feature].isnull().mean(), 2)))

We observed in the data analysis phase that there are many outliers in the dataset. Hence we need to replace the numerical missing values with a statistic that is robust to outliers, such as the median or the mode.

In [None]:
## Replace the numerical missing values

for feature in numerical_with_nan:
    ## The median is used because it is not distorted by outliers.
    median_value = dataset[feature].median()

    ## Indicator column, built *before* imputing: 1 where the value was NaN, else 0.
    dataset[feature+'nan'] = np.where(dataset[feature].isnull(),1,0)

    ## Assign the filled column back instead of calling fillna(inplace=True) on
    ## `dataset[feature]`: the in-place form acts on an intermediate object, which
    ## emits a warning on recent pandas and leaves `dataset` unchanged under
    ## Copy-on-Write.
    dataset[feature] = dataset[feature].fillna(median_value)

## Sanity check: every count should now be 0.
dataset[numerical_with_nan].isnull().sum()
    

In [None]:
# Inspect the first 50 rows, including the newly added '<feature>nan' indicator columns.
dataset.head(50)

## Temporal variables

In [None]:
# Turn each calendar-year column into "years elapsed at the time of sale"
# by subtracting it from the year the house was sold.
sold_year = dataset['YrSold']
for year_column in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    dataset[year_column] = sold_year - dataset[year_column]

In [None]:
# Preview the first rows after the year columns were converted to elapsed years.
dataset.head()

In [None]:
# Show only the three converted year columns for a closer look.
dataset[['YearBuilt','YearRemodAdd','GarageYrBlt']].head()

## Numerical Variables
Since the numerical variables are skewed, we will apply a log transformation to bring their distributions closer to a normal distribution.

In [None]:
# Preview the first rows before the log transformation is applied.
dataset.head()

In [None]:
## Log-transform the skewed numerical features so that their distributions
## are closer to a gaussian (normal) distribution.
import numpy as np
num_features = ['LotFrontage','LotArea','1stFlrSF','2ndFlrSF','GrLivArea','SalePrice']

for feature in num_features:
    values = dataset[feature]
    if (values <= 0).any():
        ## log(0) is -inf, which would propagate into every later step.
        ## log1p(x) = log(1 + x) maps 0 to 0 and is used for such columns.
        ## Assumes the values are non-negative (areas / prices) -- TODO confirm.
        dataset[feature] = np.log1p(values)
    else:
        ## Strictly positive columns keep the plain natural log, so they can
        ## still be inverted with np.exp.
        dataset[feature] = np.log(values)

In [None]:
# Preview the first rows after the log transformation.
dataset.head()

## Handling Rare Categorical Feature
These are categorical labels that occur too rarely to contribute meaningfully to the analysis.
We will treat every label that is present in less than 1% of the observations as rare.
Such labels carry very little weight in the dataset.
Hence we will group them under a single new label.

In [None]:
# Names of all columns whose dtype is object ('O'), i.e. the categorical features.
categorical_features = [
    column_name
    for column_name in dataset.columns
    if dataset[column_name].dtype == 'O'
]

In [None]:
# Display the list of categorical column names.
categorical_features

In [None]:
# Number of categorical columns.
len(categorical_features)

In [None]:
## For every categorical column, compute the share of rows carrying each label
## and collapse the labels that do not exceed 1% of the rows into 'Rare_var'.
total_rows = len(dataset)
for column_name in categorical_features:
    label_share = dataset.groupby(column_name)['SalePrice'].count() / total_rows
    frequent_labels = label_share.loc[label_share > 0.01].index
    keep_mask = dataset[column_name].isin(frequent_labels)
    dataset[column_name] = np.where(keep_mask, dataset[column_name], 'Rare_var')

In [None]:
# Preview the first rows after the rare labels were grouped into 'Rare_var'.
dataset.head()

## Feature Scaling
Many features are measured in different units.
Hence we need to transform them to a common scale so that machine learning algorithms can handle them in a better way.

In [None]:
# Feature scaling
# MinMaxScaler only accepts numeric input, so every column that still holds
# text labels is first converted to integer codes; fitting on the raw labels
# would raise. Labels are ranked by the mean 'SalePrice' of their rows
# (lowest mean -> 0), so the codes follow the order of the target.
for feature in [column for column in dataset.columns if dataset[column].dtype == 'O']:
    labels_ordered = dataset.groupby(feature)['SalePrice'].mean().sort_values().index
    label_to_code = {label: code for code, label in enumerate(labels_ordered)}
    dataset[feature] = dataset[feature].map(label_to_code)

# Every column except the identifier and the target is rescaled.
feature_scale = [feature for feature in dataset.columns if feature not in ['Id','SalePrice']]

# NOTE(review): the scaler is fitted on the whole of `dataset`, not only on the
# training partition created earlier -- confirm whether that is intended.
from sklearn.preprocessing import MinMaxScaler
scaler = MinMaxScaler()
scaler.fit(dataset[feature_scale])


In [None]:
# Display the scaled feature values; the result is only shown here, not stored.
scaler.transform(dataset[feature_scale])

In [None]:
# Build the output frame: the untouched 'Id' and 'SalePrice' columns on the
# left, followed by the scaled features under their original column names.
# The index is reset so both parts align row by row on a 0..n-1 index.
id_and_target = dataset[['Id', 'SalePrice']].reset_index(drop=True)
scaled_features = pd.DataFrame(
    scaler.transform(dataset[feature_scale]),
    columns=feature_scale,
)
data = pd.concat([id_and_target, scaled_features], axis=1)

In [None]:
# Preview the first rows of the assembled output frame.
data.head()

In [None]:
# Write the assembled frame to X_train.csv without the index column.
# NOTE(review): `data` is built from the whole of `dataset` and still contains
# 'SalePrice', so the file name may be misleading -- confirm the intended content.
data.to_csv('X_train.csv',index=False)