In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [None]:
# Load the raw CSV into `df`; every later cell works on this frame.
# NOTE(review): absolute, machine-specific Windows path (presumably the Kaggle house-price
# test split) - adjust before running on another machine.
df = pd.read_csv(r'E:\Python Projects\Machine Learning\House pricing\Advanced House kaggle\test.csv')

In [None]:
# Column names of the raw frame, in file order
col_names = df.columns.tolist()

In [None]:
# Numerical features: every non-object column except the 'Id' column
numerical = [
    column
    for column in col_names
    if column != 'Id' and df[column].dtypes != 'O'
]

In [None]:
# Categorical features are the object-dtype columns.
# Selecting by dtype (instead of "everything not in `numerical`") keeps 'Id' out of the list:
# 'Id' is deliberately excluded from `numerical`, so the complement wrongly contained it.
categorical = [feature for feature in col_names if df[feature].dtypes == 'O']

## Numerical variables - discrete (unique value count < 25) and continuous

In [None]:
# Discrete numerical features: fewer than 25 distinct values (a missing value counts as one)
descrete = [
    column
    for column in numerical
    if df[column].nunique(dropna=False) < 25
]

In [None]:
# Per discrete feature, print: name : number of distinct values : number of missing values
for feature in descrete:
    distinct_count = len(df[feature].unique())
    missing_count = df[feature].isna().sum()
    print(feature,':',distinct_count,':',missing_count)

In [None]:
# Impute missing values of each discrete feature with that feature's median.
# The result is assigned back to the column: calling fillna(inplace=True) on `df[feature]`
# mutates a temporary selection, which raises a FutureWarning in pandas 2.x and silently
# leaves `df` unchanged once Copy-on-Write is active.
for feature in descrete:
    df[feature] = df[feature].fillna(df[feature].median())

### There are no null values in the discrete numerical features

In [None]:
# Continuous numerical features: the numerical features not classified as discrete
continous = [
    column
    for column in numerical
    if column not in descrete
]

In [None]:
# Continuous features that contain at least one missing value.
# The threshold is `> 0`: with `> 1`, a feature with exactly one missing value was
# silently left out of the list.
null_val_continous = [feature for feature in continous if df[feature].isna().sum() > 0]

In [None]:
# Print each continuous feature with missing data next to its missing-value count
for feature in null_val_continous:
    missing_count = df[feature].isna().sum()
    print(feature,missing_count)

### ['LotFrontage', 'MasVnrArea', 'GarageYrBlt'] have null values, so check their correlation with the other continuous numerical variables

In [None]:
# Pairwise correlation matrix of the continuous features
corr_continous = df.loc[:, continous].corr()

In [None]:
# Correlation of every continuous feature with GarageYrBlt, largest value first
corr_continous.loc['GarageYrBlt'].sort_values(ascending = False)

### GarageYrBlt and YearBuilt are positively correlated (YearBuilt: 0.825667), so a linear model can be applied
         Fit a linear model, predict the null values, and then update the dataframe

In [None]:
def lm_imputation(df,x,y):
    """Impute the missing values of column ``y`` from column ``x`` with a linear regression.

    A ``LinearRegression`` is fitted on the rows where both ``x`` and ``y`` are present.
    It then predicts ``y`` for the rows where ``y`` is missing (and ``x`` is present),
    rounds the predictions to whole numbers and writes them into ``df`` in place.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to update in place.
    x : str
        Name of the predictor column.
    y : str
        Name of the column whose missing values are imputed.

    Returns
    -------
    None
        ``df`` is modified in place. If ``y`` has nothing to impute, ``df`` is left untouched.
    """
    # Boolean masks select rows by position-independent alignment, so this also works
    # when the frame does not carry the default 0..n-1 index.
    x_present = df[x].notna()
    rows_to_fill = df[y].isna() & x_present
    rows_to_fit = df[y].notna() & x_present

    # Nothing to impute: predicting on an empty array would raise.
    if not rows_to_fill.any():
        return

    # Imported at the point of use so the helper does not depend on a later cell having run.
    from sklearn.linear_model import LinearRegression

    x_fit = df.loc[rows_to_fit, x].to_numpy().reshape(-1,1)
    y_fit = df.loc[rows_to_fit, y].to_numpy()

    # Fit the linear model on the complete rows
    lm = LinearRegression()
    lm.fit(x_fit,y_fit)

    # Predict the missing targets; the result is 1-D so it assigns cleanly into the column
    x_fill = df.loc[rows_to_fill, x].to_numpy().reshape(-1,1)
    df.loc[rows_to_fill, y] = np.round(lm.predict(x_fill),0)

In [None]:
from sklearn.linear_model import LinearRegression
# Impute the missing GarageYrBlt values from YearBuilt
x, y = 'YearBuilt', 'GarageYrBlt'
lm_imputation(df, x, y)

### Since 'LotFrontage' and 'MasVnrArea' have no relationship with the other variables, impute them with the median, as there are outliers

In [None]:
# Median imputation, assigned back to the column: fillna(inplace=True) on a column selection
# is chained in-place mutation and does not update `df` under pandas Copy-on-Write.
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())

In [None]:
# Median imputation, assigned back to the column: fillna(inplace=True) on a column selection
# is chained in-place mutation and does not update `df` under pandas Copy-on-Write.
df['MasVnrArea'] = df['MasVnrArea'].fillna(df['MasVnrArea'].median())

### Check for normality in continuous variables

In [None]:
import scipy.stats as stats

In [None]:
# Snapshot of `df` at this stage.
# NOTE: `df1` is assigned again from `df.copy()` further down, before it is first read.
df1 = df.copy()

In [None]:
# For every continuous feature: histogram (left) and normal QQ plot (right) of the raw values
for feature in continous:
    fig, (hist_ax, qq_ax) = plt.subplots(1, 2, figsize=(15, 6))

    hist_ax.hist(df[feature], bins=25)
    hist_ax.set_title(feature)

    # probplot draws onto the given axes and sets its own title and axis labels
    stats.probplot(df[feature], dist='norm', plot=qq_ax)
    plt.show()

In [None]:
# For every continuous feature: histogram and normal QQ plot of the log-transformed values
for feature in continous:
    # log(x + 1) so that zero-valued observations stay finite
    log_values = np.log(df[feature]+1)

    plt.figure(figsize=(15,6))
    plt.subplot(1,2,1)
    plt.hist(log_values,bins = 25)
    plt.title(feature)

    plt.subplot(1,2,2)
    # The QQ plot must use the transformed values as well; plotting the raw column here
    # only repeated the previous cell and said nothing about normality after the log.
    stats.probplot(log_values,dist = 'norm',plot = plt)
    plt.show()

### The following features were found to follow a Gaussian distribution after log transformation

In [None]:
# Continuous features that are log-transformed in the following cell
continous_log = ['LotFrontage','GarageArea','GrLivArea','1stFlrSF','TotalBsmtSF','LotArea']

In [None]:
# Replace the selected features with log(x + 1) in one vectorised step.
# NOTE: re-running this cell applies the transform a second time.
df[continous_log] = np.log(df[continous_log] + 1)

In [None]:
# Continuous features that were not selected for the log transformation
continous_1 = [
    column
    for column in continous
    if column not in continous_log
]

In [None]:
# After the linear-regression imputation GarageYrBlt holds float values; a year cannot be
# fractional, so cast the column to int (this requires that no missing values remain in it).
df['GarageYrBlt'] = df['GarageYrBlt'].astype(int)

In [None]:
# Fill whatever missing values are left in any numerical feature with the feature's median.
# Assigning back (rather than fillna(inplace=True) on `df[feature]`) avoids chained in-place
# mutation, which does not update `df` under pandas Copy-on-Write.
for feature in numerical:
    df[feature] = df[feature].fillna(df[feature].median())

In [None]:
# Missing-value count per numerical feature (sanity check after imputation)
df[numerical].isna().sum()

# Numerical features are done; now the categorical features

# Similar categorical - the ordinal features that share the same unique categories


In [None]:
# Ordinal features that share the quality grades 'Fa','TA','Gd','Ex', picked by name pattern.
# `and` binds tighter than `or`, so the exclusion list only guards the 'Qu' test (the
# parentheses below make that explicit); Condition1, Condition2 and SaleCondition are
# therefore still selected here through the 'Cond' test.
similar_categorical = [
    feature
    for feature in categorical
    if 'Qual' in feature
    or 'Cond' in feature
    or 'QC' in feature
    or ('Qu' in feature and feature not in ['Condition1','Condition2','SaleCondition'])
]

In [None]:
# Condition1, Condition2 and SaleCondition match the name pattern but are not quality
# scales, so remove them by name. A positional slice depended on the column order and
# chopped off valid features whenever the cell was run a second time.
similar_categorical = [
    feature
    for feature in similar_categorical
    if feature not in ('Condition1', 'Condition2', 'SaleCondition')
]

In [None]:
def ordinal_encoding(df, feature, sorted_list):
    """Encode ``df[feature]`` in place as the 1-based rank of each value in ``sorted_list``.

    ``sorted_list`` holds the categories ordered from lowest to highest rank.
    Values that do not appear in it (missing values included) become NaN.
    """
    rank_of = {category: rank for rank, category in enumerate(sorted_list, start=1)}
    df[feature] = df[feature].map(rank_of)
        

In [None]:
# Encode every shared-scale quality feature with the same ranking: Fa=1, TA=2, Gd=3, Ex=4.
# NOTE(review): any grade outside this list (a 'Po' grade, if the data has one) maps to NaN
# and is then indistinguishable from a missing value - confirm this is intended.
uniq_sim_categorical = ['Fa', 'TA', 'Gd', 'Ex']
for feature in similar_categorical:
    ordinal_encoding(df, feature, sorted_list=uniq_sim_categorical)
    

In [None]:
# For all shared-scale quality features a missing value is encoded as 0.
# Assigning back (rather than fillna(inplace=True) on `df[feature]`) avoids chained in-place
# mutation, which does not update `df` under pandas Copy-on-Write.
for feature in similar_categorical:
    df[feature] = df[feature].fillna(0)

In [None]:
# Missing-value count per shared-scale quality feature (sanity check after filling)
df[similar_categorical].isna().sum()

# Ordinal categorical features other than the similar ones
           LotShape Utilities LandSlope BsmtExposure BsmtFinType1 BsmtFinType2 CentralAir GarageFinish PavedDrive Fence

In [None]:
# (rows, columns) of the working frame
df.shape

In [None]:
# ord_categorical: ordinal categorical features whose category sets differ from feature to feature
ord_categorical = ['LotShape', 'Utilities','LandSlope','BsmtExposure',
                   'BsmtFinType1','BsmtFinType2','CentralAir','GarageFinish','PavedDrive','Fence']

In [None]:
# Show each feature's distinct values and missing-value count, to decide the encoding order
for feature in ord_categorical:
    distinct_values = df[feature].unique()
    missing_count = df[feature].isna().sum()
    print(feature, distinct_values, missing_count)

# Arrange the ordinal features in order (bad ... good) and run the ordinal_encoding function on each

In [None]:
# Category order per feature, listed from worst (rank 1) to best.
# Categories absent from a list (missing values included) are encoded as NaN.
ordinal_orders = {
    'LotShape': ['IR3', 'IR2', 'IR1', 'Reg'],
    'Utilities': ['NoSeWa', 'AllPub'],
    'LandSlope': ['Sev', 'Mod', 'Gtl'],
    'BsmtExposure': ['No', 'Mn', 'Av', 'Gd'],
    'BsmtFinType1': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'BsmtFinType2': ['Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ'],
    'CentralAir': ['N', 'Y'],
    'GarageFinish': ['Unf', 'RFn', 'Fin'],
    'PavedDrive': ['N', 'P', 'Y'],
    'Fence': ['MnWw', 'GdWo', 'MnPrv', 'GdPrv'],
}

for feature, worst_to_best in ordinal_orders.items():
    ordinal_encoding(df, feature, worst_to_best)

In [None]:
# For all features in ord_categorical a missing value is encoded as 0.
# Assigning back (rather than fillna(inplace=True) on `df[feature]`) avoids chained in-place
# mutation, which does not update `df` under pandas Copy-on-Write.
for feature in ord_categorical:
    df[feature] = df[feature].fillna(0)

In [None]:
# Missing-value count per feature in ord_categorical (sanity check after filling)
df[ord_categorical].isna().sum()

# All ordinal features

In [None]:
# Work on a copy from here on: the nominal-feature encoding below modifies `df1`, not `df`
df1 = df.copy()

In [None]:
# Every ordinal feature: the shared-scale quality features followed by the remaining ordinal ones
ordinal_features = [*similar_categorical, *ord_categorical]

## All nominal features and multi-category nominal features

In [None]:
# Multi-category nominal features, i.e. those with more than 10 unique values
categorical_multi = ['Neighborhood','Exterior1st','Exterior2nd']

In [None]:
# Object-dtype columns of df1, leaving out MiscFeature and the multi-category features
categorical_1 = [
    column
    for column in col_names
    if column not in ('MiscFeature', 'Neighborhood', 'Exterior1st', 'Exterior2nd')
    and df1[column].dtypes == 'O'
]

In [None]:
def multi_categorical_encoding(data,feature,top_n=10,prefix=False):
    """Add 0/1 indicator columns to ``data`` for the most frequent categories of ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame to extend in place; ``data[feature]`` itself is kept.
    feature : str
        Name of the categorical column to encode.
    top_n : int, default 10
        Number of most frequent categories that get an indicator column.
    prefix : bool, default False
        If False, each indicator column is named after the bare category value.
        Encoding two features that share category names then makes the second call
        overwrite the first call's columns. Pass True to name the columns
        ``"<feature>_<category>"`` and keep them distinct.

    Returns
    -------
    None
    """
    frequent = data[feature].value_counts().sort_values(ascending = False).index[:top_n]
    for category in frequent:
        column = f'{feature}_{category}' if prefix else category
        data[column] = np.where(data[feature] == category,1,0)

In [None]:
# Add the indicator columns for every multi-category feature to df1
for feature in categorical_multi:
    multi_categorical_encoding(data=df1, feature=feature)

In [None]:
# The original multi-category columns are no longer needed once encoded
df1.drop(columns=categorical_multi, inplace=True)

## Label encoding for the remaining categorical features

In [None]:
# MiscFeature is dropped because about 95% of its values are missing
df1.drop(columns=['MiscFeature'], inplace=True)

In [None]:
# Column names of df1 after the encoding and drops above
col_names_df1 = list(df1.columns)

In [None]:
# Columns of df1 that are still object-typed, i.e. not yet encoded
df1_categorical = [
    column
    for column in col_names_df1
    if df1[column].dtypes == 'O'
]

In [None]:
# Print each remaining categorical feature with its missing-value count
for feature in df1_categorical:
    missing_count = df1[feature].isna().sum()
    print(feature,missing_count)

In [None]:
# '0' stands for a non-existent item (e.g. NA in Alley). It is the string '0' because
# LabelEncoder needs a column to be entirely strings or entirely ints.
# Assigning back (rather than fillna(inplace=True) on `df1[feature]`) avoids chained in-place
# mutation, which does not update `df1` under pandas Copy-on-Write.
# NOTE(review): this loops over every column, so a numeric column that still had missing
# values would receive the string '0' as well - confirm none remain at this point.
for feature in col_names_df1:
    df1[feature] = df1[feature].fillna('0')

In [None]:
from sklearn.preprocessing import LabelEncoder
# One encoder instance, re-fitted for every column it is applied to below
le = LabelEncoder()

In [None]:
# Replace every remaining categorical column with integer labels.
# The encoder is re-fitted per column, so the codes are only meaningful within a column.
for feature in df1_categorical:
    df1[feature] = le.fit(df1[feature]).transform(df1[feature])

In [None]:
# Write the fully encoded frame to CSV.
# NOTE(review): absolute, machine-specific path; with to_csv's default index=True the
# DataFrame index is written as an extra unnamed first column.
df1.to_csv('C:\\Users\\test\\Desktop\\test1.csv')