#### Importing relevant libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression 
from sklearn.model_selection import train_test_split

#### Importing grouped test data

In [2]:
from read_path_module import read_data_relative_path

df_train = read_data_relative_path(relative_dataset_path = './data/kaggle/created/homes_grouped.csv',
                        data_type='csv'
                       )

#### T

In [3]:
# df_filter = df_train.loc[(df_train['SalePrice'] > med - (1.5 * std) ) & (df_train['SalePrice'] < med + (mult * std))]


# np.log(df_filter['SalePrice'])

####  Part 1.7: Feature imputation: Part 1 => fill in true nulls        

- **Electrical:** only one missing value; use mode imputation since it's categorical ==> column to be dropped  
- **LotFrontage:** to use uniform random imputation   

In [4]:
df_train['LotFrontage'] = df_train['LotFrontage'].mask(df_train['LotFrontage'].isnull(), np.random.uniform(df_train['LotFrontage'].min(), df_train['LotFrontage'].max(), size = df_train['LotFrontage'].shape))
df_train['GarageYrBlt'] = df_train['GarageYrBlt'].mask(df_train['GarageYrBlt'].isnull(), np.random.uniform(df_train['GarageYrBlt'].min(), df_train['GarageYrBlt'].max(), size = df_train['GarageYrBlt'].shape))
df_train['MasVnrArea'] = df_train['MasVnrArea'].mask(df_train['MasVnrArea'].isnull(), np.random.uniform(df_train['MasVnrArea'].min(), df_train['MasVnrArea'].max(), size = df_train['MasVnrArea'].shape))


####  Part 1.8:  Feature imputation: Part 2 => fill None for nulls that means not having specified feature     

- fill in the nulls that have no respective indicated features 

In [5]:
df_train.fillna('Nothing', inplace = True)  

#### Drop non-needed columns and save a copy of the dataframe as df 

In [6]:
df = df_train.drop(['Id'], axis = 1)
df = df.drop(['Unnamed: 0'], axis = 1)   

#### Dummify categorical features

In [7]:
categorical = ['Alley', 'BldgType_group', 'BsmtCond_group', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'BsmtQual_group', 'CentralAir', 'Condition1_group', 'Electrical_group', 'ExterCond_group', 'ExterQual', 'Exterior1st_group', 'Exterior2nd_group', 'Fence', 'FireplaceQu', 'Foundation_group', 'GarageCond_group', 'GarageFinish', 'GarageQual', 'GarageType', 'HeatingQC_group', 'HouseStyle_group', 'KitchenQual', 'LandContour_group', 'LandSlope', 'LotConfig_group', 'LotShape_group', 'MS_Zoning_group', 'MasVnrType_group', 'Neighborhood', 'PavedDrive', 'PoolQC', 'RoofStyle_group', 'SaleCondition_group', 'SaleType_group']
df_1 = df[categorical]
df_dum = pd.get_dummies(df_1, drop_first = True)  # the prefix for the prefix of the new columns  

#### Define numerical dataframe and concatenate with dummified one

In [8]:
df_num = df[['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'YearRemodAdd', 
             'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
             'LowQualFinSF', 'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath', 'BedroomAbvGr',
             'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces', 'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 
             'OpenPorchSF', 'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal', 'MoSold', 
             'YrSold', 'SalePrice']] 

df = pd.concat([df_num, df_dum], axis = 1) 

#### Create target X and Y variables for multi-linear regression

In [11]:
# Filter out outliers
mult_upper = 4
mult_lower = 1.5
med = df['SalePrice'].median()
mean = df['SalePrice'].mean()
std = df['SalePrice'].std()
df = df.loc[(df['SalePrice'] > med - (mult_lower * std) ) & (df['SalePrice'] < med + (mult_upper * std))]


# create X and y      
X = df.drop(['SalePrice'], axis = 1)
y = np.log(df['SalePrice'])

#### Train test split and fit model

In [29]:
# train_test_split   
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)  

# linear regression   
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

# fit model
lin_reg = LinearRegression()
lin_reg.fit(X_train, y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=None, normalize=False)

#### Show various performance metrics

In [30]:
print(f'ordinary linear regression score: {lin_reg.score(X_train, y_train)}', '\n')  
print(f'ordinary linear regression intercept: {np.exp(lin_reg.intercept_) }', '\n')
coefficients = pd.DataFrame(np.exp(lin_reg.coef_), X.columns, columns = ['Coefficients'])   
coefficients 

ordinary linear regression score: 0.9084664661511658 

ordinary linear regression intercept: 661663.2732275296 



Unnamed: 0,Coefficients
MSSubClass,0.999731
LotFrontage,0.999999
LotArea,1.000002
OverallQual,1.047390
OverallCond,1.045398
...,...
RoofStyle_group_Shed/Flat/Hip/Mansard,0.999346
SaleCondition_group_Normal/Alloca,1.026211
SaleCondition_group_Nothing,0.952053
SaleCondition_group_Parital,0.962015


In [27]:
predictions = lin_reg.predict(X_test)
comparison = pd.DataFrame({'Actual': np.exp(y_test), 'Predictions': np.exp(predictions)})
comparison.head(5)  

Unnamed: 0,Actual,Predictions
1000,82000.0,77928.798259
1333,125500.0,121153.641075
218,311500.0,260004.957476
1323,82500.0,105381.798607
902,180000.0,187931.26789


#### T

#### T

#### T

#### T

#### T

#### T