In [11]:
import pandas as pd
import seaborn as sns
import numpy as np
import matplotlib.pyplot as plt

from scipy import stats
from scipy.stats import yeojohnson


from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor

from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler

In [2]:
df = pd.read_csv("./DF/train.csv")

In [3]:
# Null values for LotFrontage means that the houses have 0 feets since the street to the entrance, so the null values will be replaced with 0
df["LotFrontage"].fillna("None", inplace=True)

# Null values for Alley means that the houses dont have an alley entrance, so the null values will be replaced with None
df["Alley"].fillna("None", inplace=True)

# Null values for MasVnrType and MasVnrArea means that we dont know what kind of Masonry veneer type has the property or the area of it, since there are few null values here they will be replaced with None
df["MasVnrType"].fillna("None", inplace=True)
df["MasVnrArea"].fillna("None", inplace=True)

# FireplaceQu will be filled with None since it's asociate to the reason that there isnt fireplaces.
df["FireplaceQu"].fillna("None", inplace=True)

# BsmtQual, BsmtCond, BsmtExposure, BsmtFinType1, and BsmtFinType2 are related and im not sure how to treat it now, so i'll leave it as is
basement_columns=["BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2"]
df.loc[:, basement_columns] = df.loc[:, basement_columns].fillna("None")

# Since Electrical has only one null value I'll remove it (there are other fields of this row with null values)
null_index = df[df["Electrical"].isnull()].index
df.drop(null_index, axis=0, inplace=True)

# GarageType, GarageYrBlt, GarageFinish, GarageQual, GarageCond null values will be filled with None indicating that the hose doesnt have garage, it based on GarageCars and GarageArea that are 0 for those rows.
garage_columns = ["GarageType", "GarageYrBlt", "GarageFinish", "GarageQual", "GarageCond"]
df.loc[:, garage_columns] = df.loc[:, garage_columns].fillna("None")

# PoolQC, Fence and MiscFeature will be filled with None, indicating that there isnt pool, fence or anothe miscelaneous furniture, this desition is taking in account that other fields as the pool area that indicates the missing of it.
other_columns = ["PoolQC", "Fence" ,"MiscFeature"]
df.loc[:, other_columns] = df.loc[:, other_columns].fillna("None")

In [4]:
object_columns = df.select_dtypes(include=["object"])
df[object_columns.columns] = df[object_columns.columns].astype("category")

lbl = LabelEncoder()
for column in object_columns.columns:
    df[column] = lbl.fit_transform(df[column].astype("str"))

In [5]:
X = df.drop(["SalePrice"], axis=1)
y = df["SalePrice"]

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

train_data = X_train.join(y_train)

In [12]:
# Skewed Data
skewed = ["MSSubClass", "LotArea", "BsmtFinSF1", "BsmtUnfSF", "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "LotFrontage", "YearBuilt", "MasVnrArea", "GarageYrBlt"]

transformed_data = []
lambda_values = []

for column in skewed:
    data, lambda_value = stats.yeojohnson(train_data[column])

    transformed_data.append(data)
    lambda_values.append(lambda_value)

new_df = pd.DataFrame(transformed_data).T

In [14]:
# Renaming columns of new df to the same names as on original df
transformed_columns = dict()

for i in range(len(skewed)):
    transformed_columns[i] = skewed[i]

new_df.rename(columns = transformed_columns, inplace=True)

In [15]:
# Replacing the skewed data with the treated data
train_data.drop(skewed, axis=1, inplace=True)

for column in skewed:
    train_data[column] = new_df[column]

In [16]:
# Scaling Data
scaler = StandardScaler()

X_train_s = scaler.fit_transform(X_train)
X_test_s = scaler.fit_transform(X_test)

In [18]:
forest = RandomForestRegressor()
forest.fit(X_train_s, y_train)

y_pred = forest.predict(X_test_s)

# Mean Square Error
mse = mean_squared_error(y_test, y_pred)

# Root Mean Square Error
rmse = np.sqrt(mse)

# Accuracy of the model
acc = forest.score(X_test_s, y_test)

print("Rmse = ", rmse)
print("R2 = ", acc)

Rmse =  25462.706122363827
R2 =  0.8859015434503564
