# Housing Price Prediction

## Extraction and Data Preparation

In [None]:
import pandas as pd

# Location of the Excel workbook with the housing data; loaded with pd.read_excel.
DATA_PATH = "input/house/DS - Assignment Part 1 data set.xlsx"
# Seed passed as random_state to train_test_split so the split is repeatable.
RNG_SEED = 42

In [None]:
# Read the workbook into a DataFrame and preview its first ten rows.
df = pd.read_excel(io=DATA_PATH)
df.head(n=10)

In [None]:
from sklearn.model_selection import train_test_split

# Every column except the last is a feature; the last column is the target.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]

# Hold out 20% of the rows, shuffled with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    shuffle=True,
    random_state=RNG_SEED,
)

# reset_index() renumbers the rows from 0 and keeps the previous labels as a
# new leading "index" column. Only the feature frames are renumbered;
# y_train / y_test keep the labels of the original frame.
X_train, X_test = X_train.reset_index(), X_test.reset_index()

# Number of rows in each split.
(len(y_train), len(y_test))

## Analysis

In [None]:
# Distribution of the training target in 30 bins; the trailing semicolon
# keeps the notebook from echoing the call's return value.
y_train.hist(bins=30);

**Observation**: There is one outlier with a price above 100.

In [None]:
# Pairwise correlation between the features, skipping column 0 (the "index"
# column left behind by reset_index), rendered as a heat map.
feature_corr = X_train.iloc[:, 1:].corr()
feature_corr.style.background_gradient(cmap='hot')

## Training

In [None]:
def prepare_data(df, **kwargs):
    """Return *df* with the requested rows and/or columns removed.

    Recognised keyword options:
        drop_indexes: index label, or list of labels, of the rows to drop.
        drop_cols: column label, or list of labels, of the columns to drop.

    Rows are dropped before columns. The input object is never modified;
    when no recognised option is supplied the same object is returned.

    Unrecognised options do not change the result, but a warning is issued
    so that a misspelt option name (e.g. ``drop_col=``) does not silently
    turn the call into a no-op.
    """
    import warnings

    unknown = sorted(set(kwargs) - {"drop_indexes", "drop_cols"})
    if unknown:
        warnings.warn(
            f"prepare_data() ignoring unrecognised options: {', '.join(unknown)}",
            stacklevel=2,
        )

    df2 = df
    if "drop_indexes" in kwargs:
        # Positional argument of drop() refers to row labels (axis 0).
        df2 = df2.drop(kwargs["drop_indexes"])
    if "drop_cols" in kwargs:
        df2 = df2.drop(columns=kwargs["drop_cols"])

    return df2

In [None]:
# The same two columns are removed from both splits so that the training and
# test frames keep an identical set of features.
unused_columns = ["Distance from nearest Metro station (km)", "Transaction date"]
X_train = prepare_data(X_train, drop_cols=unused_columns)
X_test = prepare_data(X_test, drop_cols=unused_columns)
# Disabled: removing a single row from the target only.
# y_train = prepare_data(y_train, drop_indexes=270)

In [None]:
from lightgbm import LGBMRegressor
from sklearn.svm import SVR
from sklearn.feature_selection import VarianceThreshold
from sklearn.ensemble import RandomForestRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import PolynomialFeatures, MinMaxScaler, StandardScaler
from sklearn.linear_model import LinearRegression, Ridge, BayesianRidge
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline

# reset_index() after the split left an "index" column in X_train / X_test.
# It only records the original row label, so it is excluded here rather than
# being fed to the model as a feature. errors="ignore" keeps the cell working
# when that column is absent. The global frames themselves are left untouched.
train_features = X_train.drop(columns="index", errors="ignore")
test_features = X_test.drop(columns="index", errors="ignore")

pipe = Pipeline([
    ("scaler", StandardScaler()),
    # ("feature_generation", PolynomialFeatures(degree=2, interaction_only=True)),
    # ("feature_selection", VarianceThreshold(0.16)),
    # Seeded so that repeated runs of the notebook report the same score.
    ("regressor", RandomForestRegressor(random_state=RNG_SEED)),
])
pipe.fit(train_features, y_train)
y_pred = pipe.predict(test_features)
# RMSE on the held-out split. The square root is taken explicitly because the
# `squared=False` argument was deprecated in scikit-learn 1.4 and removed in 1.6.
mean_squared_error(y_test, y_pred) ** 0.5

In [None]:
# for i in X_train.columns[1:]:
#
#     plt.figure()
#     plt.title(f'{i}')
#     plt.hist(X_train[i], bins=20)

In [None]:
# y_train still carries the row labels of the original frame, whereas X_train
# was re-indexed after the split. Series.corr pairs values by index label, so
# correlating the two directly would match values from different rows. The
# target is therefore re-labelled with X_train's index to compare row-for-row
# (both come from the same split, so their row order is identical).
target = pd.Series(y_train.to_numpy(), index=X_train.index)
# Column 0 is skipped: it is the "index" column left by reset_index().
for col in X_train.columns[1:]:
    print(f"Correlation of {col} with Output is: {target.corr(X_train[col])}")