In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.preprocessing import OneHotEncoder



In [None]:
# Load the training data; the first CSV column is used as the row index.
df = pd.read_csv("train.csv", index_col=0)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
# Build the feature matrix X and the target vector y from df.

# Columns that are passed to the model unchanged.
X_unproc = df[["LotArea", "YrSold"]]
# Columns that are one-hot encoded before being passed to the model.
ohe_list = ["Neighborhood"]

# One-hot encode every column listed in ohe_list.
# - Passing the Series (df[column]) makes get_dummies encode it whatever its
#   dtype, so numerically coded categories are handled too.
# - dtype=float keeps the dummy columns numeric; with boolean dummies next to
#   integer columns, `.values` below would produce an object-dtype array.
ohe_frames = [
    pd.get_dummies(df[column], prefix=column, dtype=float)
    for column in ohe_list
]

# Unprocessed columns first, then the dummy columns; rows are aligned on df's index.
# This also works when ohe_list is empty (only X_unproc is concatenated).
X_final = pd.concat([X_unproc, *ohe_frames], axis=1)
X = X_final.values

y = df["SalePrice"].values
print(X.shape)




In [None]:
# Split train and test data (70% train / 30% test).
# The fixed random_state makes the split identical on every run, so metrics
# from different attempts are computed on the same held-out rows.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

print(X_train.shape)



In [None]:
# Fit a LinearRegression model on the training split.
# fit() returns the estimator itself, so construction and fitting are chained.
reg = LinearRegression().fit(X_train, y_train)
reg


In [None]:
# Evaluate the fitted model on the held-out split and with 6-fold cross-validation.
print(X_test.shape)
y_pred = reg.predict(X_test)
r_squared = reg.score(X_test, y_test)  # R^2 on the test split
# Take the square root explicitly: the `squared=False` flag of
# mean_squared_error is deprecated and has been removed in recent
# scikit-learn releases.
rmse = np.sqrt(mean_squared_error(y_test, y_pred))

# Seeded shuffling keeps the folds (and therefore cv_mean) reproducible.
kf = KFold(n_splits=6, shuffle=True, random_state=42)
cv_scores = cross_val_score(reg, X, y, cv=kf)
cv_mean = np.mean(cv_scores)

print("Root mean squared error: " + str(rmse))
print("R squared: " + str(r_squared))
print("Mean 6-fold CV: " + str(cv_mean))




In [None]:
# Load the log of earlier attempts from history.csv.
# If the file does not exist yet, write an empty log with the expected
# columns so that later cells can append to it.
history = pd.DataFrame(columns=["Attempt", "mean_cv", "rmse"])
try:
    history = pd.read_csv("history.csv")
except FileNotFoundError:
    print("history.csv not found, creating new one")
    history.to_csv('history.csv', index=False)
else:
    print(f"Found history.csv with rows amount: {len(history)}")


In [None]:
# Record the current attempt in the history log and visualize progress.
# NOTE: this cell appends a row and rewrites history.csv, so every re-run
# logs the current metrics as a new attempt.

attempt = len(history)
history.loc[attempt] = [attempt, cv_mean, int(rmse)]
print(history[-5:-1])  # up to four attempts preceding the current one
print("Current attempt: ", history[-1:])
history.to_csv('history.csv', index = False)

fig, ax = plt.subplots()
# Markers keep the very first attempt visible: a line alone draws nothing
# for a single point.
ax.plot(history["Attempt"], history["mean_cv"], marker="o")
ax.set_xlim(0, len(history))
ax.set_ylim(0, 1)
ax.set_xlabel("Attempt")
ax.set_ylabel("Mean CV score")
plt.show()
