In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, cross_val_score, KFold



In [None]:
# Load the training data; the first CSV column becomes the row index.
df = pd.read_csv("train.csv", index_col=0)
# Preview only the first rows instead of rendering the whole frame.
df.head()

In [None]:
#Filter feature data
# Empty accumulator consumed by the encoding cell further down.
X_final = pd.DataFrame()

# Columns removed from the feature set entirely.
dropped_columns = ['Fence', 'Alley', 'PoolQC', 'MiscFeature', 'FireplaceQu']
df = df.drop(columns=dropped_columns)

# Numeric gaps: column mean for the two area/length columns, a fixed year for
# the garage build year.
for numeric_column in ("LotFrontage", "MasVnrArea"):
    df[numeric_column] = df[numeric_column].fillna(df[numeric_column].mean())
df["GarageYrBlt"] = df["GarageYrBlt"].fillna(2001)

# Categorical gaps: fill each column with its most frequent value.
mode_filled_columns = [
    'BsmtExposure', 'BsmtQual', 'BsmtCond',
    'GarageCond', 'GarageQual', 'GarageType', 'GarageFinish',
    'BsmtFinType1', 'BsmtFinType2',
    'Electrical', 'MasVnrType',
]
for categorical_column in mode_filled_columns:
    most_frequent = df[categorical_column].mode()[0]
    df[categorical_column] = df[categorical_column].fillna(most_frequent)

# Remove rows that combine a low sale price with top quality and a very large
# living area.
outlier_mask = (
    (df["SalePrice"] < 200000)
    & (df["OverallQual"] > 8)
    & (df["GrLivArea"] > 4000)
)
drop_index = df[outlier_mask].index
df = df.drop(drop_index, axis=0)

# Separate the target from the features (after the outlier rows are gone).
y = df["SalePrice"].values
df = df.drop(columns='SalePrice')
print(df.columns)

# Remap sub_class to strings so it is one hot encoded instead of treated as a number.
df["MSSubClass"] = ["MS_" + str(sub_class) for sub_class in df["MSSubClass"]]


In [None]:

#Do one hot encoding for every text column; numeric columns pass through as-is
def encode_features(frame, force_categorical=("MSSubClass",)):
    """Build an all-numeric feature frame from ``frame``.

    Each column is classified by its dtype (not by the Python type of its
    first cell, which is wrong whenever the first row holds a missing value):

    * numeric / boolean columns are passed through unchanged;
    * object, string and categorical columns -- and any column named in
      ``force_categorical`` -- are one-hot encoded with ``pd.get_dummies``,
      dropping the first level (``drop_first=True``) and emitting float
      indicator columns.

    Pieces are concatenated in reverse source order (the output of the last
    source column comes first).

    Args:
        frame: DataFrame of raw feature columns.
        force_categorical: column names to one-hot encode regardless of dtype.

    Returns:
        A new DataFrame sharing ``frame``'s index. The state of any
        previously defined ``X_final`` is not used, so the cell can be
        re-run on its own.

    Raises:
        ValueError: if a column is neither numeric nor text/categorical.
    """
    pieces = []
    for column in frame.columns:
        selected_column = frame[[column]]
        col_dtype = selected_column.dtypes.iloc[0]

        is_text = (
            pd.api.types.is_object_dtype(col_dtype)
            or pd.api.types.is_string_dtype(col_dtype)
            or isinstance(col_dtype, pd.CategoricalDtype)
        )
        if is_text or column in force_categorical:
            # `columns=[column]` forces encoding even for a numeric dtype.
            pieces.append(
                pd.get_dummies(
                    selected_column,
                    columns=[column],
                    prefix=column,
                    drop_first=True,
                    dtype=float,
                )
            )
        elif pd.api.types.is_numeric_dtype(col_dtype):
            pieces.append(selected_column)
        else:
            raise ValueError(
                "Cannot encode column " + repr(column) + " with dtype " + str(col_dtype)
            )

    if not pieces:
        return pd.DataFrame(index=frame.index)
    # Single concat instead of a join per column.
    return pd.concat(pieces[::-1], axis=1)


X_final = encode_features(df)

# Force a float matrix so a stray non-numeric column fails here, loudly,
# rather than producing an object array for the estimator.
X = X_final.to_numpy(dtype=float)
print(X.shape)



In [None]:
# Print the feature-matrix and target shapes so a row-count mismatch is visible.
# No hold-out split is made here: the model is scored with k-fold
# cross-validation on the full X / y in the evaluation cell.
print(f"X: {X.shape} y: {y.shape}")



In [None]:
# Create the (still unfitted) LinearRegression estimator. Nothing is trained in
# this cell; the estimator is handed to cross_val_score in the evaluation cell.
reg = LinearRegression()


In [None]:
#Evaluate results
# Score the model with shuffled 10-fold cross-validation. No `scoring` argument
# is given, so cross_val_score falls back to the estimator's own `score`
# method (R^2 for LinearRegression).
# random_state is fixed so the fold assignment -- and therefore the mean score
# that gets appended to history.csv -- is reproducible and comparable between
# attempts instead of varying with every shuffle.
kf = KFold(n_splits=10, shuffle=True, random_state=42)
cv_scores = cross_val_score(reg, X, y, cv=kf)
cv_mean = np.mean(cv_scores)

print("Scores 10-fold CV: " + str(cv_scores))
print("Mean 10-fold CV: " + str(cv_mean))



In [None]:
#Before evaluating on history, retrieve the history

# Reuse the attempt log written by earlier runs when it exists; otherwise start
# an empty log with the expected columns and write it to disk straight away.
try:
    history = pd.read_csv("history.csv")
except FileNotFoundError:
    print("history.csv not found, creating new one")
    history = pd.DataFrame(columns=["Attempt", "mean_cv", "rmse"])
    history.to_csv('history.csv', index=False)
else:
    print("Found history.csv with rows amount: " + str(history.shape[0]))


In [None]:
#Evaluate based on history and visualize progress

# Append this run as the next row of the log. The attempt number is the row
# count before appending; rmse is stored as -1 because it is not computed here.
previous_rows = len(history.index)
current_attempt = int(previous_rows)
history.loc[previous_rows] = [current_attempt, cv_mean, -1]

# Show the rows just before the newest one, then the newest one, and persist.
print(history[-5:-1])
print("Current attempt: \n", history[-1:])
history.to_csv('history.csv', index=False)

# Mean CV score per attempt, on a fixed 0..1 score axis.
fig, ax = plt.subplots()
ax.plot(history["Attempt"], history["mean_cv"])
ax.set_xlabel(f"Attempts (current: {current_attempt:.0f})")
ax.set_xlim(0, len(history["Attempt"].values))
ax.set_ylabel(f"Mean cross_val_score (current: {cv_mean:.2f})")
ax.set_ylim(0, 1)
ax.set_title("10f Cross-Validation score progress")
plt.show()
