In [None]:
import numpy as np
import pandas as pd
import warnings
warnings.filterwarnings("ignore", message="invalid value encountered in greater")
warnings.filterwarnings("ignore", message="invalid value encountered in less")
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn import preprocessing

# from itertools import combinations | Consider in the future

import os
# Show every file available under the Kaggle input directory.
input_paths = (
    os.path.join(folder, name)
    for folder, _subdirs, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_paths:
    print(path)

# Read the training and test CSVs; the regression target is the SalePrice
# column of the training frame.
test_set = pd.read_csv('/kaggle/input/home-data-for-ml-course/test.csv')
df = pd.read_csv('/kaggle/input/home-data-for-ml-course/train.csv')
y = df['SalePrice']

# Preview a handful of random training rows.
df.sample(5)

In [None]:
# Keep only the int64 columns. The list intentionally still contains the
# target ('SalePrice') because later cells read it back out of `int_l`.
column_list = [col for col in df.columns if df[col].dtype == 'int64']

int_l = df[column_list]

# Scatter each integer feature against the sale price in its own panel.
# Drawing all of them onto the same implicit pyplot axes overlays features
# with unrelated x-scales and leaves only the last xlabel visible, so each
# feature gets a dedicated subplot instead. The target itself is skipped:
# SalePrice against SalePrice carries no information.
sale_price = df['SalePrice']
plot_columns = [col for col in column_list if col != 'SalePrice']

n_plot_cols = 4
n_plot_rows = max(1, -(-len(plot_columns) // n_plot_cols))  # ceiling division
fig, axes = plt.subplots(
    n_plot_rows,
    n_plot_cols,
    figsize=(4 * n_plot_cols, 3 * n_plot_rows),
    squeeze=False,  # always a 2-D array of axes, even for a single row
)
panels = axes.ravel()

for ax, col in zip(panels, plot_columns):
    ax.scatter(df[col], sale_price, color='blue', s=8)
    ax.set_xlabel(col)
    ax.set_ylabel('Sale Price')

# Hide the grid cells left over when the feature count is not a multiple
# of the number of columns.
for ax in panels[len(plot_columns):]:
    ax.set_visible(False)

fig.tight_layout()
plt.show()

In [None]:
# Simple linear regression
#
# Fit one single-feature LinearRegression per integer column, score it on a
# fixed 80/20 hold-out split, and collect (column, R2, MSE) for every feature
# whose R2 is positive. The model fitted on `best_variable` is kept in
# `best_model` for the plotting / prediction cells.
TARGET = 'SalePrice'

candidates = []
best_model = None
best_variable = 'OverallQual'

# The target does not depend on the candidate feature, so build it once.
y = int_l[TARGET].to_numpy()

for col in column_list:
    # column_list also contains the target. Regressing SalePrice on itself
    # gives a meaningless perfect score that would top the ranking.
    if col == TARGET:
        continue

    X = int_l[[col]].to_numpy()

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

    regressor = linear_model.LinearRegression()
    regressor.fit(X_train, y_train)

    y_test_ = regressor.predict(X_test)

    # Metric signature: true values first, then predictions.
    MSE = mean_squared_error(y_test, y_test_)
    r2 = r2_score(y_test, y_test_)

    # Only features with a positive hold-out R2 are kept as candidates.
    if r2 > 0:
        # Uncomment for per-feature diagnostics:
        # print(f"{col}: R² = {r2:.3f}, MSE = {MSE:.2f}")
        # print('Coefficients: ', regressor.coef_[0])
        # print('Intercept: ', regressor.intercept_)
        candidates.append((col, r2, MSE))
        if col == best_variable:
            best_model = regressor

# Rank by R2 (best first) for the report.
candidates.sort(key=lambda x: x[1], reverse=True)
print('Based on R2', candidates)
# Re-rank by MSE (lowest first); `candidates` is left in this order.
candidates.sort(key=lambda x: x[2], reverse=False)
# print('Based on MSE', candidates)

# Recorded result (presumably for OverallQual - confirm): r2 = 0.661, mse = 2420651458

In [None]:
# Draw the single-feature regression line over the raw observations.
X = df.loc[:, 'OverallQual'].to_numpy()
y = df.loc[:, 'SalePrice'].to_numpy()

# Evaluate the line by hand from the stored slope and intercept.
slope = best_model.coef_[0]
offset = best_model.intercept_
y_pred = slope * X + offset

plt.scatter(X, y, color='blue')
plt.plot(X, y_pred, color='red', linewidth=2)
plt.xlabel("Overall Quality")
plt.ylabel("Sale Price")
plt.show()

In [58]:
# Multiple linear regression
#
# Fit a LinearRegression on a hand-picked subset of the integer features that
# correlate strongly with SalePrice, report the coefficients in the original
# feature units, and score the model on a fixed 80/20 hold-out split.

corr_matrix = int_l.corr()

# Columns whose absolute correlation with the target is at least 0.5
# (this index also contains 'SalePrice' itself).
correlations_with_target = corr_matrix['SalePrice']
high_corr_features = correlations_with_target[correlations_with_target.abs() >= 0.5].index

filtered_df = int_l[high_corr_features]
# print(filtered_df.corr())

# Manually chosen subset of the high-correlation columns.
mult_regr_columns = ['OverallQual', 'YearBuilt', '1stFlrSF', 'GrLivArea', 'FullBath']

X = filtered_df[mult_regr_columns].to_numpy()
y = df.SalePrice.to_numpy()

# Split BEFORE scaling so the scaler statistics come from the training rows
# only. Fitting the scaler on every row lets hold-out statistics leak into
# the preprocessing step.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=1)

std_scaler = preprocessing.StandardScaler()
X_train = std_scaler.fit_transform(X_train_raw)
X_test = std_scaler.transform(X_test_raw)
# Kept for compatibility with any cell that reads X_std: the full feature
# matrix transformed with the training-set statistics.
X_std = std_scaler.transform(X)

regressor = linear_model.LinearRegression()
regressor.fit(X_train, y_train)

coef_ = regressor.coef_
intercept_ = regressor.intercept_

# Undo the standardisation to express the model in the original units:
#   y = b0 + sum(b_i * (x_i - mean_i) / scale_i)
#     = (b0 - sum(b_i * mean_i / scale_i)) + sum((b_i / scale_i) * x_i)
# `scale_` is the divisor the scaler actually applied; unlike sqrt(var_) it
# is never 0 (constant columns get a scale of 1), so no division by zero.
means_ = std_scaler.mean_
std_devs_ = std_scaler.scale_

coef_original = coef_ / std_devs_
intercept_original = intercept_ - np.sum(means_ * coef_original)

print ('Coefficients: ', coef_original)
print ('Intercept: ', intercept_original)

y_pred = regressor.predict(X_test)

r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)

print(r2, mae)

# Recorded result: r2 = 0.811, mae = 24874

Coefficients:  [23939.82042311   486.39739302    34.89635059    47.47473368
 -3233.33345519]
Intercept:  -1031909.1805498635
0.8111004966194413 24874.04507516773


In [60]:
# Earlier single-feature variant, kept for reference:
#   test_X = test_set[[best_variable]].to_numpy()
#   test_predictions = best_model.predict(test_X)

# Standardise the competition features with the scaler that was already
# fitted in the modelling cell (transform only, no refit).
test_X = test_set.loc[:, mult_regr_columns].to_numpy()
test_X_std = std_scaler.transform(test_X)

# Predict sale prices with the multiple-regression model.
test_predictions = regressor.predict(test_X_std)
print(test_predictions)

# Write the Id / SalePrice pairs to the submission file.
output = pd.DataFrame({'Id': test_set['Id'], 'SalePrice': test_predictions})
output.to_csv('submission.csv', index=False)

[112186.36732664 170333.67505971 162379.00302547 ... 138717.68557426
 133360.14674622 228299.1323346 ]
