In [58]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Print the full path of every file found beneath the Kaggle input directory.
input_file_paths = (
    os.path.join(folder, file_name)
    for folder, _subdirs, file_names in os.walk('/kaggle/input')
    for file_name in file_names
)
for file_path in input_file_paths:
    print(file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [59]:
# Switch the working directory to the competition data folder.
DATA_DIR = "/kaggle/input/house-prices-advanced-regression-techniques"
os.chdir(DATA_DIR)

In [60]:
import cudf as pd
import cupy as np
import matplotlib.pyplot as plt
from cuml.metrics.regression import r2_score, mean_squared_error as mse, mean_absolute_error as mae
from cuml.preprocessing import StandardScaler
from cuml import LinearRegression
import seaborn as sns

In [61]:
# Load both CSV files, using the "Id" column as the row index.
train, test = (
    pd.read_csv(csv_name, index_col=["Id"])
    for csv_name in ("train.csv", "test.csv")
)

In [62]:


# Report the shape of each frame, one per line.
print(
    f'Shape of training data: {train.shape} ',
    f'Shape of testing data: {test.shape} ',
    sep='\n',
)

In [63]:
# Show the training frame's summary via its info() method.
train.info()

In [64]:
# Bar chart of the fraction of missing entries per training column, with
# horizontal reference lines drawn at 0.8 (red) and 0.5 (blue).
plt.figure(figsize=(25, 9))
missings = train.to_pandas().isnull().sum() / len(train)
missings.plot.bar()
for level, line_color in ((0.8, 'r'), (0.5, 'b')):
    plt.axhline(level, color=line_color)
plt.xticks(rotation="vertical")
plt.xlabel("Columns")
plt.ylabel("Missing Entries Ratio")
plt.show()

In [65]:
# Collect every column whose null count reaches 45% of the training rows.
null_count_cutoff = 0.45 * len(train)
columns_to_drop = []
for column_name in train.columns:
    if train[column_name].isnull().sum() >= null_count_cutoff:
        columns_to_drop.append(column_name)
print(columns_to_drop)


In [66]:
# Remove the columns listed in columns_to_drop from both frames, in place.
for frame in (train, test):
    frame.drop(columns=columns_to_drop, inplace=True)

In [67]:
# Report the shape of each frame again, one per line.
print(
    f'Shape of training data: {train.shape}',
    f'Shape of testing data: {test.shape}',
    sep='\n',
)

In [68]:
# Split the training columns by dtype: 'object' columns count as categorical,
# every other column as numerical.
numerical_features, categorical_features = [], []
for column in train.columns:
    if train[column].dtype == 'object':
        categorical_features.append(column)
    else:
        numerical_features.append(column)
print(f'Number of Numerical Features are {len(numerical_features)}')
print(f'Number of Categorical Features are {len(categorical_features)}')

In [69]:
# Annotated heatmap of the pairwise correlations between the numerical columns.
numeric_correlations = train[numerical_features].to_pandas().corr()
plt.figure(figsize=(30, 30))
sns.heatmap(numeric_correlations, annot=True)


In [70]:
# Drop GarageYrBlt and GarageArea from both frames, in place.
correlated_numerical_features = ['GarageYrBlt', 'GarageArea']
for frame in (train, test):
    frame.drop(columns=correlated_numerical_features, inplace=True)

In [71]:
# Impute missing categorical values with the most frequent TRAINING value.
# The same fill value is used for train and test so that a missing entry is
# treated identically in both frames; filling test from its own mode could
# substitute a different category than the one used for train.
# The lists printed before and after show which training columns still
# contain nulls.
print([feature for feature in categorical_features if train[feature].isna().sum() > 0])
for feature in categorical_features:
    most_frequent = train[feature].mode().iloc[0]
    train[feature] = train[feature].fillna(most_frequent)
    test[feature] = test[feature].fillna(most_frequent)
print([feature for feature in categorical_features if train[feature].isna().sum() > 0])

In [72]:
# Rebuild both feature lists from the current training columns; the target
# column 'SalePrice' is taken out of the numerical list.
is_object_column = {column: train[column].dtype == 'object' for column in train.columns}
categorical_features = [column for column, is_object in is_object_column.items() if is_object]
numerical_features = [column for column, is_object in is_object_column.items() if not is_object]
numerical_features.remove('SalePrice')


In [73]:
# Impute missing numerical values with the TRAINING median.
# The same fill value is used for train and test so that a missing entry is
# treated identically in both frames; filling test from its own median would
# substitute a different number than the one used for train.
# The lists printed before and after show which training columns still
# contain nulls.
print([feature for feature in numerical_features if train[feature].isna().sum() > 0])
for feature in numerical_features:
    train_median = train[feature].median()
    train[feature] = train[feature].fillna(train_median)
    test[feature] = test[feature].fillna(train_median)
print([feature for feature in numerical_features if train[feature].isna().sum() > 0])

In [74]:
# Display how many numerical feature names are in the list.
len(numerical_features)

In [75]:

# Separate the feature columns (X) from the float64 target series (Y).
X = train.drop(columns=["SalePrice"])
Y = pd.Series(train["SalePrice"], dtype="float64")

In [76]:
# Standardise the numerical columns with one scaler fitted on the train and
# test rows stacked together.
all_num_data = pd.concat([X[numerical_features], test[numerical_features]])
ss = StandardScaler()
ss.fit(all_num_data)


def _scale_numeric_columns(frame):
    """Return frame's numerical columns transformed by ``ss``, labelled with frame's index."""
    scaled = pd.DataFrame(ss.transform(frame[numerical_features]))
    scaled.index = frame.index
    scaled.columns = numerical_features
    return scaled


normalized_X = _scale_numeric_columns(X)
normalized_test = _scale_numeric_columns(test)

In [77]:
# Cast every categorical training column to the 'category' dtype.
train[categorical_features] = train[categorical_features].astype(
    {feature_name: 'category' for feature_name in categorical_features}
)

In [78]:
from cuml.preprocessing import LabelEncoder

# Encode each categorical column with ONE mapping shared by train and test.
# Fitting a separate encoder on the test column would derive the codes from
# the test labels alone, so the same label could be given different integers
# in the two frames whenever their label sets differ.
for feature in categorical_features:
    # Work on string views so both frames have the same dtype when stacked.
    train_labels = train[feature].astype('str')
    test_labels = test[feature].astype('str')
    encoder = LabelEncoder()
    # Fit on the union so that every label seen in either frame gets a code.
    encoder.fit(pd.concat([train_labels, test_labels]))
    normalized_X[feature] = encoder.transform(train_labels)
    normalized_test[feature] = encoder.transform(test_labels)
    

In [79]:
# Preview the first rows of the prepared training features.
normalized_X.head()

In [80]:
# Preview the first rows of the prepared test features.
normalized_test.head()

In [81]:
# Replace the feature frame and the target series with their .values arrays.
normalized_X, Y = normalized_X.values, Y.values

In [82]:
# Fit a LinearRegression with each solver listed in `algorithms` and print
# its error metrics. The metrics are computed on the same data the model was
# fitted on (no hold-out split is made here), so they are in-sample figures.
algorithms = ["svd", "eig", "qr", "svd-qr", "svd-jacobi"]
for algorithm in algorithms:
    lr = LinearRegression(fit_intercept= True, normalize= False, algorithm= algorithm)
    lr.fit(normalized_X, Y)
    Y_pred = lr.predict(normalized_X)
    print(f"For algorithm {algorithm}: ")
    # The three metrics share one output line, separated by single spaces.
    print(f"Mean Squared Error: {mse(Y, Y_pred)}", end= " ")
    print(f"r2_score: {r2_score(Y, Y_pred)}", end= " ")
    print(f"Mean Absolute Error: {mae(Y, Y_pred)}")

In [83]:
# Estimator used for the final fit, configured with the "eig" algorithm.
final_model = LinearRegression(fit_intercept= True, normalize= False, algorithm= "eig")


In [84]:
# Fit on normalized_X / Y, then predict for the prepared test features.
pred = final_model.fit(normalized_X, Y).predict(normalized_test)

In [85]:
# Build the submission table. The Ids are taken from the test frame's own
# index rather than assuming they start at 1461, and the predictions are
# passed as a plain array so the two columns are paired by position.
submission = pd.DataFrame({'Id': test.index.values, 'SalePrice': np.asarray(pred)})


In [None]:
## Kaggle notebook: https://www.kaggle.com/akshaykhandelwal2612/notebook7b05b0244e
## GitHub repository: https://github.com/Akshaykhandelwal2612/houseprice.git