In [33]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk /kaggle/input recursively and print the full path of every file found.
input_file_paths = (
    os.path.join(root, name)
    for root, _, names in os.walk('/kaggle/input')
    for name in names
)
for path in input_file_paths:
    print(path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

We will first load the dataset using cuDF and then preprocess it to make the data usable.

In [34]:
import cuml
import cupy as cp
import cudf
import numpy as np

In [35]:
# Read the competition's training CSV into a cuDF DataFrame.
df_train = cudf.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df_train

In [36]:
# Read the competition's test CSV into a cuDF DataFrame.
df_test  = cudf.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")
# Bare last expression: the notebook renders the frame as the cell output.
df_test

In [37]:
# Print the info() report for the freshly loaded training frame.
df_train.info()

In [38]:
# Count the null entries in each column of the training frame.
df_train.isnull().sum()

We want to remove all the features that have more than 30 percent null values. We need features that provide at least 70 percent of their data; otherwise the feature must be removed, because we won't be able to find good replacements for the null values, and even if we did, the imputed values could change the results to a great extent.

In [39]:
# Fraction of null entries per column (null count divided by the number of rows).
missing_from_df_train = df_train.isnull().sum()/len(df_train)
# The message states the 0.3 cutoff that the comparison below actually uses
# (it previously claimed 45 percent).
print("The number of columns where loss is above 30 percent are : ")
# Number of columns whose missing fraction exceeds the cutoff (cell output).
(missing_from_df_train>0.3).sum()

In [40]:
# Keep only the entries (one per column) whose missing fraction is above 0.3.
above_cutoff = missing_from_df_train > 0.3
drop_from_train = missing_from_df_train[above_cutoff]


We will now drop those 5 columns.


In [41]:
# Drop the sparse columns selected in drop_from_train (its index holds the column names).
# The labels are copied to the host as a plain Python list via to_pandas(); the
# previous Index.to_array() call is deprecated in cuDF and missing from newer releases.
# `axis=1` is omitted because `columns=` already identifies the axis.
columns_to_drop = drop_from_train.index.to_pandas().tolist()
df_train.drop(columns=columns_to_drop, inplace=True)


In [42]:
# Print the info() report again, now that the sparse columns have been dropped.
df_train.info()

In [43]:
# Replace the row index with a fresh default one, discarding the old index.
df_train = df_train.reset_index(drop=True)


We have two types of features: categorical and numerical. For encoding purposes we need to know how many of each type are present, otherwise the later steps may run into computational errors.

In [44]:
# Every column whose dtype compares equal to 'O' (object) is treated as categorical.
categorical_features = [
    column for column in df_train.columns if df_train[column].dtype == 'O'
]

print("categorical_features : ", len(categorical_features))

Every feature is either categorical or numerical: we found 38 categorical features, which leaves 38 numerical features as well. Numerical values come in two types: 1. continuous (real numbers) and 2. discrete (integers). Knowing how many of each kind are present is an integral part of preprocessing, in case we apply an algorithm that works well only on discrete values, so we will count those too.

In [45]:
# Every column whose dtype is not 'O' (object) is treated as numerical.
numerical_features = [
    column for column in df_train.columns if df_train[column].dtype != 'O'
]

print("numerical_features : ", len(numerical_features))

# Numerical columns whose name contains 'Year' or 'Yr'.
year_features = []
for name in numerical_features:
    if 'Year' in name or 'Yr' in name:
        year_features.append(name)
year_features

In [46]:

# Non-year numerical columns with fewer than 25 distinct values count as discrete.
discrete_features = []
for column in numerical_features:
    if column in year_features:
        continue
    if len(df_train[column].unique()) < 25:
        discrete_features.append(column)
len(discrete_features)

In [47]:
# Whatever numerical column is neither discrete nor a year column is continuous.
not_continuous = set(discrete_features) | set(year_features)
continuos_features = [
    column for column in numerical_features if column not in not_continuous
]
len(continuos_features)

In [48]:
# Print the info() report for df_train once more before inspecting the categorical columns.
df_train.info()

For encoding, we should find out the number of unique categories in each categorical feature.


In [49]:
# Report how many distinct values each categorical column holds.
for column in categorical_features:
    n_categories = len(df_train[column].unique())
    print(f'The feature is {column} and no of unique categories are {n_categories}')

Now it's time to fill in the missing categorical values and replace the missing ones in the numerical features.
First we have to find out which columns have missing values; for that we apply isnull().sum() to every feature and compute the fraction that is missing.

It's up to us what we replace them with. We can replace missing categorical values with "missing_values" or any other string, but that depends on the amount of data missing.

In [50]:
# Object-dtype columns that contain at least one null entry.
categorical_features_null_values = []
for column in df_train.columns:
    if df_train[column].dtype == 'O' and df_train[column].isnull().sum() > 0:
        categorical_features_null_values.append(column)

# Print the missing fraction (null count / column length) for each of them.
for column in categorical_features_null_values:
    missing_share = df_train[column].isnull().sum() / len(df_train[column])
    print(f"Amount of missing values in {column} is {missing_share}")

Since the data loss is less than 10 percent, we can replace the missing entries with "Missing_values".

In [51]:
def replace_missing_categorical_features(dataset, features, fill_value='Missing_values'):
    """Return a copy of ``dataset`` with nulls in ``features`` replaced by a placeholder.

    The input frame is not modified; all work happens on a copy.

    Parameters
    ----------
    dataset : DataFrame
        Frame that supports ``copy()``, column selection by list, and ``fillna``.
    features : list of str
        Names of the columns whose null entries should be filled.
    fill_value : str, optional
        Placeholder written into the null entries. Defaults to
        ``'Missing_values'``, the value this helper has always used.

    Returns
    -------
    DataFrame
        The filled copy.
    """
    data = dataset.copy()
    data[features] = data[features].fillna(fill_value)
    return data

In [52]:
# Fill nulls in every categorical column; the helper returns a filled copy,
# which is rebound to df_train here.
df_train = replace_missing_categorical_features(df_train, categorical_features)


We want to do the same with the numerical values, but in their case we can't simply put zero, as that may change the results to a great extent. So we replace the missing values with the mean, median, or mode; the mode seems the better option, since a house is likely to have a feature value similar to most of the others.

In [53]:
# Numerical columns that contain at least one null entry.
numerical_features_missing_values = [feature for feature in numerical_features if df_train[feature].isnull().sum() > 0]

for feature in numerical_features_missing_values:
    # mode() returns a Series (there may be ties). Passing that Series straight to
    # fillna() fills by matching index labels, so only the rows whose index appears
    # in the mode result would be filled. Take the first mode as a scalar so every
    # null in the column is replaced.
    most_frequent_value = df_train[feature].mode().iloc[0]
    df_train[feature] = df_train[feature].fillna(most_frequent_value)

# Show which columns were imputed (cell output).
numerical_features_missing_values
   
    

It is essential to scale the inputs, so I used a standard scaler.

In [54]:
from cuml.preprocessing import StandardScaler

# Year columns are deliberately left unscaled: a later cell turns them into ages
# with `YrSold - <year column>`, which is only meaningful on raw calendar years.
# Standardizing them first would subtract z-scores that have different means and
# standard deviations.
features_to_scale = [feature for feature in numerical_features if feature not in year_features]

s = StandardScaler()
df_train[features_to_scale] = s.fit_transform(df_train[features_to_scale])

df_train[numerical_features]

In [55]:
from cuml.preprocessing import LabelEncoder

# One encoder object is re-fitted on each categorical column in turn, and the
# column is overwritten with the encoder's output.
enc = LabelEncoder()
for column in categorical_features:
    encoded_column = enc.fit_transform(df_train[column])
    df_train[column] = encoded_column

df_train[categorical_features].head()

In [56]:
# Overwrite each listed year column with the difference YrSold - <column>.
sold_year = df_train['YrSold']
for year_column in ('YearBuilt', 'YearRemodAdd', 'GarageYrBlt'):
    df_train[year_column] = sold_year - df_train[year_column]
    
    
    

In [57]:
# Print the info() report for df_train after the preprocessing cells above.
df_train.info()

Choosing the important features will help make a better prediction.

In [58]:
# Columns kept for modelling, in their original order; the last entry is the target.
selected_columns = [
    "Street",
    "OpenPorchSF",
    "WoodDeckSF",
    "OverallQual",
    "YearBuilt",
    "YearRemodAdd",
    "ExterQual",
    "TotalBsmtSF",
    "1stFlrSF",
    "GrLivArea",
    "PoolArea",
    "Fireplaces",
    "RoofStyle",
    "FullBath",
    "TotRmsAbvGrd",
    "GarageCars",
    "GarageArea",
    "MSZoning",
    "Utilities",
    "BldgType",
    "Heating",
    "KitchenQual",
    "SaleCondition",
    "LandSlope",
    "SalePrice",
]
df_train = df_train[selected_columns]

In [59]:
from sklearn.model_selection import train_test_split

# Separate the target column from the predictors, then hold out 20% of the rows
# for validation with a fixed random_state.
target_column = 'SalePrice'
Y = df_train[target_column]
X = df_train.drop(target_column, axis=1)
x_train, x_valid, y_train, y_valid = train_test_split(
    X, Y, random_state=42, test_size=0.2
)

In [60]:
# Count null entries in the validation target.
y_valid.isnull().sum()

In [61]:
from cuml.linear_model import LinearRegression

# Solver names tried in turn as the `algorithm` argument of LinearRegression.
algo = ['svd', 'eig', 'qr', 'svd-qr', 'svd-jacobi']

for solver in algo:
    # Fit a fresh model with this solver on the training split.
    lr = LinearRegression(algorithm=solver, fit_intercept=True, normalize=False)
    reg = lr.fit(x_train, y_train)
    print(reg.coef_)

    # Predict on the validation split and show the raw predictions.
    preds = lr.predict(x_valid)
    print(preds)

    # Score the predictions against the validation target.
    regression_metrics = cuml.metrics.regression
    MSE = regression_metrics.mean_squared_error(y_valid, preds)
    MAE = regression_metrics.mean_absolute_error(y_valid, preds)
    R2_Score = regression_metrics.r2_score(y_valid, preds)

    print(f"for algo {solver} MSE is {MSE}  MAE is {MAE} r2_score is {R2_Score}")
