In [64]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/house-prices-advanced-regression-techniques/sample_submission.csv
/kaggle/input/house-prices-advanced-regression-techniques/data_description.txt
/kaggle/input/house-prices-advanced-regression-techniques/train.csv
/kaggle/input/house-prices-advanced-regression-techniques/test.csv


**Importing training and test datasets**

In [65]:
train_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/train.csv")
test_df = pd.read_csv("../input/house-prices-advanced-regression-techniques/test.csv")

**Preprocessing data**

In [66]:
# The nan values in each feature is found out along with its percentage
nan_count_df_1 = train_df.isnull().sum().sort_values(ascending = False)
nan_proportions_df_1 = ((train_df.isnull().sum() / train_df.isnull().count())*100)

In [67]:
nans_df_1 = pd.concat([nan_count_df_1, nan_proportions_df_1], 
                    keys = ["Total number of nans", 'Total percentage of nans'], axis = 1)

In [68]:
print(nans_df_1.head(10))

              Total number of nans  Total percentage of nans
PoolQC                        1453                 99.520548
MiscFeature                   1406                 96.301370
Alley                         1369                 93.767123
Fence                         1179                 80.753425
FireplaceQu                    690                 47.260274
LotFrontage                    259                 17.739726
GarageYrBlt                     81                  5.547945
GarageCond                      81                  5.547945
GarageType                      81                  5.547945
GarageFinish                    81                  5.547945


In [69]:
# The features with nan count more than 100 are dropped
train_df.drop((nans_df_1[nans_df_1["Total number of nans"] > 100]).index, axis =1, inplace = True)

train_df.isnull().sum().sort_values(ascending =  False).head(10)


GarageType      81
GarageCond      81
GarageYrBlt     81
GarageFinish    81
GarageQual      81
BsmtFinType2    38
BsmtExposure    38
BsmtFinType1    37
BsmtCond        37
BsmtQual        37
dtype: int64

In [70]:
train_df.select_dtypes(exclude = ["int", "float"]).isnull().sum()

MSZoning          0
Street            0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        8
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Heating           0
HeatingQC         0
CentralAir        0
Electrical        1
KitchenQual       0
Functional        0
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
PavedDrive        0
SaleType          0
SaleCondition     0
dtype: int64

In [71]:
# Drop the column Id as it is not needed
train_df.drop(['Id'], axis = 1, inplace = True)

In [72]:
#  Split the columns to numerical and categorical types

num_df_1 = [col for col in train_df.select_dtypes(include = [np.number])]
cat_df_1 = [col for col in train_df.select_dtypes(exclude = ['int', 'float'])]

In [73]:
train_df.select_dtypes(include = [np.number]).isnull().sum()

MSSubClass        0
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
MasVnrArea        8
BsmtFinSF1        0
BsmtFinSF2        0
BsmtUnfSF         0
TotalBsmtSF       0
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      0
BsmtHalfBath      0
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageYrBlt      81
GarageCars        0
GarageArea        0
WoodDeckSF        0
OpenPorchSF       0
EnclosedPorch     0
3SsnPorch         0
ScreenPorch       0
PoolArea          0
MiscVal           0
MoSold            0
YrSold            0
SalePrice         0
dtype: int64

In [74]:
for col in num_df_1:
    train_df[col].fillna(train_df[col].median(), inplace = True)
    
train_df.select_dtypes(include = [np.number]).isnull().sum()

MSSubClass       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
SalePrice        0
dtype: int64

In [75]:
train_df.select_dtypes(exclude = [np.number]).isnull().sum()

MSZoning          0
Street            0
LotShape          0
LandContour       0
Utilities         0
LotConfig         0
LandSlope         0
Neighborhood      0
Condition1        0
Condition2        0
BldgType          0
HouseStyle        0
RoofStyle         0
RoofMatl          0
Exterior1st       0
Exterior2nd       0
MasVnrType        8
ExterQual         0
ExterCond         0
Foundation        0
BsmtQual         37
BsmtCond         37
BsmtExposure     38
BsmtFinType1     37
BsmtFinType2     38
Heating           0
HeatingQC         0
CentralAir        0
Electrical        1
KitchenQual       0
Functional        0
GarageType       81
GarageFinish     81
GarageQual       81
GarageCond       81
PavedDrive        0
SaleType          0
SaleCondition     0
dtype: int64

In [76]:
for col in cat_df_1:
    train_df[col].fillna(train_df[col].value_counts().index[0], inplace = True)
train_df.select_dtypes(exclude = [np.number]).isnull().sum()

MSZoning         0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
SaleType         0
SaleCondition    0
dtype: int64

In [77]:
train_df.isnull().sum().count()

74

In [78]:
# Converting categorical data to numerical values
from sklearn.preprocessing import LabelEncoder
for feature in cat_df_1:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature].astype(str))

train_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   MSSubClass     1460 non-null   int64  
 1   MSZoning       1460 non-null   int64  
 2   LotArea        1460 non-null   int64  
 3   Street         1460 non-null   int64  
 4   LotShape       1460 non-null   int64  
 5   LandContour    1460 non-null   int64  
 6   Utilities      1460 non-null   int64  
 7   LotConfig      1460 non-null   int64  
 8   LandSlope      1460 non-null   int64  
 9   Neighborhood   1460 non-null   int64  
 10  Condition1     1460 non-null   int64  
 11  Condition2     1460 non-null   int64  
 12  BldgType       1460 non-null   int64  
 13  HouseStyle     1460 non-null   int64  
 14  OverallQual    1460 non-null   int64  
 15  OverallCond    1460 non-null   int64  
 16  YearBuilt      1460 non-null   int64  
 17  YearRemodAdd   1460 non-null   int64  
 18  RoofStyl

In [79]:
nan_count_2 = test_df.isnull().sum().sort_values(ascending = False)
nan_proportion_2 = (test_df.isnull().sum()/test_df.isnull().count())*100

nans_df_2 = pd.concat([nan_count_2, nan_proportion_2], keys = ["Total no of nans", "Proportion"], axis =1)

nans_df_2.head(10)

Unnamed: 0,Total no of nans,Proportion
PoolQC,1456,99.79438
MiscFeature,1408,96.504455
Alley,1352,92.66621
Fence,1169,80.123372
FireplaceQu,730,50.03427
LotFrontage,227,15.558602
GarageYrBlt,78,5.346127
GarageQual,78,5.346127
GarageFinish,78,5.346127
GarageCond,78,5.346127


In [80]:
# Drop all features with nan count > 100 in test set
test_df.drop((nans_df_2[nans_df_2["Total no of nans"] >100]).index, axis =1, inplace = True)
test_df.isnull().sum().sort_values(ascending = False)


GarageYrBlt      78
GarageFinish     78
GarageQual       78
GarageCond       78
GarageType       76
                 ..
CentralAir        0
Electrical        0
1stFlrSF          0
2ndFlrSF          0
SaleCondition     0
Length: 74, dtype: int64

In [81]:
num_df_2 = [col for col in test_df.select_dtypes(include = [np.number])]
cat_df_2 = [col for col in test_df.select_dtypes(exclude = ['int', 'float'])]

In [82]:
test_df.select_dtypes(include = [np.number]).isnull().sum()

Id                0
MSSubClass        0
LotArea           0
OverallQual       0
OverallCond       0
YearBuilt         0
YearRemodAdd      0
MasVnrArea       15
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
1stFlrSF          0
2ndFlrSF          0
LowQualFinSF      0
GrLivArea         0
BsmtFullBath      2
BsmtHalfBath      2
FullBath          0
HalfBath          0
BedroomAbvGr      0
KitchenAbvGr      0
TotRmsAbvGrd      0
Fireplaces        0
GarageYrBlt      78
GarageCars        1
GarageArea        1
WoodDeckSF        0
OpenPorchSF       0
EnclosedPorch     0
3SsnPorch         0
ScreenPorch       0
PoolArea          0
MiscVal           0
MoSold            0
YrSold            0
dtype: int64

In [83]:
for col in num_df_2:
    test_df.fillna(test_df[col].median(), inplace =True)

test_df.select_dtypes(include = [np.number]).isnull().sum()

Id               0
MSSubClass       0
LotArea          0
OverallQual      0
OverallCond      0
YearBuilt        0
YearRemodAdd     0
MasVnrArea       0
BsmtFinSF1       0
BsmtFinSF2       0
BsmtUnfSF        0
TotalBsmtSF      0
1stFlrSF         0
2ndFlrSF         0
LowQualFinSF     0
GrLivArea        0
BsmtFullBath     0
BsmtHalfBath     0
FullBath         0
HalfBath         0
BedroomAbvGr     0
KitchenAbvGr     0
TotRmsAbvGrd     0
Fireplaces       0
GarageYrBlt      0
GarageCars       0
GarageArea       0
WoodDeckSF       0
OpenPorchSF      0
EnclosedPorch    0
3SsnPorch        0
ScreenPorch      0
PoolArea         0
MiscVal          0
MoSold           0
YrSold           0
dtype: int64

In [84]:
test_df.select_dtypes(exclude = [np.number]).isnull().sum()

MSZoning         0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
SaleType         0
SaleCondition    0
dtype: int64

In [85]:
for col in cat_df_2:
    test_df.fillna(test_df[col].value_counts().index[0], inplace = True)

test_df.select_dtypes(exclude = [np.number]).isnull().sum()

MSZoning         0
Street           0
LotShape         0
LandContour      0
Utilities        0
LotConfig        0
LandSlope        0
Neighborhood     0
Condition1       0
Condition2       0
BldgType         0
HouseStyle       0
RoofStyle        0
RoofMatl         0
Exterior1st      0
Exterior2nd      0
MasVnrType       0
ExterQual        0
ExterCond        0
Foundation       0
BsmtQual         0
BsmtCond         0
BsmtExposure     0
BsmtFinType1     0
BsmtFinType2     0
Heating          0
HeatingQC        0
CentralAir       0
Electrical       0
KitchenQual      0
Functional       0
GarageType       0
GarageFinish     0
GarageQual       0
GarageCond       0
PavedDrive       0
SaleType         0
SaleCondition    0
dtype: int64

In [86]:
test_df.isnull().sum()

Id               0
MSSubClass       0
MSZoning         0
LotArea          0
Street           0
                ..
MiscVal          0
MoSold           0
YrSold           0
SaleType         0
SaleCondition    0
Length: 74, dtype: int64

In [87]:
# Converting categorical data to numerical values
for feature in cat_df_2:
    le = LabelEncoder()
    test_df[feature] = le.fit_transform(test_df[feature].astype(str))

test_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1459 entries, 0 to 1458
Data columns (total 74 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1459 non-null   int64  
 1   MSSubClass     1459 non-null   int64  
 2   MSZoning       1459 non-null   int64  
 3   LotArea        1459 non-null   int64  
 4   Street         1459 non-null   int64  
 5   LotShape       1459 non-null   int64  
 6   LandContour    1459 non-null   int64  
 7   Utilities      1459 non-null   int64  
 8   LotConfig      1459 non-null   int64  
 9   LandSlope      1459 non-null   int64  
 10  Neighborhood   1459 non-null   int64  
 11  Condition1     1459 non-null   int64  
 12  Condition2     1459 non-null   int64  
 13  BldgType       1459 non-null   int64  
 14  HouseStyle     1459 non-null   int64  
 15  OverallQual    1459 non-null   int64  
 16  OverallCond    1459 non-null   int64  
 17  YearBuilt      1459 non-null   int64  
 18  YearRemo

**Predictive analysis**

In [88]:
# Splitting the dataset into X and y 
y = train_df["SalePrice"].values
train_df.drop(["SalePrice"], inplace = True, axis =1)
X = train_df.values

In [89]:
# Splitting into training and testing sets
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 42)

In [92]:
# Testing data on Support Vector regressor
from sklearn.svm import SVR
reg_svr = SVR(kernel= 'linear')
reg_svr.fit(X_train, y_train)
y_pred_svr = reg_svr.predict(X_test)

In [95]:
from sklearn.metrics import r2_score

In [96]:
print(r2_score(y_test, y_pred_svr))

0.7671288208005216
0.8519666781041533
0.7893882407208979


In [97]:
np.set_printoptions(precision= 3)
np.concatenate((y_test.reshape(-1,1), y_pred_svr.reshape(-1,1)),1)

array([[154500.   , 153001.22 ],
       [325000.   , 326957.388],
       [115000.   , 109559.785],
       [159000.   , 175961.724],
       [315500.   , 320868.953],
       [ 75500.   ,  64428.628],
       [311500.   , 235316.213],
       [146000.   , 143053.663],
       [ 84500.   ,  66701.125],
       [135500.   , 152099.204],
       [145000.   , 149938.125],
       [130000.   , 112411.134],
       [ 81000.   ,  81463.538],
       [214000.   , 207733.847],
       [181000.   , 169149.789],
       [134500.   , 138183.001],
       [183500.   , 209633.085],
       [135000.   , 118174.011],
       [118400.   , 112308.955],
       [226000.   , 229302.714],
       [155000.   , 140792.847],
       [210000.   , 210042.033],
       [173500.   , 187910.424],
       [129000.   , 125265.701],
       [192000.   , 214755.69 ],
       [153900.   , 159239.566],
       [181134.   , 209059.361],
       [141000.   ,  84880.305],
       [181000.   , 170823.198],
       [208900.   , 194889.131],
       [12

**Test data prediction**

In [98]:
X_train.shape

(1168, 73)

In [99]:
test_df.shape

(1459, 74)

In [100]:
submission_df = pd.DataFrame()
submission_id = test_df['Id']

In [101]:
test_df.drop(['Id'], axis = 1, inplace = True)

In [112]:
prediction = reg_svr.predict(test_df)

  f"X has feature names, but {self.__class__.__name__} was fitted without"


In [114]:
submission_pred = pd.DataFrame(prediction, columns =["SalePrice"])

In [115]:
submission_df = pd.concat([submission_id, submission_pred], axis = 1)

In [116]:
submission_df.tail(20)

Unnamed: 0,Id,SalePrice
1439,2900,160217.485259
1440,2901,222571.065929
1441,2902,209367.284878
1442,2903,278601.731031
1443,2904,283961.623332
1444,2905,118931.655238
1445,2906,210624.934626
1446,2907,111731.742056
1447,2908,130768.500949
1448,2909,199842.120643


In [117]:
submission_df.to_csv('submission_data.csv', index=False)