In [1]:
import pandas as pd
import numpy as np
import sklearn.naive_bayes as NB
import sklearn.model_selection as cv
import sklearn.metrics as m
from sklearn import preprocessing

# Naive Bayes

## Read the data

As usual, before analyzing the data we read the CSV file and store its contents in a DataFrame.

In [2]:
# Load the preprocessed training set; cells containing the literal "NA" are parsed as missing.
data = pd.read_csv(
    '../datasets/preprocessed/train.csv',
    sep=',',
    na_values="NA",
)

In [3]:
# Preview the first five rows to check that the file was parsed as expected.
data.head(n=5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotArea,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,...,Fence,MiscVal,SaleType,SaleCondition,SalePrice,MasVnr,SecondFloor,Baths,Porch,Pool
0,0.0,F,RL,0.249485,1.0,Lvl,Inside,CollgCr,Norm,1Fam,...,0.0,0.0,WD,Normal,Level3,1.0,1.0,0.6,True,0.0
1,0.000685,A,RL,0.289612,1.0,Lvl,FR,Veenker,Feedr,1Fam,...,0.0,0.0,WD,Normal,Level2,0.0,0.0,0.4,True,0.0
2,0.001371,F,RL,0.347186,0.0,Lvl,Inside,CollgCr,Norm,1Fam,...,0.0,0.0,WD,Normal,Level3,1.0,1.0,0.6,True,0.0
3,0.002056,G,RL,0.287868,0.0,Lvl,Corner,Crawfor,Norm,1Fam,...,0.0,0.0,WD,Abnorml,Level2,0.0,1.0,0.2,True,0.0
4,0.002742,F,RL,0.452214,0.0,Lvl,FR,NoRidge,Norm,1Fam,...,0.0,0.0,WD,Normal,Level3,1.0,1.0,0.6,True,0.0


## Prepare the data

First, we separate the data into two variables: num_data, which contains only the numerical columns, and cat_data, which contains only the categorical ones. We also exclude the "SalePrice" column from cat_data, since that is the value we want to predict.

In [4]:
# Split the columns of `data` by dtype.
# num_data: every numeric column.
# NOTE(review): the preview of `data` shows a numeric `Id` column, so it is
# part of num_data and will be used as a feature — confirm this is intended.
num_data = data.select_dtypes(include=np.number)

# cat_data: boolean and string columns, without the prediction target.
cat_data = data.select_dtypes(include=['bool', 'object']).drop(columns='SalePrice')

## Train a model with numeric columns

First, we train a model using the numerical values in num_data. We're using Gaussian Naive Bayes which, as we can see, gives us a very small score of only 0.259.

In [5]:
# Features: the numeric columns; target: the SalePrice class label.
X = num_data
Y = data['SalePrice']

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X, Y, test_size=0.3, random_state=1
)

# fit() returns the estimator itself, so construction and training can be chained.
gnb = NB.GaussianNB().fit(X_train, y_train)

# Mean accuracy on the held-out rows.
gnb.score(X_test, y_test)

0.25925925925925924

## Train a model with categorical columns

Afterwards, we train a model with the categorical values in cat_data. Since we're using Multinomial Naive Bayes, which requires numerical tags instead of strings, we need to preprocess the categories in order to assign an integer ID to each different one before doing the training. As we can see, the final score is 0.625, which is much better than the previous score of 0.259 but still has plenty of room for improvement.

In [6]:
# Target: the SalePrice class label.
Y = data.loc[:, 'SalePrice']

# Encode every categorical column as integer codes (0, 1, 2, ... in order of
# first appearance). A new frame is built instead of assigning into `cat_data`
# through an alias, so `cat_data` keeps its original values and the cell can be
# re-run safely.
X = pd.DataFrame(
    {col: pd.factorize(cat_data[col])[0] for col in cat_data.columns},
    index=cat_data.index,
)

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = cv.train_test_split(X, Y, test_size=.3, random_state=1)

# NOTE(review): MultinomialNB treats feature values as counts, while these
# codes are arbitrary category ids; sklearn's CategoricalNB is designed for
# this kind of input — consider comparing the two.
mnb = NB.MultinomialNB()
mnb.fit(X_train, y_train)

# Mean accuracy on the held-out rows.
mnb.score(X_test, y_test)

0.625

## Cross validation of the best model

Now we do cross validation with the categorical values, applying the same preprocessing we did for the previous model and Multinomial Naive Bayes again, but now calculating the cross_val_score. As we can see, the result is very similar: 0.63. We also build the confusion matrix and compute the accuracy, which again has almost the same value, and then we finish with the classification report.

In [9]:
# 10-fold stratified splitter. The folds are not shuffled, so no random_state
# is passed: it would have no effect, and recent scikit-learn versions raise a
# ValueError when random_state is set while shuffle=False.
kfold = cv.StratifiedKFold(n_splits=10)

# Target: the SalePrice class label.
Y = data.loc[:, 'SalePrice']

# Integer-encode the categorical columns into a new frame (codes follow order
# of first appearance); `cat_data` itself is left unmodified.
X = pd.DataFrame(
    {col: pd.factorize(cat_data[col])[0] for col in cat_data.columns},
    index=cat_data.index,
)

mnb = NB.MultinomialNB()

# One accuracy value per fold; the cell displays their mean.
cvs = cv.cross_val_score(mnb, X=X, y=Y, cv=kfold)
np.mean(cvs)



0.6307886557886558

In [10]:
# Out-of-fold prediction for every row: each row is predicted by the model
# trained on the folds that do not contain it.
pred = cv.cross_val_predict(mnb, X, Y, cv=kfold)

# Confusion matrix (rows: true class, columns: predicted class), then overall accuracy.
print(m.confusion_matrix(Y, pred), m.accuracy_score(Y, pred), sep='\n')

[[ 53  57   3   0   1]
 [ 75 678 131  10  10]
 [  2 131 137  26  15]
 [  0  14  29  34   7]
 [  0   1   4  15   5]]
0.6307371349095967


In [11]:
# Per-class precision, recall and F1 for the cross-validated predictions.
print(m.classification_report(y_true=Y, y_pred=pred))

              precision    recall  f1-score   support

      Level1       0.41      0.46      0.43       114
      Level2       0.77      0.75      0.76       904
      Level3       0.45      0.44      0.45       311
      Level4       0.40      0.40      0.40        84
      Level5       0.13      0.20      0.16        25

    accuracy                           0.63      1438
   macro avg       0.43      0.45      0.44      1438
weighted avg       0.64      0.63      0.63      1438



## Balancing the dataset

Since none of the scores so far is high enough, we decide to balance the data so that the classes no longer have very different numbers of instances: Level2 had 904 cases while Level5 only had 25. After reducing this difference, the overall scores and accuracy are lower than before, but the f1-scores of the different levels are more consistent, with smaller differences between them.

In [12]:
# Number of rows per SalePrice level, most frequent first.
Y.value_counts(sort=True, ascending=False)

Level2    904
Level3    311
Level1    114
Level4     84
Level5     25
Name: SalePrice, dtype: int64

In [13]:
print(data['SalePrice'].unique())

# One sub-frame per SalePrice level.
X1 = data[data['SalePrice'] == 'Level1']
X2 = data[data['SalePrice'] == 'Level2']
X3 = data[data['SalePrice'] == 'Level3']
X4 = data[data['SalePrice'] == 'Level4']
X5 = data[data['SalePrice'] == 'Level5']

# Build the balanced frame with a single concat (DataFrame.append was removed
# in pandas 2.0 and copied the whole frame on every call):
#   - minority levels are oversampled by plain replication,
#   - the majority level (Level2) is undersampled to a third; the seed makes
#     the drawn subset reproducible between runs.
# NOTE(review): replicated rows can land in different CV folds later on, so
# the same row may appear in both the training and the evaluation part of a
# fold — keep this in mind when reading the scores of the balanced model.
bdata = pd.concat(
    [X1] * 3
    + [X2.sample(frac=1/3, random_state=1)]
    + [X3]
    + [X4] * 4
    + [X5] * 10,
    ignore_index=True,
)

bdata['SalePrice'].value_counts()

['Level3' 'Level2' 'Level4' 'Level1' 'Level5']


Level1    342
Level4    336
Level3    311
Level2    301
Level5    250
Name: SalePrice, dtype: int64

In [14]:
# Target: the SalePrice class label of the balanced frame.
Y = bdata['SalePrice']

# Categorical (bool / string) feature columns, without the target.
X = bdata.select_dtypes(include=['bool', 'object']).drop(columns=['SalePrice'])

# Replace every column by its integer codes (order of first appearance).
# A new frame is built rather than assigning the codes in place with
# `.loc[:, col] = ...`, which writes ints into object/bool columns and is a
# dtype-incompatible assignment in recent pandas versions.
X = pd.DataFrame(
    {col: pd.factorize(X[col])[0] for col in X.columns},
    index=X.index,
)

# 10-fold stratified splitter (no shuffling).
kfold = cv.StratifiedKFold(n_splits=10)

mnb = NB.MultinomialNB()

# One accuracy value per fold; the cell displays their mean.
cvs = cv.cross_val_score(mnb, X=X, y=Y, cv=kfold)
np.mean(cvs)

0.5292207792207793

In [15]:
# Out-of-fold predictions for the balanced data set.
pred = cv.cross_val_predict(estimator=mnb, X=X, y=Y, cv=kfold)

# Print the confusion matrix first, then the overall accuracy.
for metric in (m.confusion_matrix, m.accuracy_score):
    print(metric(Y, pred))

[[271  55   9   7   0]
 [ 62 119  88  14  18]
 [  8  53 174  36  40]
 [  9  29 121  81  96]
 [ 10   0  30  40 170]]
0.5292207792207793


In [16]:
# Per-class precision, recall and F1 on the balanced data set.
print(m.classification_report(y_true=Y, y_pred=pred))

              precision    recall  f1-score   support

      Level1       0.75      0.79      0.77       342
      Level2       0.46      0.40      0.43       301
      Level3       0.41      0.56      0.47       311
      Level4       0.46      0.24      0.32       336
      Level5       0.52      0.68      0.59       250

    accuracy                           0.53      1540
   macro avg       0.52      0.53      0.52      1540
weighted avg       0.53      0.53      0.52      1540



## Naive Bayes using PCA 

Using PCA we obtain some slightly better results. We have a final score of 0.678 with Gaussian Naive Bayes and 0.709 for the cross validation. This is an improvement, but the numbers are still a little bit lower than desired.

In [33]:
# Load the PCA-transformed version of the training set; literal "NA" cells are parsed as missing.
data_pca = pd.read_csv(
    '../datasets/preprocessed/trainpca.csv',
    sep=',',
    na_values="NA",
)

In [34]:
# Features: the PCA components; target: SalePrice from the original training frame.
# NOTE(review): X and Y come from two different files and are paired by row
# position — presumably both hold the same rows in the same order; verify.
# NOTE(review): the preview shows an 'Unnamed: 0' header; if that is a real
# column (a saved index) it is being used as a feature — TODO confirm.
X, Y = data_pca, data['SalePrice']
X.head(n=5)

Unnamed: 0,0,1,2,3,4,5
0,0.55188,0.483035,-0.707528,-0.032989,0.043738,-0.504799
1,-0.355187,-0.326004,-0.066491,-0.397804,-0.234813,-0.542305
2,1.017713,0.519211,0.170533,0.108878,0.203064,-0.473465
3,0.015638,0.68282,0.683731,-0.07619,0.155393,-0.487857
4,1.106544,0.579149,0.160572,0.117646,0.134389,-0.469887


In [35]:
# 70/30 train/test split of the PCA features with a fixed seed.
X_train, X_test, y_train, y_test = cv.train_test_split(
    X,
    Y,
    test_size=0.3,
    random_state=1,
)

# Train Gaussian Naive Bayes (fit() returns the estimator) and show its
# mean accuracy on the held-out rows.
gnb = NB.GaussianNB().fit(X_train, y_train)
gnb.score(X_test, y_test)

0.6782407407407407

In [36]:
# Cross-validated accuracy of Gaussian NB on the PCA features.
# Reuses the `kfold` splitter defined in an earlier cell.
cvs = cv.cross_val_score(gnb, X, Y, cv=kfold)
cvs.mean()

0.7093045843045843

In [37]:
# Out-of-fold predictions of Gaussian NB on the PCA features.
pred = cv.cross_val_predict(gnb, X=X, y=Y, cv=kfold)

# Confusion matrix (rows: true class, columns: predicted class) and overall accuracy.
print(m.confusion_matrix(Y, pred))
print(m.accuracy_score(Y, pred))

# When a class is never predicted (Level5 in the recorded run), its precision
# is 0/0. zero_division=0 reports it as 0.00 explicitly instead of emitting an
# UndefinedMetricWarning; the reported numbers are unchanged.
print(m.classification_report(Y, pred, zero_division=0))

[[ 34  80   0   0   0]
 [ 26 784  94   0   0]
 [  0 121 178  12   0]
 [  0   9  51  24   0]
 [  0   1  13  11   0]]
0.7093184979137691
              precision    recall  f1-score   support

      Level1       0.57      0.30      0.39       114
      Level2       0.79      0.87      0.83       904
      Level3       0.53      0.57      0.55       311
      Level4       0.51      0.29      0.37        84
      Level5       0.00      0.00      0.00        25

    accuracy                           0.71      1438
   macro avg       0.48      0.40      0.43      1438
weighted avg       0.68      0.71      0.69      1438



  _warn_prf(average, modifier, msg_start, len(result))
