# A Naive Bayes Classifier for Predicting Probability of Americans to Purchase a Home Based on Generation & Having Student Loans (2023)

##Anna Larracuente

In [1]:
pip install tabulate

Note: you may need to restart the kernel to use updated packages.


In [2]:
#Import Necessary Libraries & Tools

import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from tabulate import tabulate

In [3]:
#Load, Convert to Data Frame, and View First 5 Records of Student Loans Data Data Set

sl = pd.read_csv("STUDENTLOANSDS.csv")
sldf = pd.DataFrame(sl)
sldf.head()

Unnamed: 0,SUB_DATE,SEX,SEX_BIN,AGE,GEN,GEN_CLASS,MAR_STAT,MAR_STAT_CLASS,IND_ANN_INC ($),IND_ANN_INC_LOW,...,STUDENTLOANS?,STUDENTLOANS_BIN,TOT_UG_SLD ($),TOT_G_SLD ($),TOT_UGG_SLD_BCS ($),TOT_UGG_SLD_WCS ($),TOT_UGG_SLD_AVG ($),ANN_SLD_INC_DTIRAT,MON_SLD_INC_DTIRAT,ANN_SLD_INC_DTIRAT_PRCNT
0,3/26/23,Male,0,27 - 42,Millennial,5,Single,0,90001 - 110000,90000,...,Yes,1,10001 - 25000,25001 - 50000,35000,75000,55000,0.55,6.6,55.0
1,3/26/23,Female,1,27 - 42,Millennial,5,Married,1,50001 - 75000,50000,...,Yes,1,10001 - 25000,10001 - 25000,20000,50000,35000,0.56,6.72,56.0
2,3/26/23,Male,0,43 - 58,Gen X,4,Married,1,110001 - 130000,110000,...,"No, I did not take out student loans in my nam...",0,0,0,0,0,0,0.0,0.0,0.0
3,3/26/23,Female,1,43 - 58,Gen X,4,Married,1,30001 - 50000,30000,...,"No, I did not take out student loans in my nam...",0,0,0,0,0,0,0.0,0.0,0.0
4,3/26/23,Male,0,59 - 68,Boomers II,3,Divorced,3,130001 - 150000,130000,...,"No, I have paid off all of my student loans",0,0,0,0,0,0,0.0,0.0,0.0


In [4]:
#Pop off Submission Date, Sex, Age, Gender, Marital Status Columns (SUB_DATE)

sldf.pop("SUB_DATE")
sldf.pop("SEX")
sldf.pop("AGE")
sldf.pop("GEN")
sldf.pop("MAR_STAT")
sldf.pop("IND_ANN_INC ($)")
sldf.pop("HSD_GED?")
sldf.pop("BACH_DEG?")
sldf.pop("HIGHER_ED?")
sldf.pop("CURR_STUDENT?")
sldf.pop("OWN_CAR?")
sldf.pop("HOUSING")
sldf.pop("PETS?")
sldf.pop("VACATION?")
sldf.pop("TRAV_STATE?")
sldf.pop("TRAV_STATE_Q")
sldf.pop("TRAV_US?")
sldf.pop("TRAV_US_Q")
sldf.pop("MULTI_GEN?")
sldf.pop("CHILDREN?")
sldf.pop("MOV?")
sldf.pop("HOB?")
sldf.pop("AVG_MON_HOB")
sldf.pop("DINEOUT?")
sldf.pop("AVG_WEEKLY_DINEOUT")
sldf.pop("ORDERIN?")
sldf.pop("AVG_WEEKLY_ORDERIN")
sldf.pop("DONATIONS?")
sldf.pop("ANN_EVENTS_ATT_Q")
sldf.pop("ANN_EVENTS_HOST_Q")
sldf.pop("MON_ESS_EXP ($)")
sldf.pop("ANN_NONESS_EXP ($)")
sldf.pop("LUX_Q")
sldf.pop("STUDENTLOANS?")
sldf.pop("TOT_UG_SLD ($)")
sldf.pop("TOT_G_SLD ($)")

0      25001 - 50000
1      10001 - 25000
2                  0
3                  0
4                  0
           ...      
96                 0
97                 0
98                 0
99                 0
100    10001 - 25000
Name: TOT_G_SLD ($), Length: 101, dtype: object

In [5]:
#Print the Target Variable: HOUSING_BIN

print(sldf["HOUSING_BIN"])

0      0
1      0
2      0
3      0
4      1
      ..
96     0
97     1
98     1
99     0
100    0
Name: HOUSING_BIN, Length: 101, dtype: int64


In [6]:
#Split Student Loan Data Set into Training & Testing Sets: 80% Training and 20% Test

x_train, x_test, y_train, y_test = train_test_split(sldf, sldf["HOUSING_BIN"], test_size = 0.20, random_state = 26)

In [7]:
#Create a Gaussian Naive Bayes Classifier

NBC = GaussianNB()

In [8]:
#Train the Model using the Training Set

NBC.fit(x_train, y_train)

GaussianNB()

In [9]:
#Predict the Response for Test Data Set

y_pred = NBC.predict(x_test)

In [10]:
#Check Model Accuracy

print("Accuracy:", metrics.accuracy_score(y_test, y_pred))

Accuracy: 0.8571428571428571


In [11]:
#Create a Frequency Table of American Homeowners by Predicted Generation & Having Student Loan Debt

freqtab = pd.crosstab(index = sldf["HOUSING_BIN"], 
                      columns = [sldf["GEN_CLASS"],
                                 sldf["STUDENTLOANS_BIN"]],
                              margins = True)

freqtab

GEN_CLASS,1,2,3,3,4,4,5,5,6,All
STUDENTLOANS_BIN,0,0,0,1,0,1,0,1,1,Unnamed: 10_level_1
HOUSING_BIN,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2
0,0,1,4,0,5,1,9,19,3,42
1,1,1,12,2,13,3,18,8,1,59
All,1,2,16,2,18,4,27,27,4,101


In [12]:
#Hypothesis & Prior Probability of Homeownership by Having Student Loans

hypos = 'Own a Home', 'Do Not Own a Home'

prob = 0.40, 0.60

In [13]:
#Prior Probabilities Summary by Gender

prior = pd.Series(prob, hypos)

print(prior)

Own a Home           0.4
Do Not Own a Home    0.6
dtype: float64


In [14]:
#Likelihoods by Generation (Taken From Frequency Table Above, with Laplacian Correction)

likelihoodsPostWarWithLoans = 1/1, 1/1
likelihoodsPostWarNoLoans = 2/2, 1/2

likelihoodsBoomersIWithLoans = 1/3, 1/3
likelihoodsBoomersINoLoans = 3/3, 2/3

likelihoodsBoomersIIWithLoans = 3/5, 2/5
likelihoodsBoomersIINoLoans = 14/19, 6/19

likelihoodsGenXWithLoans = 9/28, 20/28
likelihoodsGenXNoLoans = 19/28, 10/28

likelihoodsMillennialsWithLoans = 2/17, 2/17
likelihoodsMillennialsNoLoans = 2/17, 2/17

likelihoodsGenZWithLoans = 2/5, 4/5
likelihoodsGenZNoLoans = 1/1, 1/1

likelihoodsPostWarOwnersWithLoans = 1/1
likelihoodsPostWarOwnersNoLoans = 2/2
likelihoodsPostWarNonOwnersWithLoans = 1/1
likelihoodsPostWarNonOwnersNoLoans = 1/2

likelihoodsBoomersIOwnersWithLoans = 1/3
likelihoodsBoomersIOwnersNoLoans = 3/3
likelihoodsBoomersINonOwnersWithLoans = 1/3
likelihoodsBoomersINonOwnersNoLoans = 2/3

likelihoodsBoomersIIOwnersWithLoans = 3/5
likelihoodsBoomersIIOwnersNoLoans = 14/19
likelihoodsBoomersIINonOwnersWithLoans = 2/5
likelihoodsBoomersIINonOwnersNoLoans = 6/19

likelihoodsGenXOwnersWithLoans = 9/28
likelihoodsGenXOwnersNoLoans = 19/28
likelihoodsGenXNonOwnersWithLoans = 20/28
likelihoodsGenXNonOwnersNoLoans = 10/28

likelihoodsMillennialsOwnersWithLoans = 2/17
likelihoodsMillennialsOwnersNoLoans = 2/17
likelihoodsMillennialsNonOwnersWithLoans = 2/17
likelihoodsMillennialsNonOwnersNoLoans = 2/17

likelihoodsGenZOwnersWithLoans = 2/5
likelihoodsGenZOwnersNoLoans = 1/1
likelihoodsGenZNonOwnersWithLoans = 4/5
likelihoodsGenZNonOwnersNoLoans = 1/1

In [15]:
#Likelihoods Table Summary By Generation and Ownership

data = [["PostWar", "Yes", "Owner", likelihoodsPostWarOwnersWithLoans], 
        ["PostWar", "No", "Owner", likelihoodsPostWarOwnersNoLoans],
        ["PostWar", "Yes", "Non-Owner", likelihoodsPostWarNonOwnersWithLoans],
        ["PostWar", "No", "Non-Owner", likelihoodsPostWarNonOwnersNoLoans],
        ["BoomersI", "Yes", "Owner", likelihoodsBoomersIOwnersWithLoans], 
        ["BoomersI", "No", "Owner", likelihoodsBoomersIOwnersNoLoans],
        ["BoomersI", "Yes", "Non-Owner", likelihoodsBoomersINonOwnersWithLoans],
        ["BoomersI", "No", "Non-Owner", likelihoodsBoomersINonOwnersNoLoans],
        ["BoomersII", "Yes", "Owner", likelihoodsBoomersIIOwnersWithLoans], 
        ["BoomersII", "No", "Owner", likelihoodsBoomersIIOwnersNoLoans],
        ["BoomersII", "Yes", "Non-Owner", likelihoodsBoomersIINonOwnersWithLoans],
        ["BoomersII", "No", "Non-Owner", likelihoodsBoomersIINonOwnersNoLoans],
        ["GenX", "Yes", "Owner", likelihoodsGenXOwnersWithLoans], 
        ["GenX", "No", "Owner", likelihoodsGenXOwnersNoLoans],
        ["GenX", "Yes", "Non-Owner", likelihoodsGenXNonOwnersWithLoans],
        ["GenX", "No", "Non-Owner", likelihoodsGenXNonOwnersNoLoans],
        ["Millennials", "Yes", "Owner", likelihoodsMillennialsOwnersWithLoans], 
        ["Millennials", "No", "Owner", likelihoodsMillennialsOwnersNoLoans],
        ["Millennials", "Yes", "Non-Owner", likelihoodsMillennialsNonOwnersWithLoans],
        ["Millennials", "No", "Non-Owner", likelihoodsMillennialsNonOwnersNoLoans],
        ["GenZ", "Yes", "Owner", likelihoodsGenZOwnersWithLoans], 
        ["GenZ", "No", "Owner", likelihoodsGenZOwnersNoLoans],
        ["GenZ", "Yes", "Non-Owner", likelihoodsGenZNonOwnersWithLoans],
        ["GenZ", "No", "Non-Owner", likelihoodsGenZNonOwnersNoLoans]]

col_names = ["Generation", "Student Loans?", "Home Ownership", "Likelihood"]

print(tabulate(data, headers = col_names, tablefmt = "fancy_grid"))

╒══════════════╤══════════════════╤══════════════════╤══════════════╕
│ Generation   │ Student Loans?   │ Home Ownership   │   Likelihood │
╞══════════════╪══════════════════╪══════════════════╪══════════════╡
│ PostWar      │ Yes              │ Owner            │     1        │
├──────────────┼──────────────────┼──────────────────┼──────────────┤
│ PostWar      │ No               │ Owner            │     1        │
├──────────────┼──────────────────┼──────────────────┼──────────────┤
│ PostWar      │ Yes              │ Non-Owner        │     1        │
├──────────────┼──────────────────┼──────────────────┼──────────────┤
│ PostWar      │ No               │ Non-Owner        │     0.5      │
├──────────────┼──────────────────┼──────────────────┼──────────────┤
│ BoomersI     │ Yes              │ Owner            │     0.333333 │
├──────────────┼──────────────────┼──────────────────┼──────────────┤
│ BoomersI     │ No               │ Owner            │     1        │
├──────────────┼────

In [16]:
#Unnormalized Posterior Probability Calculation for Post War Home Owners & Non-Owners With Student Loans

unnorm1 = prior * likelihoodsPostWarWithLoans

print(unnorm1)

Own a Home           0.4
Do Not Own a Home    0.6
dtype: float64


In [17]:
#Unnormalized Posterior Probability Calculation for Post War Home Owners & Non-Owners With No Student Loans

unnorm2 = prior * likelihoodsPostWarNoLoans

print(unnorm2)

Own a Home           0.4
Do Not Own a Home    0.3
dtype: float64


In [18]:
#Unnormalized Posterior Probability Calculation for Boomers I Home Owners & Non-Owners With Student Loans

unnorm3 = prior * likelihoodsBoomersIWithLoans

print(unnorm3)

Own a Home           0.133333
Do Not Own a Home    0.200000
dtype: float64


In [19]:
#Unnormalized Posterior Probability Calculation for Boomers I Home Owners & Non-Owners With No Student Loans

unnorm4 = prior * likelihoodsBoomersINoLoans

print(unnorm4)

Own a Home           0.4
Do Not Own a Home    0.4
dtype: float64


In [20]:
#Unnormalized Posterior Probability Calculation for Boomers II Home Owners & Non-Owners With Student Loans

unnorm5 = prior * likelihoodsBoomersIIWithLoans

print(unnorm5)

Own a Home           0.24
Do Not Own a Home    0.24
dtype: float64


In [21]:
#Unnormalized Posterior Probability Calculation for Boomers II Home Owners & Non-Owners With No Student Loans

unnorm6 = prior * likelihoodsBoomersIINoLoans

print(unnorm6)

Own a Home           0.294737
Do Not Own a Home    0.189474
dtype: float64


In [22]:
#Unnormalized Posterior Probability Calculation for Gen X Home Owners & Non-Owners With Student Loans

unnorm7 = prior * likelihoodsGenXWithLoans

print(unnorm7)

Own a Home           0.128571
Do Not Own a Home    0.428571
dtype: float64


In [23]:
#Unnormalized Posterior Probability Calculation for Gen X Home Owners & Non-Owners With No Student Loans

unnorm8 = prior * likelihoodsGenXNoLoans

print(unnorm8)

Own a Home           0.271429
Do Not Own a Home    0.214286
dtype: float64


In [24]:
#Unnormalized Posterior Probability Calculation for Millennials Home Owners & Non-Owners With Student Loans

unnorm9 = prior * likelihoodsMillennialsWithLoans

print(unnorm9)

Own a Home           0.047059
Do Not Own a Home    0.070588
dtype: float64


In [25]:
#Unnormalized Posterior Probability Calculation for Millennials Home Owners & Non-Owners With No Student Loans

unnorm10 = prior * likelihoodsMillennialsNoLoans

print(unnorm10)

Own a Home           0.047059
Do Not Own a Home    0.070588
dtype: float64


In [26]:
#Unnormalized Posterior Probability Calculation for Gen Z Home Owners & Non-Owners With Student Loans

unnorm11 = prior * likelihoodsGenZWithLoans

print(unnorm11)

Own a Home           0.16
Do Not Own a Home    0.48
dtype: float64


In [27]:
#Unnormalized Posterior Probability Calculation for Gen Z Home Owners & Non-Owners With No Student Loans

unnorm12 = prior * likelihoodsGenZNoLoans

print(unnorm12)

Own a Home           0.4
Do Not Own a Home    0.6
dtype: float64


In [28]:
#Summation of Unnormalized Posterior Probability Calculation for Post War Home Owners & Non-Owners With Student Loans

prob_data1 = unnorm1.sum()

print(prob_data1)

1.0


In [29]:
#Summation of Unnormalized Posterior Probability Calculation for Post War Home Owners & Non-Owners With No Student Loans

prob_data2 = unnorm2.sum()

print(prob_data2)

0.7


In [30]:
#Summation of Unnormalized Posterior Probability Calculation for Boomers I Home Owners & Non-Owners With Student Loans

prob_data3 = unnorm3.sum()

print(prob_data3)

0.3333333333333333


In [31]:
#Summation of Unnormalized Posterior Probability Calculation for Boomers I Home Owners & Non-Owners With No Student Loans

prob_data4 = unnorm4.sum()

print(prob_data4)

0.8


In [32]:
#Summation of Unnormalized Posterior Probability Calculation for Boomers II Home Owners & Non-Owners With Student Loans

prob_data5 = unnorm5.sum()

print(prob_data5)

0.48


In [33]:
#Summation of Unnormalized Posterior Probability Calculation for Boomers II Home Owners & Non-Owners With No Student Loans

prob_data6 = unnorm6.sum()

print(prob_data6)

0.4842105263157894


In [34]:
#Summation of Unnormalized Posterior Probability Calculation for Gen X Home Owners & Non-Owners With Student Loans

prob_data7 = unnorm7.sum()

print(prob_data7)

0.5571428571428572


In [35]:
#Summation of Unnormalized Posterior Probability Calculation for Gen X Home Owners & Non-Owners With No Student Loans

prob_data8 = unnorm8.sum()

print(prob_data8)

0.48571428571428577


In [36]:
#Summation of Unnormalized Posterior Probability Calculation for Millennials Home Owners & Non-Owners With Student Loans

prob_data9 = unnorm9.sum()

print(prob_data9)

0.11764705882352941


In [37]:
#Summation of Unnormalized Posterior Probability Calculation for Millennials Home Owners & Non-Owners With Student Loans

prob_data10 = unnorm10.sum()

print(prob_data10)

0.11764705882352941


In [38]:
#Summation of Unnormalized Posterior Probability Calculation for Gen Z Home Owners & Non-Owners With Student Loans

prob_data11 = unnorm11.sum()

print(prob_data11)

0.64


In [39]:
#Summation of Unnormalized Posterior Probability Calculation for Gen Z Home Owners & Non-Owners With No Student Loans

prob_data12 = unnorm12.sum()

print(prob_data12)

1.0


In [40]:
#Normalized Posterior Probability Percentage of Post War Home Owners & Non-Owners With Student Loans

posterior1 = unnorm1 / prob_data1*100

print(posterior1)

Own a Home           40.0
Do Not Own a Home    60.0
dtype: float64


In [41]:
#Normalized Posterior Probability Percentage of Post War Home Owners & Non-Owners With No Student Loans

posterior2 = unnorm2 / prob_data2*100

print(posterior2)

Own a Home           57.142857
Do Not Own a Home    42.857143
dtype: float64


In [42]:
#Normalized Posterior Probability Percentage of Boomers I Home Owners & Non-Owners With Student Loans

posterior3 = unnorm3 / prob_data3*100

print(posterior3)

Own a Home           40.0
Do Not Own a Home    60.0
dtype: float64


In [43]:
#Normalized Posterior Probability Percentage of Boomers I Home Owners & Non-Owners With No Student Loans

posterior4 = unnorm4 / prob_data4*100

print(posterior4)

Own a Home           50.0
Do Not Own a Home    50.0
dtype: float64


In [44]:
#Normalized Posterior Probability Percentage of Boomers II Home Owners & Non-Owners With Student Loans

posterior5 = unnorm5 / prob_data5*100

print(posterior5)

Own a Home           50.0
Do Not Own a Home    50.0
dtype: float64


In [45]:
#Normalized Posterior Probability Percentage of Boomers II Home Owners & Non-Owners With No Student Loans

posterior6 = unnorm6 / prob_data6*100

print(posterior6)

Own a Home           60.869565
Do Not Own a Home    39.130435
dtype: float64


In [46]:
#Normalized Posterior Probability Percentage of Gen X Home Owners & Non-Owners With Student Loans

posterior7 = unnorm7 / prob_data7*100

print(posterior7)

Own a Home           23.076923
Do Not Own a Home    76.923077
dtype: float64


In [47]:
#Normalized Posterior Probability Percentage of Gen X Home Owners & Non-Owners With No Student Loans

posterior8 = unnorm8 / prob_data8*100

print(posterior8)

Own a Home           55.882353
Do Not Own a Home    44.117647
dtype: float64


In [48]:
#Normalized Posterior Probability Percentage of Millennials Home Owners & Non-Owners With Student Loans

posterior9 = unnorm9 / prob_data9*100

print(posterior9)

Own a Home           40.0
Do Not Own a Home    60.0
dtype: float64


In [49]:
#Normalized Posterior Probability Percentage of Millennials Home Owners & Non-Owners With No Student Loans

posterior10 = unnorm10 / prob_data10*100

print(posterior10)

Own a Home           40.0
Do Not Own a Home    60.0
dtype: float64


In [50]:
#Normalized Posterior Probability Percentage of Gen Z Home Owners & Non-Owners With Student Loans

posterior11 = unnorm11 / prob_data11*100

print(posterior11)

Own a Home           25.0
Do Not Own a Home    75.0
dtype: float64


In [51]:
#Normalized Posterior Probability Percentage of Gen Z Home Owners & Non-Owners With No Student Loans

posterior12 = unnorm12 / prob_data12*100

print(posterior12)

Own a Home           40.0
Do Not Own a Home    60.0
dtype: float64


In [52]:
#Posterior Probability of Home Ownership: Table Summary By Age and Gender

data = [["Post War", "Have Student Loans", posterior1],
        ["Post War", "Do Not Have Student Loans", posterior2],
        ["Boomers I", "Have Student Loans", posterior3],
        ["Boomers I", "Do Not Have Student Loans", posterior4],
        ["Boomers II", "Have Student Loans", posterior5],
        ["Boomers II", "Do Not Have Student Loans", posterior6],
        ["Gen X", "Have Student Loans", posterior7],
        ["Gen X", "Do Not Have Student Loans", posterior8],
        ["Millennials", "Have Student Loans", posterior9],
        ["Millennials", "Do Not Have Student Loans", posterior10],
        ["Gen Z", "Have Student Loans", posterior11],
        ["Gen Z", "Do Not Have Student Loans", posterior12]]

col_names = ["Generation", "Student Loans Status", "Posterior Probability Percentages"]

print(tabulate(data, headers = col_names, tablefmt = "fancy_grid"))

╒══════════════╤═══════════════════════════╤═════════════════════════════════════╕
│ Generation   │ Student Loans Status      │ Posterior Probability Percentages   │
╞══════════════╪═══════════════════════════╪═════════════════════════════════════╡
│ Post War     │ Have Student Loans        │ Own a Home           40.0           │
│              │                           │ Do Not Own a Home    60.0           │
│              │                           │ dtype: float64                      │
├──────────────┼───────────────────────────┼─────────────────────────────────────┤
│ Post War     │ Do Not Have Student Loans │ Own a Home           57.142857      │
│              │                           │ Do Not Own a Home    42.857143      │
│              │                           │ dtype: float64                      │
├──────────────┼───────────────────────────┼─────────────────────────────────────┤
│ Boomers I    │ Have Student Loans        │ Own a Home           40.0           │
│   