In [3]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score, mean_absolute_error, mean_absolute_percentage_error, mean_squared_error, confusion_matrix, accuracy_score
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.svm import SVR
import warnings
warnings.filterwarnings ('ignore')

In [18]:
#Importing the data for Household Income
Hi_data= pd.read_csv('Household Income data.csv')

In [19]:
#get a data overview
Hi_data.head()

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,56,Master's,Technology,5,Urban,21,Married,Full-time,7,Own,Apartment,Male,Public transit,72510
1,69,High School,Finance,0,Urban,4,Single,Full-time,7,Own,Apartment,Male,Biking,75462
2,46,Bachelor's,Technology,1,Urban,1,Single,Full-time,7,Own,Single-family home,Female,Car,71748
3,32,High School,Others,2,Urban,32,Married,Full-time,1,Own,Apartment,Female,Car,74520
4,60,Bachelor's,Finance,3,Urban,15,Married,Self-employed,4,Own,Townhouse,Male,Walking,640210


In [20]:
Hi_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 14 columns):
 #   Column                          Non-Null Count  Dtype 
---  ------                          --------------  ----- 
 0   Age                             10000 non-null  int64 
 1   Education_Level                 10000 non-null  object
 2   Occupation                      10000 non-null  object
 3   Number_of_Dependents            10000 non-null  int64 
 4   Location                        10000 non-null  object
 5   Work_Experience                 10000 non-null  int64 
 6   Marital_Status                  10000 non-null  object
 7   Employment_Status               10000 non-null  object
 8   Household_Size                  10000 non-null  int64 
 9   Homeownership_Status            10000 non-null  object
 10  Type_of_Housing                 10000 non-null  object
 11  Gender                          10000 non-null  object
 12  Primary_Mode_of_Transportation  10000 non-null 

Encoding of dataset

In [21]:
cat_col = [i for i in Hi_data.columns if Hi_data[i].dtype == "object"]

print(cat_col)

['Education_Level', 'Occupation', 'Location', 'Marital_Status', 'Employment_Status', 'Homeownership_Status', 'Type_of_Housing', 'Gender', 'Primary_Mode_of_Transportation']


In [22]:
le= LabelEncoder()

In [23]:
for col in cat_col:
    Hi_data[col] = le.fit_transform(Hi_data[col])

In [24]:
#view data
Hi_data.head()

Unnamed: 0,Age,Education_Level,Occupation,Number_of_Dependents,Location,Work_Experience,Marital_Status,Employment_Status,Household_Size,Homeownership_Status,Type_of_Housing,Gender,Primary_Mode_of_Transportation,Income
0,56,3,4,5,2,21,1,0,7,0,0,1,2,72510
1,69,2,1,0,2,4,2,0,7,0,0,1,0,75462
2,46,0,4,1,2,1,2,0,7,0,1,0,1,71748
3,32,2,3,2,2,32,1,0,1,0,0,0,1,74520
4,60,0,1,3,2,15,1,2,4,0,2,1,3,640210


In [25]:
#splitting the dataset
X= Hi_data
y= Hi_data['Income']

In [26]:
from sklearn.model_selection import train_test_split


In [27]:
#split dataset
X_train, X_test, y_train, y_test= train_test_split(X, y, test_size= 0.2, random_state=42)

Using the Linear regression model


In [28]:
lr_model= LinearRegression()
lr_model.fit(X_train, y_train)

#make predictions
y_predict= lr_model.predict(X_test)
print(y_predict)

#Evaluate model
# Evaluate
mse = mean_squared_error(y_test, y_predict)
print(f"Mean Squared Error: {mse}")

r2= r2_score(y_test, y_predict)
print(f"The r2 score is: {r2}")

mae= mean_absolute_error(y_test, y_predict)
print(f"The mean absolute error is: {mae}")

[  74742. 5989527.   69356. ...  147809.   70212.  127291.]
Mean Squared Error: 1.4945406001635008e-18
The r2 score is: 1.0
The mean absolute error is: 7.767266652081162e-10


KNN model

In [29]:
#initialize KNN
knn_model = KNeighborsRegressor()
knn_model.fit(X_train, y_train)

#predict
knn_predict = knn_model.predict(X_test)

#evaluation metrics
knn_r2 = r2_score(knn_predict, y_test)
knn_mse = mean_squared_error(knn_predict, y_test)
knn_mae = mean_absolute_error(knn_predict, y_test)

print(f"The r2 Score is: {knn_r2}")
print(f" The mean square error is: {knn_mse}")
print(f" The mean absolute error is: {knn_mae}")

The r2 Score is: 0.9999974001609426
 The mean square error is: 8199619.476319985
 The mean absolute error is: 823.885


SVR

In [30]:
#initialize svr
svr_model = SVR()
svr_model.fit(X_train, y_train)

#predict
svr_predict = svr_model.predict(X_test)

#evaluation metrics
svr_r2 = r2_score(svr_predict, y_test)
svr_mse = mean_squared_error(svr_predict, y_test)
svr_mape= mean_absolute_percentage_error(svr_predict, y_test)
svr_mae= mean_absolute_error(svr_predict, y_test)

print(f"r2 Score is: {svr_r2}")
print(f"mean squared error is: {svr_mse}")
print(f" The mean absolute error is:{svr_mae}")
print(f"The mean absolute percentage error is: {svr_mape}")

r2 Score is: -11335196.395874262
mean squared error is: 3712526989918.478
 The mean absolute error is:753981.9836669465
The mean absolute percentage error is: 10.148680386244036


random forest regressor


In [31]:
rf_model= RandomForestRegressor()
rf_model.fit(X_train, y_train)

#make predictions
rf_predict= rf_model.predict(X_test)

#evaluate model
rf_r2 = r2_score(rf_predict, y_test)
rf_mse = mean_squared_error(rf_predict, y_test)
rf_mape= mean_absolute_percentage_error(rf_predict, y_test)
rf_mae= mean_absolute_error(rf_predict, y_test)

print(f"r2 Score is: {rf_r2}")
print(f"mean squared error is: {rf_mse}")
print(f" The mean absolute error is:{rf_mae}")
print(f"The mean absolute percentage error is: {rf_mape}")

r2 Score is: 0.9999958242106842
mean squared error is: 13170560.40065325
 The mean absolute error is:1008.4418849999951
The mean absolute percentage error is: 0.0005286101161561097


From the models used for the regression dataset, the most appropriate is the Linear Regression model with an r2 score of 1. 