## Model 3 -- SVM

In [2]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [3]:
data = pd.read_csv('final_cleaned.csv')
data.head()

Unnamed: 0,bathrooms,bedrooms,price,square_feet,time,state_AK,state_AL,state_AR,state_AZ,state_CA,...,Gym,TV,Hot Tub,Tennis,Wood Floors,View,Alarm,Doorman,Luxury,Golf
0,1.0,0.0,790,0.0,2019-12-26 11:23:35,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,425,0.000125,2019-12-22 12:17:43,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,1390,0.00015,2019-12-26 11:23:30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,925,0.000376,2019-12-18 11:15:43,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,880,0.000602,2019-12-26 11:23:21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


Use model 2 to label record as "fair" or "unfair

In [6]:
! git clone "https://github.com/Daniel-Tran3/CSE_151A_Project.git"

fatal: destination path 'CSE_151A_Project' already exists and is not an empty directory.


In [4]:
from keras.models import Sequential
from keras.layers import Dense

from keras.models import load_model

ANNmodel = load_model('./CSE_151A_Project/Models')

# ANNmodel.summary()

X = data.drop(['price', 'time'], axis=1)
fairness_data = data.copy()
fairness_data['predicted_price'] = ANNmodel.predict(X)
fairness_data.head()

# if the actual price is more than 30% more than the predicted price, the value in the fairness column is 'unfair', otherwise it is 'fair'
fairness_data['fairness'] = np.where(fairness_data['price'] > 1.3*fairness_data['predicted_price'], 'unfair', 'fair')
fairness_data['fairness'].value_counts()

fairness_data.head()

# save fairness_data to a csv for future use
# fairness_data.to_csv('fairness_data.csv', index=False)



Unnamed: 0,bathrooms,bedrooms,price,square_feet,time,state_AK,state_AL,state_AR,state_AZ,state_CA,...,Hot Tub,Tennis,Wood Floors,View,Alarm,Doorman,Luxury,Golf,predicted_price,fairness
0,1.0,0.0,790,0.0,2019-12-26 11:23:35,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1767.783081,fair
1,1.0,1.0,425,0.000125,2019-12-22 12:17:43,0,0,0,0,0,...,0,0,0,0,0,0,0,0,609.471741,fair
2,1.0,0.0,1390,0.00015,2019-12-26 11:23:30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1288.924805,fair
3,1.0,0.0,925,0.000376,2019-12-18 11:15:43,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1575.438965,fair
4,1.0,0.0,880,0.000602,2019-12-26 11:23:21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1291.257935,fair


In [5]:
from sklearn.model_selection import train_test_split

# encode fiarness to 0 and 1
fairness_mapping = {'fair': 0, 'unfair': 1}
fairness_data['fairness_encoded'] = fairness_data['fairness'].map(fairness_mapping)

X = fairness_data.drop(columns=['price', 'time', 'predicted_price', 'fairness'])  # Features
y = fairness_data['fairness_encoded']  # Target variable

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# sns.pairplot(data, hue='price_class', palette='colorblind')

From the result of pairplot. Let's pick ____ as our features.

That's not an ideal approach since we have 1600+ features. Plotting pairplot takes huge time.

Let's use RFE to select features: https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html#:~:text=Recursive%20feature%20elimination%20(RFE)%20is,number%20of%20features%20is%20reached.

In [6]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

estimator = SVC(kernel="linear")

num_features_to_select = 100  # Adjust as needed

rfe = RFE(estimator, n_features_to_select=num_features_to_select)
rfe.fit(X_train, y_train)

selected_features = X_train.columns[rfe.support_]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

Let's train linear and RBF SVMs and evaluate them.

Leaner SVM

In [7]:
svm_linear_model = SVC(kernel='linear')
svm_linear_model.fit(X_train_selected, y_train)

In [8]:
# evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

yhat_linear = svm_linear_model.predict(X_test_selected)

accuracy = accuracy_score(y_test, yhat_linear)
print("Accuracy:", accuracy)

print("\nClassification Report:")
print(classification_report(y_test, yhat_linear))

conf_matrix = confusion_matrix(y_test, yhat_linear)
print("\nConfusion Matrix:")
print(conf_matrix)

Accuracy: 1.0

Classification Report:
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1787
           1       1.00      1.00      1.00       213

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Confusion Matrix:
[[1787    0]
 [   0  213]]


RBF SVM

In [9]:
svm_rbf_model = SVC(kernel='rbf')
svm_rbf_model.fit(X_train_selected, y_train)

In [10]:
# evaluation

yhat_rbf = svm_rbf_model.predict(X_test_selected)

accuracy_rbf = accuracy_score(y_test, yhat_rbf)
print("Accuracy (SVC with RBF kernel):", accuracy_rbf)

print("\nClassification Report (SVC with RBF kernel):")
print(classification_report(y_test, yhat_rbf))

conf_matrix_rbf = confusion_matrix(y_test, yhat_rbf)
print("\nConfusion Matrix (SVC with RBF kernel):")
print(conf_matrix_rbf)


Accuracy (SVC with RBF kernel): 1.0

Classification Report (SVC with RBF kernel):
              precision    recall  f1-score   support

           0       1.00      1.00      1.00      1787
           1       1.00      1.00      1.00       213

    accuracy                           1.00      2000
   macro avg       1.00      1.00      1.00      2000
weighted avg       1.00      1.00      1.00      2000


Confusion Matrix (SVC with RBF kernel):
[[1787    0]
 [   0  213]]
