## Model 3 -- SVM

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from datetime import datetime

from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder

In [None]:
# Read the cleaned dataset from disk and preview its first five rows.
data = pd.read_csv(filepath_or_buffer='final_cleaned.csv')
data.head(n=5)

Unnamed: 0,bathrooms,bedrooms,price,square_feet,time,state_AK,state_AL,state_AR,state_AZ,state_CA,...,Gym,TV,Hot Tub,Tennis,Wood Floors,View,Alarm,Doorman,Luxury,Golf
0,1.0,0.0,790,0.0,2019-12-26 11:23:35,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1.0,1.0,425,0.000125,2019-12-22 12:17:43,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1.0,0.0,1390,0.00015,2019-12-26 11:23:30,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,1.0,0.0,925,0.000376,2019-12-18 11:15:43,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1.0,0.0,880,0.000602,2019-12-26 11:23:21,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) of the target column.
data.loc[:, 'price'].describe()

count     9998.000000
mean      1486.262753
std       1076.615140
min        200.000000
25%        949.000000
50%       1270.000000
75%       1695.000000
max      52500.000000
Name: price, dtype: float64

To train an SVM classifier, let's divide our target 'price' into 4 classes by quartile:
- Class 0: min - 25%
- Class 1: 25% - 50%
- Class 2: 50% - 75%
- Class 3: 75% - max

In [None]:
# Cut points at the 25th, 50th and 75th percentile of price.
percentiles = np.percentile(data['price'], q=[25, 50, 75])

# Bucket every price into class 0..3. With right=True a price equal to a
# cut point falls into the lower class (intervals are closed on the right).
data['price_class'] = np.digitize(data['price'], percentiles, right=True)

# Show the price -> class mapping and the set of labels that were produced.
print(data.loc[:, ['price', 'price_class']])
print(pd.unique(data['price_class']))

      price  price_class
0       790            0
1       425            0
2      1390            2
3       925            0
4       880            0
...     ...          ...
9993   6000            3
9994  25000            3
9995  11000            3
9996   4790            3
9997   1009            1

[9998 rows x 2 columns]
[0 2 3 1]


In [None]:
from sklearn.model_selection import train_test_split

# Columns that must not be fed to the model as predictors:
#   'price'       - the raw value the target is derived from
#   'price_class' - the target itself
#   'time'        - timestamp string, not a numeric feature
non_feature_cols = ['price', 'price_class', 'time']

# 'Unnamed: 0' holds 0, 1, 2, ... in the preview, i.e. it looks like a row
# index saved into the CSV rather than a property of a listing. Keeping it
# would hand the SVM a row id on a 0..N scale next to features that appear
# to be scaled to [0, 1]. Guarded so the cell still runs if the CSV is later
# loaded without that column.
if 'Unnamed: 0' in data.columns:
    non_feature_cols.append('Unnamed: 0')

X = data.drop(columns=non_feature_cols)  # Features
y = data['price_class']  # Target variable

# Fixed seed keeps the 80/20 split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [None]:
# Left disabled: drawing a pairplot over every column takes far too long here.
# sns.pairplot(data, hue='price_class', palette='colorblind')

The initial plan was to choose our features by inspecting the pairplot above.

That is not practical here: we have 1600+ features, so plotting a pairplot takes a huge amount of time.

Instead, let's use recursive feature elimination (RFE) to select features: https://www.scikit-yb.org/en/latest/api/model_selection/rfecv.html

In [None]:
from sklearn.feature_selection import RFE
from sklearn.svm import SVC

# RFE ranks features by the fitted estimator's coefficients, which SVC only
# exposes (as coef_) when the kernel is linear.
estimator = SVC(kernel="linear")

num_features_to_select = 100  # Adjust as needed

# RFE's default step=1 drops a single feature per round and refits the SVM
# every time; with 1600+ candidate features that is ~1500 SVM fits. A
# fractional step removes that share of the features per round instead, so
# the elimination finishes in roughly twenty fits.
rfe_step = 0.05  # Adjust as needed; smaller = finer but slower

rfe = RFE(
    estimator,
    n_features_to_select=num_features_to_select,
    step=rfe_step,
)
rfe.fit(X_train, y_train)

# rfe.support_ is a boolean mask over the training columns; keep the same
# columns in both splits so train and test stay aligned.
selected_features = X_train.columns[rfe.support_]
X_train_selected = X_train[selected_features]
X_test_selected = X_test[selected_features]

Let's train linear and RBF SVMs and evaluate them.

Linear SVM

In [None]:
# Fit a linear-kernel SVC on the RFE-selected training columns
# (fit() returns the estimator itself, so the chained form binds the fitted model).
svm_linear_model = SVC(kernel='linear').fit(X_train_selected, y_train)
svm_linear_model

In [None]:
# Score the linear-kernel model on the held-out split.
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

# Predict once, then derive every metric from the same predictions.
yhat_linear = svm_linear_model.predict(X_test_selected)
accuracy = accuracy_score(y_true=y_test, y_pred=yhat_linear)
conf_matrix = confusion_matrix(y_true=y_test, y_pred=yhat_linear)

# Report: overall accuracy, per-class precision/recall/F1, confusion matrix.
print("Accuracy:", accuracy)
print("\nClassification Report:")
print(classification_report(y_true=y_test, y_pred=yhat_linear))
print("\nConfusion Matrix:")
print(conf_matrix)

RBF SVM

In [None]:
# Fit an RBF-kernel SVC on the same RFE-selected training columns
# (fit() returns the estimator itself, so the chained form binds the fitted model).
svm_rbf_model = SVC(kernel='rbf').fit(X_train_selected, y_train)
svm_rbf_model

In [None]:
# Score the RBF-kernel model on the held-out split.

# Predict once, then derive every metric from the same predictions.
yhat_rbf = svm_rbf_model.predict(X_test_selected)
accuracy_rbf = accuracy_score(y_true=y_test, y_pred=yhat_rbf)
conf_matrix_rbf = confusion_matrix(y_true=y_test, y_pred=yhat_rbf)

# Report: overall accuracy, per-class precision/recall/F1, confusion matrix.
print("Accuracy (SVC with RBF kernel):", accuracy_rbf)
print("\nClassification Report (SVC with RBF kernel):")
print(classification_report(y_true=y_test, y_pred=yhat_rbf))
print("\nConfusion Matrix (SVC with RBF kernel):")
print(conf_matrix_rbf)
