# Data Analysis Final Project

## 1. Imports and Data Loading

In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA
from sklearn.metrics import plot_confusion_matrix, confusion_matrix, classification_report, ConfusionMatrixDisplay
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Load the listings table; the calendar and neighbourhoods loads are left disabled
listings = pd.read_csv(filepath_or_buffer="data/listings.csv")
#calendar = pd.read_csv("data/calendar.csv")
#neighborhoods = pd.read_csv("data/neighbourhoods.csv")

## 2. Exploratory Data Analysis

In [None]:
# First look into the data: summary of columns, dtypes and non-null counts
listings.info()

In [None]:
# Discard the columns that are excluded from the rest of the analysis
unused_columns = ['id', 'name', 'host_name', 'host_id', 'last_review']
listings = listings.drop(columns=unused_columns)
listings.info()

In [None]:
# Report how many distinct values each categorical column holds before encoding it
n_neighbourhoods = listings["neighbourhood"].nunique()
n_neighbourhood_groups = listings["neighbourhood_group"].nunique()
n_room_types = listings["room_type"].nunique()
print("There are a total of %d neighbourhoods, %d neighbourhood groups and %d room types" % (n_neighbourhoods, n_neighbourhood_groups, n_room_types))

# Replace the three categorical columns with one-hot indicator columns
categorical_columns = ['neighbourhood', 'neighbourhood_group', 'room_type']
listings = pd.get_dummies(listings, columns=categorical_columns)

In [None]:
# Histogram of the listing prices before any filtering
plt.hist(listings["price"], bins=100)
plt.title("Price histogram")
plt.show()

In [None]:
# Upper price bound kept in the dataset. Rows with price 0 are dropped too,
# since the reported minimum shows that such rows exist.
MAX_PRICE = 100

print("The minimum value for the price of any house is %f, so that's the reason why we filter the 0 values" % listings.price.min())
# The message is built from MAX_PRICE so it always matches the filter below
# (it previously announced 400 while the filter used 100).
print("We are also filtering houses with a price higher than %d" % MAX_PRICE)

# Filter by price, then split the dataset into features (X) and target (Y)
listings = listings[(listings['price'] > 0) & (listings['price'] <= MAX_PRICE)]
listings_Y = listings.price
listings_X = listings.drop(columns=["price"])
listings_X.head()

In [None]:
# Histogram of the target after the price filter has been applied
plt.hist(x=listings_Y, bins=70)
plt.title("Filtered Price histogram")
plt.show()

In [None]:
# Missing reviews_per_month values are replaced with 0
listings_X["reviews_per_month"] = listings_X["reviews_per_month"].fillna(0)

In [None]:
# Option 1: discretize "price" into four classes using hand-picked, unequal-width bins
bins = [0, 20, 50, 75, 101]
labels = ['Cheap', 'Moderate', 'Expensive', 'Elite']
listings_Y_cat = pd.cut(listings_Y, bins=bins, labels=labels)

listings_Y_cat.value_counts()

In [None]:
# Option 2: discretize "price" into four equal-frequency (quartile) classes.
# Prices are ranked with method='first' so ties get distinct ranks and the
# quartile edges computed by qcut are unique.
# `bins` receives those edges (on the rank scale) through retbins=True, so no
# hand-written bin list is needed here.
labels = ['Cheap', 'Moderate', 'Expensive', 'Elite']
listings_Y_cat, bins = pd.qcut(listings_Y.rank(method='first'), 4, labels=labels, retbins=True)
listings_Y_cat.value_counts()

In [None]:
# PCA (optional to apply): listings_X is replaced by its principal-component scores
n_features_before = listings_X.shape[1]
print("Number of dimensions before PCA: %d"%n_features_before)

# A fractional n_components keeps as many components as needed to reach that variance share
pca = PCA(n_components=0.99, svd_solver='full')
listings_X = pca.fit_transform(listings_X)
n_components_kept = listings_X.shape[1]
print("The number of PC that we get to explain a 99%% of variance is %d" %n_components_kept)

# Bar chart of the variance ratio explained by each retained component
plt.figure(figsize=(10,5))
plt.bar(x=np.arange(1, n_components_kept + 1), height=pca.explained_variance_ratio_)
plt.xlabel("PC")
plt.ylabel("Explained variance ratio")
plt.show()

## 3. Model Training

In [None]:
# Estimators compared in this notebook.
# The regressor is fitted on the raw price, the classifiers on the price category.
lnr = LinearRegression()
lgr = LogisticRegression()
svm = SVC()
knn = KNeighborsClassifier(n_neighbors=11)
rf = RandomForestClassifier()

In [None]:
# Train/test split sweep: refit every model for several test-set fractions and
# print its score on the held-out part.
splits = [0.05, 0.1, 0.2, 0.5, 0.75]

for split in splits:
    # Regression target (raw price) and classification target (price category)
    # are split independently, as in the final split used later.
    X_train, X_test, y_train, y_test = train_test_split(listings_X, listings_Y, test_size=split)
    X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(listings_X, listings_Y_cat, test_size=split)

    # The value reported next to each score is the test-set fraction, so it is
    # labelled "test size" (it was previously labelled "K").
    lnr.fit(X_train, y_train)
    print("LNR Coefficient of Det.: " + str(lnr.score(X_test, y_test)) + " for test size = " + str(split))

    for tag, clf in (("LGR", lgr), ("SVM", svm), ("KNN", knn), ("RF", rf)):
        clf.fit(X_train_cat, y_train_cat)
        print(tag + " Mean Accuracy: " + str(clf.score(X_test_cat, y_test_cat)) + " for test size = " + str(split))

In [None]:
# Final train/test split: 10% of the rows are held out for evaluation
split = 0.1
X_train, X_test, y_train, y_test = train_test_split(
    listings_X, listings_Y, test_size=split
)
X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
    listings_X, listings_Y_cat, test_size=split
)

In [None]:
# Option 1: Training Models using default hyperparameters
# Regressor on the raw price
lnr.fit(X_train, y_train)

# Classifiers on the price category
for classifier in (lgr, svm, knn):
    classifier.fit(X_train_cat, y_train_cat)
# Kept as the last expression so the notebook still displays the fitted forest
rf.fit(X_train_cat, y_train_cat)

In [None]:
# Option 2: Training Models using GridSearchCV

# Logistic Regression grid search over the solver and the regularization strength C.
# A fresh LogisticRegression() is searched instead of `lgr`: `lgr` is rebound to
# the fitted search below, so passing `lgr` would nest a GridSearchCV inside a
# GridSearchCV (and fail on the unknown 'solver'/'C' parameters) when this cell
# is run a second time.
parameters = {'solver': ("newton-cg", "lbfgs", "liblinear"), "C": (.001, .01, .1, 1)}
lgr_grid = GridSearchCV(LogisticRegression(), parameters)
lgr_grid.fit(X_train_cat, y_train_cat)
# From here on `lgr` predicts with the best estimator found by the search
lgr = lgr_grid
lgr_grid.best_estimator_

In [None]:
# Support Vector Machine grid search over the regularization strength C.
# A fresh SVC() is searched instead of `svm`: `svm` is rebound to the fitted
# search below, so passing `svm` would nest a GridSearchCV inside a GridSearchCV
# (and fail on the unknown 'C' parameter) when this cell is run a second time.
parameters = {"C": (.01, .1, 1)}
svm_grid = GridSearchCV(SVC(), parameters)
svm_grid.fit(X_train_cat, y_train_cat)
# From here on `svm` predicts with the best estimator found by the search
svm = svm_grid
svm_grid.best_estimator_

In [None]:
# Random Forest grid search over bootstrap sampling and the number of trees.
# The key must be spelled "bootstrap": GridSearchCV rejects names that are not
# parameters of the estimator.
parameters = {"bootstrap": (True, False), "n_estimators": (50, 100, 500)}
rf_grid = GridSearchCV(RandomForestClassifier(), parameters)
rf_grid.fit(X_train_cat, y_train_cat)
# From here on `rf` predicts with the best estimator found by the search
rf = rf_grid
rf_grid.best_estimator_

In [None]:
# KNN grid search over the number of neighbours
parameters = {"n_neighbors": (5, 7, 9, 11)}
knn_grid = GridSearchCV(estimator=KNeighborsClassifier(), param_grid=parameters)
knn_grid.fit(X_train_cat, y_train_cat)
# From here on `knn` predicts with the best estimator found by the search
knn = knn_grid
knn_grid.best_estimator_

In [None]:
# Hyperparameter tuning for KNN: cross-validated accuracy for every odd k in 3..51.
# The sweep uses its own estimator, so the shared `knn` model that is evaluated
# later is not modified, and the result does not depend on whether `knn` is a
# plain classifier or a fitted GridSearchCV. Scores are cross-validation means
# on the training data rather than accuracy on the samples the model was fitted
# on, which cannot be used to compare values of k.
k_values = list(range(3, 52, 2))
knn_sweep = GridSearchCV(KNeighborsClassifier(), {"n_neighbors": k_values}, refit=False)
knn_sweep.fit(X_train_cat, y_train_cat)

# Map each candidate k to its mean validation score, then order by k_values
score_by_k = {
    candidate["n_neighbors"]: mean_score
    for candidate, mean_score in zip(
        knn_sweep.cv_results_["params"], knn_sweep.cv_results_["mean_test_score"]
    )
}
scores = [score_by_k[k] for k in k_values]

plt.plot(k_values, scores)
plt.title("Tuning 'n_neighbors'")
plt.xlabel("n_neighbors")
plt.ylabel("Score")
plt.show()

## 4. Model Test & Evaluation

In [None]:
# Linear Regression test and evaluation
lnr_pred = lnr.predict(X_test)
residuals = y_test - lnr_pred
errors = np.abs(residuals)
coeff_of_det = lnr.score(X_test, y_test)
print("The coefficient of determination of the trained linear regression is %f"%coeff_of_det)
# Sum the squared errors first and take the root afterwards; taking the root of
# each squared error before summing would give the sum of absolute errors.
print("The square root of the square sum of the errors is %f"%np.sqrt((residuals ** 2).sum()))
print("The max error is %f, the min error is %f, the average error is %f and the median error is %f\n"%(np.max(errors), np.min(errors), np.mean(errors), np.median(errors)))

# Classify the continuous predictions of the linear regression into price bands,
# so that it can be evaluated with the same kind of report as the classifiers.
price_edges = [0, 20, 40, 60, 80, 101]
price_codes = [0, 1, 2, 3, 4]
price_names = ['0-20', '20-40', '40-60', '60-80', '80-100']
# A linear model can predict values outside the band range; clip them into the
# outer bands (and include the lowest edge) so pd.cut never produces NaN.
lnr_pred_clipped = np.clip(lnr_pred, price_edges[0], price_edges[-1])
lnr_predict_cat = pd.cut(lnr_pred_clipped, bins=price_edges, labels=price_codes, include_lowest=True)
lnr_true_cat = pd.cut(y_test, bins=price_edges, labels=price_codes)
# Explicit labels keep the five target names aligned even if a band is absent from the test split
print(classification_report(lnr_true_cat.to_numpy(), lnr_predict_cat.to_numpy(), labels=price_codes, target_names=price_names))

In [None]:
# Linear Regression confusion matrix over the five price bands
band_codes = list(range(5))
band_names = ['0-20', '20-40', '40-60', '60-80','80-100']

fig, ax = plt.subplots(figsize=(10, 10))
cm = confusion_matrix(lnr_true_cat, lnr_predict_cat, labels=band_codes)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=band_names)
disp.plot(ax=ax)
plt.xticks(rotation=-45)
plt.show()

In [None]:
# Logistic Regression classification report
lgr_predict_cat = lgr.predict(X_test_cat)
# Pass the class order explicitly: without `labels`, the string classes are sorted
# alphabetically while target_names is applied positionally, which swaps the
# 'Moderate' and 'Elite' rows of the report.
print(classification_report(y_test_cat, lgr_predict_cat, labels=labels, target_names=labels))

In [None]:
# Logistic Regression confusion matrix
# Built from confusion_matrix + ConfusionMatrixDisplay because the
# plot_confusion_matrix helper is deprecated and removed in scikit-learn 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
lgr_cm = confusion_matrix(y_test_cat, lgr.predict(X_test_cat), labels=labels)
lgr_disp = ConfusionMatrixDisplay(confusion_matrix=lgr_cm, display_labels=labels)
lgr_disp.plot(ax=ax)
plt.xticks(rotation = -45)
plt.show()

In [None]:
# SVM classification report
svm_predict_cat = svm.predict(X_test_cat)
# Pass the class order explicitly: without `labels`, the string classes are sorted
# alphabetically while target_names is applied positionally, which swaps the
# 'Moderate' and 'Elite' rows of the report.
print(classification_report(y_test_cat, svm_predict_cat, labels=labels, target_names=labels))

In [None]:
# SVM prediction confusion matrix
# Built from confusion_matrix + ConfusionMatrixDisplay because the
# plot_confusion_matrix helper is deprecated and removed in scikit-learn 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
svm_cm = confusion_matrix(y_test_cat, svm.predict(X_test_cat), labels=labels)
svm_disp = ConfusionMatrixDisplay(confusion_matrix=svm_cm, display_labels=labels)
svm_disp.plot(ax=ax)
plt.xticks(rotation = -45)
plt.show()

In [None]:
# KNN classification report
knn_predict_cat = knn.predict(X_test_cat)
# Pass the class order explicitly: without `labels`, the string classes are sorted
# alphabetically while target_names is applied positionally, which swaps the
# 'Moderate' and 'Elite' rows of the report.
print(classification_report(y_test_cat, knn_predict_cat, labels=labels, target_names=labels))

In [None]:
# KNN confusion matrix
# Built from confusion_matrix + ConfusionMatrixDisplay because the
# plot_confusion_matrix helper is deprecated and removed in scikit-learn 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
knn_cm = confusion_matrix(y_test_cat, knn.predict(X_test_cat), labels=labels)
knn_disp = ConfusionMatrixDisplay(confusion_matrix=knn_cm, display_labels=labels)
knn_disp.plot(ax=ax)
plt.xticks(rotation = -45)
plt.show()

In [None]:
# Random Forest classification report
rf_predict_cat = rf.predict(X_test_cat)
# Pass the class order explicitly: without `labels`, the string classes are sorted
# alphabetically while target_names is applied positionally, which swaps the
# 'Moderate' and 'Elite' rows of the report.
print(classification_report(y_test_cat, rf_predict_cat, labels=labels, target_names=labels))

In [None]:
# Random Forest confusion matrix
# Built from confusion_matrix + ConfusionMatrixDisplay because the
# plot_confusion_matrix helper is deprecated and removed in scikit-learn 1.2.
fig, ax = plt.subplots(figsize=(10, 10))
rf_cm = confusion_matrix(y_test_cat, rf.predict(X_test_cat), labels=labels)
rf_disp = ConfusionMatrixDisplay(confusion_matrix=rf_cm, display_labels=labels)
rf_disp.plot(ax=ax)
plt.xticks(rotation = -45)
plt.show()