In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.svm import SVC
from sklearn.decomposition import PCA

In [None]:
# Load the listings table from the local data directory.
listings_path = "data/listings.csv"
listings_X = pd.read_csv(listings_path)
# The calendar and neighbourhoods files are currently not loaded:
#calendar = pd.read_csv("data/calendar.csv")
#neighborhoods = pd.read_csv("data/neighbourhoods.csv")

In [None]:
# First look at the data: print the frame's summary (columns, dtypes, non-null counts)
listings_X.info()

In [None]:
# Columns removed from the table before any further processing
dropped_columns = ['id', 'name', 'host_name', 'host_id', 'last_review']
listings_X = listings_X.drop(columns=dropped_columns)

# Show the summary again to confirm which columns remain
listings_X.info()

In [None]:
# Count the distinct values of each categorical column before encoding it
n_neighbourhoods = listings_X["neighbourhood"].nunique()
n_neighbourhood_groups = listings_X["neighbourhood_group"].nunique()
n_room_types = listings_X["room_type"].nunique()
print("There are a total of %d neighbourhoods, %d neighbourhood groups and %d room types" % (n_neighbourhoods, n_neighbourhood_groups, n_room_types))

# One-hot encode the three categorical columns (one indicator column per distinct value)
categorical_columns = ['neighbourhood', 'neighbourhood_group', 'room_type']
listings_X = pd.get_dummies(listings_X, columns=categorical_columns)

In [None]:
print("The minimum value for the price of any house is %f, so that's the reason why we filter the 0 values" %min(listings_X.price))

# Keep only the listings whose price is non-zero
listings_X = listings_X[listings_X['price'] != 0]

# Split the dataset into target (Y) and features (X).
# DataFrame.drop returns a new frame instead of modifying in place, so the
# result has to be assigned back; otherwise "price" stays among the features
# and the target leaks into PCA and into every model trained afterwards.
listings_Y = listings_X['price']
listings_X = listings_X.drop(columns=["price"])
listings_X.head()

In [None]:
# Replace missing values in the reviews_per_month column with 0
listings_X["reviews_per_month"] = listings_X["reviews_per_month"].fillna(0)

In [None]:
# Price distribution over the full range of values (100 bins)
fig_full, ax_full = plt.subplots()
ax_full.hist(listings_Y, bins=100)
ax_full.set_title("Price histogram")
plt.show()

# Same data with finer bins (1000) and the x-axis restricted to 0-1000
fig_zoom, ax_zoom = plt.subplots()
ax_zoom.set_title("Price histogram (from 0 to 1000)")
ax_zoom.set_xlim(0, 1000)
ax_zoom.hist(listings_Y, bins=1000)
plt.show()

In [None]:
# Discretization of the "price" column into five ordered categories.
# Intervals are right-closed: (0, 50], (50, 200], (200, 500], (500, 1000], (1000, inf).
# pd.cut assigns NaN to any value that falls outside the outermost edges, so
# the top edge is open-ended (np.inf) rather than a fixed 9999: every price
# above 1000 is labelled 'Elite' and no row ends up without a category.
price_labels = ['Cheap', 'Moderate', 'Pricey', 'Expensive', 'Elite']
bins = [0, 50, 200, 500, 1000, np.inf]
listings_Y_cat = pd.cut(listings_Y, bins=bins, labels=price_labels)

# Number of listings per price category
listings_Y_cat.value_counts()

In [None]:
# PCA
print("Number of dimensions before PCA: %d"%listings_X.shape[1])

# With a float n_components and svd_solver='full', PCA keeps as many components
# as are needed to exceed that fraction of explained variance (here 99%).
# NOTE(review): the features are not standardized in this cell, so columns with
# large variance dominate the components -- confirm this is intended.
pca = PCA(n_components=0.99, svd_solver = 'full')
listings_X = pca.fit_transform(listings_X)
print("The number of PC that we get to explain a 99%% of variance is %d" %listings_X.shape[1])

# Derive the x positions from the fitted result instead of hard-coding three
# components: plt.bar needs x and height to have matching lengths, and the
# number of retained components depends on the data.
component_numbers = list(range(1, len(pca.explained_variance_ratio_) + 1))

plt.figure(figsize=(10,5))
plt.bar(x=component_numbers, height=pca.explained_variance_ratio_)
plt.ylabel("Explained variance ratio")
plt.xlabel("PC")
plt.show()

In [None]:
# Models, all created with their default hyperparameters:
# linear regression, logistic regression and a support vector classifier
lnr, lgr, svm = LinearRegression(), LogisticRegression(), SVC()

In [None]:
#Train/Test Split
# Fractions of the data held out as the test set, one experiment per value
splits = [0.1, 0.2, 0.5, 0.75]

# Fixed seed so the reported scores are reproducible between runs
split_seed = 42

for split in splits:
    # Both calls use the same seed and inputs of the same length, so they
    # produce the same row partition: the regressor and the classifiers are
    # trained and evaluated on the same listings.
    X_train, X_test, y_train, y_test = train_test_split(
        listings_X, listings_Y, test_size=split, random_state=split_seed)
    X_train_cat, X_test_cat, y_train_cat, y_test_cat = train_test_split(
        listings_X, listings_Y_cat, test_size=split, random_state=split_seed)

    # Regression on the raw price, classification on the price category
    lnr.fit(X_train, y_train)
    lgr.fit(X_train_cat, y_train_cat)
    svm.fit(X_train_cat, y_train_cat)

    # score() is R^2 for the regressor and mean accuracy for the classifiers;
    # the "K" printed below is the test-set fraction used for this iteration.
    print("LNR Coefficient of Det.: " + str(lnr.score(X_test, y_test)) + " for K = " + str(split))
    print("LGR Mean Accuracy: " + str(lgr.score(X_test_cat, y_test_cat)) + " for K = " + str(split))
    print("SVM Mean Accuracy: " + str(svm.score(X_test_cat, y_test_cat)) + " for K = " + str(split))

In [None]:
#Testing

In [None]:
#Evaluation/Graphs