### Toronto Airbnb: Applying classification models, logistic regression, SVC, and random forest

In [1]:
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns
import pandas as pd

mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

pd.set_option("display.max_columns", 100)
pd.set_option('display.max_colwidth', 100)
pd.set_option("display.notebook_repr_html", True)

#### load dataset


In [2]:
df = pd.read_csv("cleaned_airbnb_dataset.csv")

#### look at the dataset -- head, shape, info, check for missing values, and basic statistics

In [3]:
df.head()

Unnamed: 0,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,availability_365
0,Waterfront Communities-The Island,43.64105,-79.37628,Private room,99,180,169,0
1,Annex,43.66724,-79.41598,Private room,66,1,0,0
2,Briar Hill-Belgravia,43.69602,-79.45468,Private room,72,1,211,262
3,Waterfront Communities-The Island,43.6453,-79.3894,Entire home/apt,199,4,38,323
4,Greenwood-Coxwell,43.6689,-79.32592,Entire home/apt,54,120,26,0


In [4]:
df = df[['neighbourhood', 'latitude', 'longitude', 'price', 'minimum_nights', 'number_of_reviews', 'availability_365', 'room_type' ]]

In [5]:
df.head(5)

Unnamed: 0,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365,room_type
0,Waterfront Communities-The Island,43.64105,-79.37628,99,180,169,0,Private room
1,Annex,43.66724,-79.41598,66,1,0,0,Private room
2,Briar Hill-Belgravia,43.69602,-79.45468,72,1,211,262,Private room
3,Waterfront Communities-The Island,43.6453,-79.3894,199,4,38,323,Entire home/apt
4,Greenwood-Coxwell,43.6689,-79.32592,54,120,26,0,Entire home/apt


In [6]:
print("Column names: \n", df.columns)
print("\nNumber of rows are: {} and number of columns are: {}".format(df.shape[0], df.shape[1]))
print("\nFindout missing values: \n")
print(df.info())

Column names: 
 Index(['neighbourhood', 'latitude', 'longitude', 'price', 'minimum_nights',
       'number_of_reviews', 'availability_365', 'room_type'],
      dtype='object')

Number of rows are: 20289 and number of columns are: 8

Findout missing values: 

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20289 entries, 0 to 20288
Data columns (total 8 columns):
neighbourhood        20289 non-null object
latitude             20289 non-null float64
longitude            20289 non-null float64
price                20289 non-null int64
minimum_nights       20289 non-null int64
number_of_reviews    20289 non-null int64
availability_365     20289 non-null int64
room_type            20289 non-null object
dtypes: float64(2), int64(4), object(2)
memory usage: 1.2+ MB
None


In [7]:
df.groupby("room_type").count()

Unnamed: 0_level_0,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365
room_type,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Entire home/apt,12729,12729,12729,12729,12729,12729,12729
Hotel room,173,173,173,173,173,173,173
Private room,7040,7040,7040,7040,7040,7040,7040
Shared room,347,347,347,347,347,347,347


In [8]:
categorical_features = ['neighbourhood', 'room_type']
numerical_features = ["latitude", "longitude", "price", "minimum_nights", "number_of_reviews", "availability_365"]

In [9]:
from sklearn import model_selection
from sklearn import preprocessing

In [10]:
df_ml = df.copy()

In [11]:
df.head(5)

Unnamed: 0,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365,room_type
0,Waterfront Communities-The Island,43.64105,-79.37628,99,180,169,0,Private room
1,Annex,43.66724,-79.41598,66,1,0,0,Private room
2,Briar Hill-Belgravia,43.69602,-79.45468,72,1,211,262,Private room
3,Waterfront Communities-The Island,43.6453,-79.3894,199,4,38,323,Entire home/apt
4,Greenwood-Coxwell,43.6689,-79.32592,54,120,26,0,Entire home/apt


In [12]:
lbl_encd = preprocessing.LabelEncoder()
df_ml[categorical_features] = df[categorical_features].apply(lambda col: lbl_encd.fit_transform(col))

In [13]:
df_ml.head(n=5)

Unnamed: 0,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365,room_type
0,122,43.64105,-79.37628,99,180,169,0,2
1,3,43.66724,-79.41598,66,1,0,0,2
2,15,43.69602,-79.45468,72,1,211,262,2
3,122,43.6453,-79.3894,199,4,38,323,0
4,46,43.6689,-79.32592,54,120,26,0,0


In [14]:
# df[100:150]

In [15]:
# df_ml[100:150]

In [16]:
minmaxscaler = preprocessing.MinMaxScaler()

In [17]:
df_ml[numerical_features] = minmaxscaler.fit_transform(df[numerical_features])

  return self.partial_fit(X, y)


In [18]:
df_ml.head()

Unnamed: 0,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365,room_type
0,122,0.21621,0.498699,0.331104,0.159253,0.225634,0.0,2
1,3,0.320415,0.418602,0.220736,0.0,0.0,0.0,2
2,15,0.434926,0.340523,0.240803,0.0,0.281709,0.717808,2
3,122,0.23312,0.472228,0.665552,0.002669,0.050734,0.884932,0
4,46,0.32702,0.600303,0.180602,0.105872,0.034713,0.0,0


In [19]:
X = df_ml[["neighbourhood", "latitude", "longitude", "price", "minimum_nights", "number_of_reviews", "availability_365" ]]

In [20]:
y = df_ml[["room_type"]]

In [21]:
# X_array = np.asanyarray(X)
# y_array = np.asanyarray(y)

In [22]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=0.2, random_state=0)

In [23]:
X_train.shape, y_train.shape, X_test.shape, y_test.shape

((16231, 7), (16231, 1), (4058, 7), (4058, 1))

In [30]:
print("Training dataset: {} %".format(round(X_train.shape[0]/df_ml.shape[0] *100, 2)))
print("Test dataset    : {} %".format(round(X_test.shape[0]/df_ml.shape[0] * 100, 2)))

Training dataset: 80.0 %
Test dataset    : 20.0 %


In [24]:
X_train.head(3)

Unnamed: 0,neighbourhood,latitude,longitude,price,minimum_nights,number_of_reviews,availability_365
13389,119,0.230733,0.416463,0.230769,0.0,0.05474,0.2
1785,88,0.329646,0.557894,0.414716,0.001779,0.0,0.0
15731,5,0.682529,0.355775,0.364548,0.00089,0.00534,0.909589


In [25]:
type(X_train)

pandas.core.frame.DataFrame

In [26]:
y_train.head(3)

Unnamed: 0,room_type
13389,2
1785,0
15731,2


In [27]:
X_train.to_csv("X_train_df.csv", index=False)
y_train.to_csv("y_train_df.csv", index=False)

X_test.to_csv("X_test_df.csv", index=False)
y_test.to_csv("y_test_df.csv", index=False)