In [164]:
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt

## Data Preparation

In [165]:
df = pd.read_csv('AB_NYC_2019.csv')

df = df[['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']]

df.columns = df.columns.str.lower().str.replace(' ', '_')
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns.remove('price')

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
    
df.reviews_per_month = df.reviews_per_month.fillna(0)

## Question 1
What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [166]:
df.groupby(['neighbourhood_group']).size().sort_values(ascending=False)

neighbourhood_group
manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
dtype: int64

## Setting up the validation framework

In [186]:
from sklearn.model_selection import train_test_split

df.price = (df.price >= 152).astype(int)

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

del df_train['price']
del df_val['price']
del df_test['price']

## Question 2

- Create the correlation matrix for the numerical features of your train dataset. In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
- What are the two features that have the biggest correlation in this dataset?

In [187]:
from IPython.display import display

correlation_matrix = df_train[numerical_columns].corr().abs()
display(correlation_matrix)

for i in range(len(numerical_columns)):
    column_name = numerical_columns[i]
    
    df_group = df_train[numerical_columns].corrwith(df_train[column_name]).abs().sort_values(ascending=False)[1:2]
    
    print(column_name)
    display(df_group)
    print()
    print()

so = correlation_matrix.unstack().sort_values(ascending=False)
print(so)

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,0.006246,0.007159,0.019375,0.005891
longitude,0.080301,1.0,0.06066,0.055084,0.134642,0.117041,0.083666
minimum_nights,0.027441,0.06066,1.0,0.07602,0.120703,0.118647,0.138901
number_of_reviews,0.006246,0.055084,0.07602,1.0,0.590374,0.073167,0.174477
reviews_per_month,0.007159,0.134642,0.120703,0.590374,1.0,0.048767,0.165376
calculated_host_listings_count,0.019375,0.117041,0.118647,0.073167,0.048767,1.0,0.225913
availability_365,0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


latitude


longitude    0.080301
dtype: float64



longitude


reviews_per_month    0.134642
dtype: float64



minimum_nights


availability_365    0.138901
dtype: float64



number_of_reviews


reviews_per_month    0.590374
dtype: float64



reviews_per_month


number_of_reviews    0.590374
dtype: float64



calculated_host_listings_count


availability_365    0.225913
dtype: float64



availability_365


calculated_host_listings_count    0.225913
dtype: float64



latitude                        latitude                          1.000000
longitude                       longitude                         1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
reviews_per_month               reviews_per_month                 1.000000
minimum_nights                  minimum_nights                    1.000000
number_of_reviews               number_of_reviews                 1.000000
availability_365                availability_365                  1.000000
number_of_reviews               reviews_per_month                 0.590374
reviews_per_month               number_of_reviews                 0.590374
availability_365                calculated_host_listings_count    0.225913
calculated_host_listings_count  availability_365                  0.225913
availability_365                number_of_reviews                 0.174477
number_of_reviews               availability_365                  0.174477
availability_365       

## Question 3


We need to turn the price variable from numeric into binary.

Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

- Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
- Which of these two variables has bigger score?
- Round it to 2 decimal digits using round(score, 2)

In [188]:
from sklearn.metrics import mutual_info_score

def mutual_info_price_score(series):
    return mutual_info_score(series, df_full_train.price)

mi = df_full_train[categorical_columns].apply(mutual_info_price_score)
mi.sort_values(ascending=False)
score = mi.values.round(2)
score

array([0.05, 0.14])

## Question 4
- Now let's train a logistic regression
- Remember that we have two categorical variables in the data. Include them using one-hot encoding.
- Fit the model on the training dataset.
  - To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters: model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
- Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.

In [189]:
from sklearn.feature_extraction import DictVectorizer

# better score with all numerical_columns but not very significant value, so for faster 
# model fit we can just use 2 number_of_reviews and reviews_per_month.
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dict)
X_val = dv.fit_transform(val_dict)

In [190]:
from sklearn.linear_model import LogisticRegression

model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
model.fit(X_train, y_train)

y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)

print("Model performance:")
(price_decision == y_val).mean().round(3)

Model performance:


0.791

## Question 5
- We have 9 features: 7 numerical features and 2 categorical.
- Let's find the least useful one using the feature elimination technique.
- Train a model with all these features (using the same parameters as in Q4).
- Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
- For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
- Which of following feature has the smallest difference?
    - neighbourhood_group
    - room_type
    - number_of_reviews
    - reviews_per_month

> note: the difference doesn't have to be positive

In [195]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

def one_hot_enc(columns):
    train_dict = df_train[columns].to_dict(orient='records')
    val_dict = df_val[columns].to_dict(orient='records')

    dv = DictVectorizer(sparse=False)

    X_train = dv.fit_transform(train_dict)
    X_val = dv.fit_transform(val_dict)
    
    return X_train, X_val

def train_and_fit(columns):
    X_train, X_val = one_hot_enc(columns)

    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=10000)
    model.fit(X_train, y_train)
    y_pred = model.predict_proba(X_val)[:, 1]
    price_decision = (y_pred >= 0.5)

    return (price_decision == y_val).mean()
    
all_columns = categorical_columns + numerical_columns
original_accurancy = train_and_fit(all_columns)
differences = pd.DataFrame()

for i in range(len(all_columns)):
    columns = list(np.delete(all_columns, i))
    res = train_and_fit(columns)
    differences[str(all_columns[i])] = [original_accurancy - res]
    
differences.T.sort_values(by=0, ascending = False)[0:1]    

Unnamed: 0,0
room_type,0.06197


## Question 6

- For this question, we'll see how to use a linear regression model from Scikit-Learn
- We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
- Fit the Ridge regression model on the training data.
- This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
- Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
- If there are multiple options, select the smallest alpha.

In [204]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import Ridge

# Read data
df = pd.read_csv('AB_NYC_2019.csv')

# Leave cols we need
df = df[['neighbourhood_group',
'room_type',
'latitude',
'longitude',
'price',
'minimum_nights',
'number_of_reviews',
'reviews_per_month',
'calculated_host_listings_count',
'availability_365']]

# Prepare columns
df.columns = df.columns.str.lower().str.replace(' ', '_')

categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)
numerical_columns = list(df.dtypes[df.dtypes != 'object'].index)
numerical_columns.remove('price')

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')
    
df.reviews_per_month = df.reviews_per_month.fillna(0)

# Prepare price
df.price = np.log1p(df.price)

# Split dataset
df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

del df_train['price']
del df_val['price']
del df_test['price']

# One Hot Enconding
train_dict = df_train[categorical_columns + numerical_columns].to_dict(orient='records')
val_dict = df_val[categorical_columns + numerical_columns].to_dict(orient='records')

dv = DictVectorizer(sparse=False)

X_train = dv.fit_transform(train_dict)
X_val = dv.fit_transform(val_dict)

# Ridge regression

def eval_score(alpha):
    model = Ridge(alpha=alpha)
    model.fit(X_train, y_train)
    return model.score(X_val, y_val)

alphas = [0, 0.01, 0.1, 1, 10]

for i in alphas:
    print(i, eval_score(i).round(3))

0 0.488
0.01 0.488
0.1 0.488
1 0.488
10 0.486
