In [1]:
import pandas as pd
import numpy as np

In [2]:
df = pd.read_csv('data_airbnb.csv')

In [3]:
df = df[['neighbourhood_group', 'room_type', 'latitude', 'longitude', 'price', 'minimum_nights', 
         'number_of_reviews', 'reviews_per_month', 'calculated_host_listings_count', 'availability_365']]

In [4]:
df.fillna(0, inplace=True)

### Question 1

In [5]:
df.neighbourhood_group.describe()

count         48895
unique            5
top       Manhattan
freq          21661
Name: neighbourhood_group, dtype: object

In [6]:
df.groupby('neighbourhood_group').neighbourhood_group.agg(['count']).sort_values(by=['count'], ascending=False)

Unnamed: 0_level_0,count
neighbourhood_group,Unnamed: 1_level_1
Manhattan,21661
Brooklyn,20104
Queens,5666
Bronx,1091
Staten Island,373


In [7]:
categorical_columns = list(df.dtypes[df.dtypes == 'object'].index)

for c in categorical_columns:
    df[c] = df[c].str.lower().str.replace(' ', '_')

In [8]:
from sklearn.model_selection import train_test_split

df_full_train, df_test = train_test_split(df, test_size=0.2, random_state=42)
df_train, df_val = train_test_split(df_full_train, test_size=0.25, random_state=42)
len(df_train), len(df_val), len(df_test)

(29337, 9779, 9779)

In [9]:
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values

In [10]:
del df_train['price']
del df_val['price']
del df_test['price']

### Question 2

In [11]:
numerical = ['latitude', 'longitude', 'minimum_nights', 'number_of_reviews', 
             'reviews_per_month', 'calculated_host_listings_count', 'availability_365']

In [12]:
df_train[numerical].corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


reviews_per_month/number_of_reviews,
availability_365/calculated_host_listings_count

In [13]:
above_average = 1
y_train_bin = np.where(y_train > 152, above_average, 0)

### Question 3

In [14]:
from sklearn.metrics import mutual_info_score

In [15]:
mutual_info_score(y_train_bin, df_train.neighbourhood_group)

0.04651348750524772

In [16]:
round(mutual_info_score(y_train_bin, df_train.room_type), 2)

0.14

### Question 4

In [17]:
from sklearn.feature_extraction import DictVectorizer
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')

In [18]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [19]:
from sklearn.linear_model import LogisticRegression

In [20]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, y_train_bin)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(random_state=42)

In [28]:
y_pred = model.predict_proba(X_val)[:, 1]

In [29]:
price_decision = (y_pred >= 0.5)

In [30]:
y_val_bin = np.where(y_val > 152, above_average, 0)

In [36]:
round((y_val_bin == price_decision).mean(), 2)

0.79

### Question 5

In [37]:
model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
model.fit(X_train, y_train_bin)
y_pred = model.predict_proba(X_val)[:, 1]
price_decision = (y_pred >= 0.5)
y_val_bin = np.where(y_val > 152, above_average, 0)
original_accuracy = round((y_val_bin == price_decision).mean(), 2)
original_accuracy

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.79

In [43]:
diff_accuracy = []
for feature in df_train.columns:
    df_wo_feature_train = df_train.drop([feature], axis=1)
    df_wo_feature_val = df_val.drop([feature], axis=1)
    
    train_dicts = df_wo_feature_train.to_dict(orient='records')
    val_dicts = df_wo_feature_val.to_dict(orient='records')
    dv = DictVectorizer(sparse=False)
    X_train = dv.fit_transform(train_dicts)
    X_val = dv.transform(val_dicts)
    
    model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
    model.fit(X_train, y_train_bin)
    y_pred = model.predict_proba(X_val)[:, 1]
    price_decision = (y_pred >= 0.5)
    current_accuracy = round((y_val_bin == price_decision).mean(), 2)
    
    diff_accuracy.append(original_accuracy - current_accuracy)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logist

In [45]:
dict(zip(df_train.columns, diff_accuracy))

{'neighbourhood_group': 0.040000000000000036,
 'room_type': 0.08000000000000007,
 'latitude': 0.0,
 'longitude': 0.0,
 'minimum_nights': 0.010000000000000009,
 'number_of_reviews': 0.0,
 'reviews_per_month': 0.010000000000000009,
 'calculated_host_listings_count': 0.0,
 'availability_365': 0.010000000000000009}

### Question 6

In [56]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)

In [57]:
from sklearn.linear_model import Ridge
from sklearn.metrics import mean_squared_error

In [58]:
train_dicts = df_train.to_dict(orient='records')
val_dicts = df_val.to_dict(orient='records')
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dicts)
X_val = dv.transform(val_dicts)

In [66]:
for a in [0, 0.01, 0.1, 1, 10]:
    model_ridge = Ridge(alpha=a)
    model_ridge.fit(X_train, y_train_log)
    y_pred = model_ridge.predict(X_val)
    rmse = round(mean_squared_error(y_val_log, y_pred), 3)
    print(f'alpha={a} rmse={rmse}')

alpha=0 rmse=0.247
alpha=0.01 rmse=0.247
alpha=0.1 rmse=0.247
alpha=1 rmse=0.247
alpha=10 rmse=0.248


alpha=0.01