In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

In [2]:
data_3 = 'https://raw.githubusercontent.com/alexeygrigorev/datasets/master/AB_NYC_2019.csv'

In [3]:
df = pd.read_csv('data_3.csv')

In [4]:
df

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
0,2539,Clean & quiet apt home by the park,2787,John,Brooklyn,Kensington,40.64749,-73.97237,Private room,149,1,9,2018-10-19,0.21,6,365
1,2595,Skylit Midtown Castle,2845,Jennifer,Manhattan,Midtown,40.75362,-73.98377,Entire home/apt,225,1,45,2019-05-21,0.38,2,355
2,3647,THE VILLAGE OF HARLEM....NEW YORK !,4632,Elisabeth,Manhattan,Harlem,40.80902,-73.94190,Private room,150,3,0,,,1,365
3,3831,Cozy Entire Floor of Brownstone,4869,LisaRoxanne,Brooklyn,Clinton Hill,40.68514,-73.95976,Entire home/apt,89,1,270,2019-07-05,4.64,1,194
4,5022,Entire Apt: Spacious Studio/Loft by central park,7192,Laura,Manhattan,East Harlem,40.79851,-73.94399,Entire home/apt,80,10,9,2018-11-19,0.10,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
48890,36484665,Charming one bedroom - newly renovated rowhouse,8232441,Sabrina,Brooklyn,Bedford-Stuyvesant,40.67853,-73.94995,Private room,70,2,0,,,2,9
48891,36485057,Affordable room in Bushwick/East Williamsburg,6570630,Marisol,Brooklyn,Bushwick,40.70184,-73.93317,Private room,40,4,0,,,2,36
48892,36485431,Sunny Studio at Historical Neighborhood,23492952,Ilgar & Aysel,Manhattan,Harlem,40.81475,-73.94867,Entire home/apt,115,10,0,,,1,27
48893,36485609,43rd St. Time Square-cozy single bed,30985759,Taz,Manhattan,Hell's Kitchen,40.75751,-73.99112,Shared room,55,1,0,,,6,2


In [5]:
base = ['neighbourhood_group',
       'room_type', 'latitude', 'longitude', 'price',
       'minimum_nights', 'number_of_reviews', 'reviews_per_month',
       'calculated_host_listings_count', 'availability_365']

In [6]:
df_base = df[base]

In [7]:
df_base.isnull().sum()

neighbourhood_group                   0
room_type                             0
latitude                              0
longitude                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

In [8]:
df_base = df_base.fillna(0)

In [9]:
df_base.columns = df_base.columns.str.lower().str.replace(" ", "_")

cols = list(df_base.dtypes[df_base.dtypes == 'object'].index)

for col in cols:
    df_base[col] = df_base[col].str.lower().str.replace(" ", "_")

In [10]:
df_base.head(3)

Unnamed: 0,neighbourhood_group,room_type,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
0,brooklyn,private_room,40.64749,-73.97237,149,1,9,0.21,6,365
1,manhattan,entire_home/apt,40.75362,-73.98377,225,1,45,0.38,2,355
2,manhattan,private_room,40.80902,-73.9419,150,3,0,0.0,1,365


In [11]:
df_base['neighbourhood_group'].value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

In [12]:
full_train, df_test = train_test_split(df_base, test_size=0.2, random_state=42)
df_train, df_valid = train_test_split(full_train, test_size=0.25, random_state=42)

In [13]:
len(full_train), len(df_test)

(39116, 9779)

In [14]:
len(df_train), len(df_valid)

(29337, 9779)

In [15]:
y_full_train = full_train['price'].values
y_test = df_test['price'].values
y_train = df_train['price'].values
y_valid = df_valid['price'].values

In [16]:
full_train = full_train.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)
df_train = df_train.reset_index(drop=True)
df_valid = df_valid.reset_index(drop=True)

In [17]:
del full_train['price']
del df_test['price']
del df_train['price']
del df_valid['price']

In [18]:
y_full_train = (y_full_train >= 152).astype(int)
y_test = (y_test >= 152).astype(int)
y_train = (y_train >= 152).astype(int)
y_valid = (y_valid >= 152).astype(int)

In [19]:
numerical = ['latitude', 'longitude', 'minimum_nights',
            'number_of_reviews', 'reviews_per_month',
            'calculated_host_listings_count', 'availability_365']

categorical = ['neighbourhood_group', 'room_type']

In [20]:
# correlation matrix
matrix = df_base[numerical].corr().to_dict()

In [21]:
def give_highest(matrix):
    for key, inner_dict in matrix.items():
        del inner_dict[key]
        result = dict(sorted(inner_dict.items(), key=lambda item: item[1], reverse=True))
        matrix[key] = result
    
    return matrix


In [22]:
give_highest(matrix)


{'latitude': {'longitude': 0.08478836838942543,
  'minimum_nights': 0.024869274138726128,
  'calculated_host_listings_count': 0.019517351185378132,
  'availability_365': -0.010983458290208541,
  'number_of_reviews': -0.015388804497945684,
  'reviews_per_month': -0.01875772712330614},
 'longitude': {'reviews_per_month': 0.1385161659552922,
  'latitude': 0.08478836838942543,
  'availability_365': 0.08273074786310534,
  'number_of_reviews': 0.05909428794877501,
  'minimum_nights': -0.06274711429076898,
  'calculated_host_listings_count': -0.11471279117178322},
 'minimum_nights': {'availability_365': 0.14430306319924938,
  'calculated_host_listings_count': 0.1279596294349121,
  'latitude': 0.024869274138726128,
  'longitude': -0.06274711429076898,
  'number_of_reviews': -0.08011606824164533,
  'reviews_per_month': -0.1249049651159733},
 'number_of_reviews': {'reviews_per_month': 0.5894072970835077,
  'availability_365': 0.17202758146293173,
  'longitude': 0.05909428794877501,
  'latitude':

In [23]:
above_average = (df_base.price >= 152).astype(int)

In [24]:
len(above_average)

48895

In [25]:
new_df = pd.DataFrame()
new_df['binary'] = above_average
new_df['price'] = df_base.price

In [26]:
new_df

Unnamed: 0,binary,price
0,0,149
1,1,225
2,0,150
3,0,89
4,0,80
...,...,...
48890,0,70
48891,0,40
48892,0,115
48893,0,55


In [27]:
from sklearn.metrics import mutual_info_score

In [28]:
def calc_mutual(series):
    return mutual_info_score(series, above_average)


In [29]:
result = df_base[categorical].apply(calc_mutual)


In [30]:

round(result.sort_values(ascending=False).to_frame('Mut. info'), 2)


Unnamed: 0,Mut. info
room_type,0.14
neighbourhood_group,0.05


In [31]:

from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction import DictVectorizer


In [32]:
train_dict = df_train[categorical + numerical].to_dict(orient='records')
valid_dict = df_valid[categorical + numerical].to_dict(orient='records')


In [33]:
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)


In [34]:
dv.get_feature_names()


['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=bronx',
 'neighbourhood_group=brooklyn',
 'neighbourhood_group=manhattan',
 'neighbourhood_group=queens',
 'neighbourhood_group=staten_island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=entire_home/apt',
 'room_type=private_room',
 'room_type=shared_room']

In [35]:

model = LogisticRegression(solver='liblinear', dual=False, max_iter=2000,
                           C=1.0, random_state=42)


In [36]:
model.fit(X_train, y_train)

LogisticRegression(max_iter=2000, random_state=42, solver='liblinear')

In [37]:
model.intercept_


array([-0.08941197])

In [38]:
model.coef_.round(3)


array([[ 3.000e-03,  4.000e-03, -5.826e+00, -3.171e+00, -1.100e-02,
        -7.900e-02,  1.280e-01,  1.579e+00, -2.600e-02, -1.691e+00,
        -3.000e-03, -4.200e-02,  1.949e+00, -8.270e-01, -1.212e+00]])

In [39]:

X_valid = dv.transform(valid_dict)


In [40]:
soft_predictions = model.predict_proba(X_valid)[:, 1]


In [41]:
soft_predictions


array([0.02878852, 0.59571758, 0.42622019, ..., 0.11422766, 0.03458556,
       0.52821166])

In [42]:

dict(zip(dv.get_feature_names(), model.coef_[0].round(2)))


{'availability_365': 0.0,
 'calculated_host_listings_count': 0.0,
 'latitude': -5.83,
 'longitude': -3.17,
 'minimum_nights': -0.01,
 'neighbourhood_group=bronx': -0.08,
 'neighbourhood_group=brooklyn': 0.13,
 'neighbourhood_group=manhattan': 1.58,
 'neighbourhood_group=queens': -0.03,
 'neighbourhood_group=staten_island': -1.69,
 'number_of_reviews': -0.0,
 'reviews_per_month': -0.04,
 'room_type=entire_home/apt': 1.95,
 'room_type=private_room': -0.83,
 'room_type=shared_room': -1.21}

In [43]:

decision_threshold = (soft_predictions >= 0.5)

decision_threshold


array([False,  True, False, ..., False, False,  True])

In [44]:

round((decision_threshold == y_valid).mean(),2)


0.79

In [45]:

df_temp = pd.DataFrame()
df_temp['soft pred.'] = soft_predictions
df_temp['pred.'] = decision_threshold.astype(int)
df_temp['real data'] = y_valid
df_temp['correct'] = df_temp['pred.'] == df_temp['real data']

df_temp



Unnamed: 0,soft pred.,pred.,real data,correct
0,0.028789,0,0,True
1,0.595718,1,0,False
2,0.426220,0,1,False
3,0.074907,0,0,True
4,0.811702,1,1,True
...,...,...,...,...
9774,0.637514,1,1,True
9775,0.010381,0,0,True
9776,0.114228,0,0,True
9777,0.034586,0,0,True


In [46]:
class FeatureElimination:
    def __init__(self, initial_acc, train_df, valid_df, y_train, y_valid):
        self.initial_acc = initial_acc
        self.train_df = train_df
        self.valid_df = valid_df
        self.y_tr = y_train
        self.y_val = y_valid
        self.categorical = ['neighbourhood_group', 'room_type']
        self.numerical = ['latitude', 'longitude', 'minimum_nights',
            'number_of_reviews', 'reviews_per_month',
            'calculated_host_listings_count', 'availability_365']
        self.dv = DictVectorizer(sparse=False)
        self.model = LogisticRegression(solver='liblinear', dual=False, max_iter=2000,
                                        C=1.0, random_state=42)
    
    
    def get_result(self):
        all_features = self.categorical + self.numerical
        levels = {}
        
        for i in range(len(all_features)):
            curr_drop = all_features[i]
            
            features = self._remove_element(all_features, i)
            X_train, X_valid = self._transform_data(self.train_df, self.valid_df, features)
            
            self._train_model(X_train, self.y_tr)
            soft_predictions = self._predict_data(X_valid)
            curr_acc = self._get_accuracy(soft_predictions)
            
            levels[curr_drop] = (round(curr_acc, 2), self.initial_acc - curr_acc)
        
        return levels    
    
    def _remove_element(self, features, idx):
        if idx == 0:
            features = features[1:]
        elif idx == len(features) - 1:
            features = features[:-1]
        else:
            features = features[:idx] + features[idx + 1:]
        
        return features
    
    def _transform_data(self, train, valid, features):
        train_dict = train[features].to_dict(orient='records')
        valid_dict = valid[features].to_dict(orient='records')
        
        X_train = dv.fit_transform(train_dict)
        X_valid = dv.transform(valid_dict)
        
        return X_train, X_valid
    
    def _train_model(self, X, y):
        self.model.fit(X, y)
    
    def _predict_data(self, X):
        return self.model.predict_proba(X)[:, 1]
    
    def _get_accuracy(self, data, threshold=0.5):
        price_level = (data >= threshold)
        result = (price_level == self.y_val).mean()
        return result


In [48]:
feature_elim = FeatureElimination(0.79, df_train, df_valid, y_train, y_valid)
feature_elim.get_result()

{'neighbourhood_group': (0.75, 0.040127824930974554),
 'room_type': (0.73, 0.06150015338991721),
 'latitude': (0.79, 0.0032119848655282057),
 'longitude': (0.79, 0.003007464975968932),
 'minimum_nights': (0.79, 0.00034870641169859606),
 'number_of_reviews': (0.79, -0.0014919725943347562),
 'reviews_per_month': (0.79, -0.0007761529808774092),
 'calculated_host_listings_count': (0.79, 0.0005532263012578698),
 'availability_365': (0.78, 0.008529501994068989)}

In [49]:
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split


In [50]:
df_base.price = np.log1p(df_base.price)


In [51]:
full_train, df_test = train_test_split(df_base, test_size=0.2, random_state=5)
df_train, df_valid = train_test_split(full_train, test_size=0.25, random_state=5)


In [52]:
y_full_train = full_train.price.values
y_test = df_test.price.values
y_train = df_train.price.values
y_valid = df_valid.price.values


In [53]:
del full_train['price']
del df_test['price']
del df_train['price']
del df_valid['price']


In [55]:
df_base.isnull().sum()


neighbourhood_group               0
room_type                         0
latitude                          0
longitude                         0
price                             0
minimum_nights                    0
number_of_reviews                 0
reviews_per_month                 0
calculated_host_listings_count    0
availability_365                  0
dtype: int64

In [56]:
dict_train = df_train.to_dict(orient='records')
dict_valid = df_valid.to_dict(orient='records')


In [57]:
dv = DictVectorizer(sparse=False)


In [58]:
X_train = dv.fit_transform(dict_train)
X_valid = dv.transform(dict_valid)


In [59]:
from sklearn.metrics import mean_squared_error

def train_predict(a=0):
    model = Ridge(alpha=a)
    model.fit(X_train, y_train)
    
    pred = model.predict(X_valid)  
    rmse = np.sqrt(mean_squared_error(y_valid, pred))
    return round(rmse, 7)


In [60]:
results = {}

for i in [0, 0.01, 0.1, 1, 10]:
    results[i] = train_predict(a=i)


In [61]:
results


{0: 0.4863279, 0.01: 0.4863136, 0.1: 0.486313, 1: 0.4863199, 10: 0.4870211}