# Import libraries 

In [1]:
import pandas as pd
import numpy as np

import seaborn as sns
from matplotlib import pyplot as plt
%matplotlib inline

# Load the data

In [2]:
df = pd.read_csv('AB_NYC_2019.csv')

In [4]:
df.head().T

Unnamed: 0,0,1,2,3,4
id,2539,2595,3647,3831,5022
name,Clean & quiet apt home by the park,Skylit Midtown Castle,THE VILLAGE OF HARLEM....NEW YORK !,Cozy Entire Floor of Brownstone,Entire Apt: Spacious Studio/Loft by central park
host_id,2787,2845,4632,4869,7192
host_name,John,Jennifer,Elisabeth,LisaRoxanne,Laura
neighbourhood_group,Brooklyn,Manhattan,Manhattan,Brooklyn,Manhattan
neighbourhood,Kensington,Midtown,Harlem,Clinton Hill,East Harlem
latitude,40.64749,40.75362,40.80902,40.68514,40.79851
longitude,-73.97237,-73.98377,-73.9419,-73.95976,-73.94399
room_type,Private room,Entire home/apt,Private room,Entire home/apt,Entire home/apt
price,149,225,150,89,80


In [5]:
features = ['neighbourhood_group','room_type','latitude',
'longitude','price','minimum_nights','number_of_reviews',
'reviews_per_month','calculated_host_listings_count','availability_365']

In [123]:
numerical = ['latitude','longitude','minimum_nights','number_of_reviews',
             'reviews_per_month','calculated_host_listings_count','availability_365']

In [124]:
categorical = ['neighbourhood_group','room_type']

In [6]:
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                 10052
calculated_host_listings_count        0
availability_365                      0
dtype: int64

# Fill the missing values with 0

In [7]:
df[features] = df[features].fillna(0)

In [8]:
df.isnull().sum()

id                                    0
name                                 16
host_id                               0
host_name                            21
neighbourhood_group                   0
neighbourhood                         0
latitude                              0
longitude                             0
room_type                             0
price                                 0
minimum_nights                        0
number_of_reviews                     0
last_review                       10052
reviews_per_month                     0
calculated_host_listings_count        0
availability_365                      0
dtype: int64

**clean the data**

In [9]:
df.columns = df.columns.str.lower().str.replace(' ', '_')

string_columns = list(df.dtypes[df.dtypes == 'object'].index)

for col in string_columns:
    df[col] = df[col].str.lower().str.replace(' ', '_')

# Question #1
**What is the most frequent observation (mode) for the column 'neighbourhood_group'?**

In [10]:
df['neighbourhood_group'].value_counts()

manhattan        21661
brooklyn         20104
queens            5666
bronx             1091
staten_island      373
Name: neighbourhood_group, dtype: int64

**The most repeated is (manhattan)**

# Split the data
* Split your data in train/val/test sets, with 60%/20%/20% distribution.
* Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
* Make sure that the target value ('price') is not in your dataframe.

In [11]:
from sklearn.model_selection import train_test_split


df_full_train , df_test = train_test_split(df,test_size = 0.2, random_state = 42)

len (df_full_train) , len(df_test)

(39116, 9779)

In [13]:
df_train , df_val  = train_test_split(df_full_train,test_size = 0.25, random_state = 1)

len (df_train) , len(df_val) ,len(df_test)

(29337, 9779, 9779)

In [15]:
df_train

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,price,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365
2777,1520806,huge_2br+1ba_apt_for_group_only_15_min_to_nycity,347642,jullien,brooklyn,bedford-stuyvesant,40.68084,-73.92804,entire_home/apt,170,3,134,2019-06-20,2.00,3,209
26060,20794837,one_bedroom,149071197,sydnee,manhattan,harlem,40.82040,-73.93867,private_room,50,1,2,2017-09-10,0.09,1,0
39854,30965275,basement_in_a_gorgeous_brooklyn_brownstone,231370569,sharyn,brooklyn,bedford-stuyvesant,40.69007,-73.94326,private_room,40,7,13,2019-06-26,2.39,1,93
31667,24677673,"large,_private,_sunny_room_8_min_to_subway_-ha...",4538012,karen,manhattan,east_harlem,40.80931,-73.93982,private_room,65,4,8,2019-06-18,0.64,4,0
3790,2282085,duplex_ph_2_bedloft_williamsburg,2044302,christopher,brooklyn,williamsburg,40.71969,-73.95830,entire_home/apt,500,1,0,,0.00,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
37462,29730504,brooklyn_apartment_with_tons_of_light,223667060,rachel,brooklyn,bushwick,40.69293,-73.90537,entire_home/apt,100,1,4,2019-02-10,0.48,1,35
35693,28335031,chic_studio_in_a_great_neighborhood,11050667,michelle,manhattan,chelsea,40.73917,-73.99412,entire_home/apt,154,4,7,2019-05-14,0.77,1,0
2086,941642,hell's_kitchen_studio_for_2_or_3,4888599,rod,manhattan,hell's_kitchen,40.76662,-73.99302,entire_home/apt,179,3,209,2019-06-11,2.70,1,202
26057,20792967,your_own_flat_in_historic_bed-stuy_district,2648270,jaweer,brooklyn,bedford-stuyvesant,40.68462,-73.93724,entire_home/apt,74,4,1,2017-09-30,0.05,1,0


In [16]:
df_full_train = df_full_train.reset_index(drop = True)
df_train = df_train.reset_index(drop=True)
df_val = df_val.reset_index(drop=True)
df_test = df_test.reset_index(drop=True)

**delete the price from data frame**

In [17]:
y_train = df_train.price.values
y_val = df_val.price.values
y_test = df_test.price.values


del df_train['price']
del df_val['price']
del df_test['price']

# Question 2
* Create the correlation matrix for the numerical features of your train dataset.
* In a correlation matrix, you compute the correlation coefficient between every pair of features in the dataset.
* What are the two features that have the biggest correlation in this dataset?

In [22]:
df_corr_matrix = df_train[numerical].corr()

In [23]:
df_corr_matrix

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.077832,0.022611,-0.009668,-0.016642,0.018823,-0.008696
longitude,0.077832,1.0,-0.064854,0.057652,0.131194,-0.117221,0.08489
minimum_nights,0.022611,-0.064854,1.0,-0.078089,-0.122739,0.124147,0.142787
number_of_reviews,-0.009668,0.057652,-0.078089,1.0,0.578046,-0.072686,0.179343
reviews_per_month,-0.016642,0.131194,-0.122739,0.578046,1.0,-0.045882,0.166073
calculated_host_listings_count,0.018823,-0.117221,0.124147,-0.072686,-0.045882,1.0,0.224027
availability_365,-0.008696,0.08489,0.142787,0.179343,0.166073,0.224027,1.0


**The biggest correlation happens between number_of_reviews & reviews_per_month = 0.578046**

# Make price binary
* We need to turn the price variable from numeric into binary.
* Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

In [54]:
y_train_above_avg = (y_train >= 152).astype('int')
y_val_above_avg = (y_val >= 152).astype('int')
y_test_above_avg = (y_test >= 152).astype('int')

In [55]:
y_train_above_avg.astype('int')

array([1, 0, 0, ..., 1, 0, 0])

In [31]:
y_train

array([170,  50,  40, ..., 179,  74,  90], dtype=int64)

In [33]:
df_train['above_average'] = (y_train >= 152)

In [34]:
df_train

Unnamed: 0,id,name,host_id,host_name,neighbourhood_group,neighbourhood,latitude,longitude,room_type,minimum_nights,number_of_reviews,last_review,reviews_per_month,calculated_host_listings_count,availability_365,above_average
0,1520806,huge_2br+1ba_apt_for_group_only_15_min_to_nycity,347642,jullien,brooklyn,bedford-stuyvesant,40.68084,-73.92804,entire_home/apt,3,134,2019-06-20,2.00,3,209,True
1,20794837,one_bedroom,149071197,sydnee,manhattan,harlem,40.82040,-73.93867,private_room,1,2,2017-09-10,0.09,1,0,False
2,30965275,basement_in_a_gorgeous_brooklyn_brownstone,231370569,sharyn,brooklyn,bedford-stuyvesant,40.69007,-73.94326,private_room,7,13,2019-06-26,2.39,1,93,False
3,24677673,"large,_private,_sunny_room_8_min_to_subway_-ha...",4538012,karen,manhattan,east_harlem,40.80931,-73.93982,private_room,4,8,2019-06-18,0.64,4,0,False
4,2282085,duplex_ph_2_bedloft_williamsburg,2044302,christopher,brooklyn,williamsburg,40.71969,-73.95830,entire_home/apt,1,0,,0.00,1,0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29332,29730504,brooklyn_apartment_with_tons_of_light,223667060,rachel,brooklyn,bushwick,40.69293,-73.90537,entire_home/apt,1,4,2019-02-10,0.48,1,35,False
29333,28335031,chic_studio_in_a_great_neighborhood,11050667,michelle,manhattan,chelsea,40.73917,-73.99412,entire_home/apt,4,7,2019-05-14,0.77,1,0,True
29334,941642,hell's_kitchen_studio_for_2_or_3,4888599,rod,manhattan,hell's_kitchen,40.76662,-73.99302,entire_home/apt,3,209,2019-06-11,2.70,1,202,True
29335,20792967,your_own_flat_in_historic_bed-stuy_district,2648270,jaweer,brooklyn,bedford-stuyvesant,40.68462,-73.93724,entire_home/apt,4,1,2017-09-30,0.05,1,0,False


# Question 3
* Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
* Which of these two variables has bigger score?
* Round it to 2 decimal digits using round(score, 2)

In [35]:
from sklearn.metrics import mutual_info_score

In [36]:
def mutual_info_price_score(series):
    return mutual_info_score(series,df_train.above_average)

In [37]:
mi = df_train[categorical].apply(mutual_info_price_score)

In [39]:
mi.sort_values(ascending = False).round(2)

room_type              0.14
neighbourhood_group    0.05
dtype: float64

* The room_type has the biggest score = 0.14

# Question 4
* Now let's train a logistic regression
* Remember that we have two categorical variables in the data. Include them using one-hot encoding.
* Fit the model on the training dataset.
* To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
* model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
* Calculate the accuracy on the validation dataset and rount it to 2 decimal digits.

**first we include one-hot encoding**

In [40]:
from sklearn.feature_extraction import DictVectorizer

In [46]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')

dv = DictVectorizer(sparse = False)

In [47]:
dv.fit(train_dicts)
X_train = dv.transform(train_dicts)
X_train
X_train.shape

(29337, 15)

In [49]:
X_train

array([[209.     ,   3.     ,  40.68084, ...,   1.     ,   0.     ,
          0.     ],
       [  0.     ,   1.     ,  40.8204 , ...,   0.     ,   1.     ,
          0.     ],
       [ 93.     ,   1.     ,  40.69007, ...,   0.     ,   1.     ,
          0.     ],
       ...,
       [202.     ,   1.     ,  40.76662, ...,   1.     ,   0.     ,
          0.     ],
       [  0.     ,   1.     ,  40.68462, ...,   1.     ,   0.     ,
          0.     ],
       [235.     ,   3.     ,  40.82332, ...,   0.     ,   1.     ,
          0.     ]])

In [50]:
X_train[0]

array([209.     ,   3.     ,  40.68084, -73.92804,   3.     ,   0.     ,
         1.     ,   0.     ,   0.     ,   0.     , 134.     ,   2.     ,
         1.     ,   0.     ,   0.     ])

In [51]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=bronx',
 'neighbourhood_group=brooklyn',
 'neighbourhood_group=manhattan',
 'neighbourhood_group=queens',
 'neighbourhood_group=staten_island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=entire_home/apt',
 'room_type=private_room',
 'room_type=shared_room']

In [52]:
val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')

dv = DictVectorizer(sparse = False)

dv.fit(val_dicts)
X_val = dv.transform(val_dicts)
X_val
X_val.shape

(9779, 15)

**Train the model using logistic regression**

In [53]:
from sklearn.linear_model import LogisticRegression

In [56]:
model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
model.fit(X_train,y_train_above_avg)

LogisticRegression(random_state=42, solver='liblinear')

In [57]:
model.intercept_[0]

-0.10349402323328087

In [58]:
model.coef_[0].round(3)

array([ 3.000e-03,  4.000e-03, -5.870e+00, -3.195e+00, -1.300e-02,
       -2.180e-01,  1.900e-01,  1.617e+00,  6.000e-03, -1.698e+00,
       -3.000e-03, -5.400e-02,  1.931e+00, -8.610e-01, -1.173e+00])

In [60]:
model.predict_proba(X_train)

array([[0.61707695, 0.38292305],
       [0.93813471, 0.06186529],
       [0.96477595, 0.03522405],
       ...,
       [0.41931486, 0.58068514],
       [0.64486606, 0.35513394],
       [0.88004806, 0.11995194]])

In [61]:
y_pred = model.predict_proba(X_train)[:,1]

In [62]:
price_above_avg_decision = (y_pred >= 0.5)

In [63]:
y_pred = model.predict_proba(X_val)[:,1]

price_above_avg_decision = (y_pred >= 0.5)

In [65]:
y_val_above_avg

array([1, 1, 0, ..., 0, 0, 0])

In [66]:
price_above_avg_decision.astype('int')

array([1, 0, 0, ..., 0, 0, 0])

In [67]:
(y_val_above_avg ==price_above_avg_decision).mean()

0.7950710706616219

**So the accuracy of the model is**

In [70]:
accuracy = (y_val_above_avg ==price_above_avg_decision).mean().round(3)
accuracy

0.795

the closest one on the form is 0.79

# Question 5
* We have 9 features: 7 numerical features and 2 categorical.
* Let's find the least useful one using the feature elimination technique.
* Train a model with all these features (using the same parameters as in Q4).
* Now exclude each feature from this set and train a model without it. Record the accuracy for each model.
* For each feature, calculate the difference between the original accuracy and the accuracy without the feature.
* Which of following feature has the smallest difference?
*   neighbourhood_group
*   room_type
*   number_of_reviews
*   reviews_per_month
*note: the difference doesn't have to be positive

In [116]:
to_exclude_numerical = ['number_of_reviews','reviews_per_month',]

In [137]:
to_exclude_categorical = ['neighbourhood_group', 'room_type']

In [126]:
num = numerical.copy()

In [127]:
num.remove(to_exclude_numerical[0])

In [128]:
num

['latitude',
 'longitude',
 'minimum_nights',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [129]:
numerical

['latitude',
 'longitude',
 'minimum_nights',
 'number_of_reviews',
 'reviews_per_month',
 'calculated_host_listings_count',
 'availability_365']

In [144]:
    train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)

    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)

    val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)

    dv.fit(val_dicts)
    X_val = dv.transform(val_dicts)

    
    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train,y_train_above_avg)

    y_pred = model.predict_proba(X_val)[:,1]
    price_above_avg_decision = (y_pred >= 0.5)
    

    accuracy = (y_val_above_avg ==price_above_avg_decision).mean()
    accuracy

0.7950710706616219

**First let's start excluding numericals**

In [143]:
for i in range(len(to_exclude_numerical)):
    num = numerical.copy()
    num.remove(to_exclude_numerical[i], )
    
    
    train_dicts = df_train[categorical + num].to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)

    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)

    val_dicts = df_val[categorical + num].to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)

    dv.fit(val_dicts)
    X_val = dv.transform(val_dicts)

    
    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train,y_train_above_avg)

    y_pred = model.predict_proba(X_val)[:,1]
    price_above_avg_decision = (y_pred >= 0.5)
    

    accuracy_exclude = (y_val_above_avg ==price_above_avg_decision).mean()
    diff = accuracy - accuracy_exclude
    print(to_exclude_numerical[i], accuracy_exclude, diff)
    
    
for i in range(len(to_exclude_categorical)):
    cat = categorical.copy()
    cat.remove(to_exclude_categorical[i], )
    
    
    train_dicts = df_train[cat + numerical].to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)

    dv.fit(train_dicts)
    X_train = dv.transform(train_dicts)

    val_dicts = df_val[cat + numerical].to_dict(orient = 'records')
    dv = DictVectorizer(sparse = False)

    dv.fit(val_dicts)
    X_val = dv.transform(val_dicts)

    
    model = LogisticRegression(solver='liblinear', C=1.0, random_state=42)
    model.fit(X_train,y_train_above_avg)

    y_pred = model.predict_proba(X_val)[:,1]
    price_above_avg_decision = (y_pred >= 0.5)
    

    accuracy_exclude = (y_val_above_avg ==price_above_avg_decision).mean()
    diff = accuracy - accuracy_exclude
    print(to_exclude_categorical[i], accuracy_exclude, diff)
    
    

number_of_reviews 0.793946211269046 0.0011248593925758943
reviews_per_month 0.7954801104407404 -0.0004090397791185474
neighbourhood_group 0.7457817772778402 0.04928929338378163
room_type 0.7374987217506902 0.05757234891093166


**reviews_per_month  has the smallest difference = -0.0004090397791185474**

# Question 6
* For this question, we'll see how to use a linear regression model from Scikit-Learn
* We'll need to use the original column 'price'. Apply the logarithmic transformation to this column.
* Fit the Ridge regression model on the training data.
* This model has a parameter alpha. Let's try the following values: [0, 0.01, 0.1, 1, 10]
* Which of these alphas leads to the best RMSE on the validation set? Round your RMSE scores to 3 decimal digits.
* If there are multiple options, select the smallest alpha.

In [145]:
from sklearn.linear_model import LinearRegression

In [205]:
y_train_log = np.log1p(y_train)
y_val_log = np.log1p(y_val)
y_test_log = np.log1p(y_test)

In [206]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)

dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)

dv.fit(val_dicts)
X_val = dv.transform(val_dicts)

    
model = LinearRegression().fit(X_train,y_train_log)

y_pred = model.predict(X_val)

In [207]:
dv.get_feature_names()

['availability_365',
 'calculated_host_listings_count',
 'latitude',
 'longitude',
 'minimum_nights',
 'neighbourhood_group=bronx',
 'neighbourhood_group=brooklyn',
 'neighbourhood_group=manhattan',
 'neighbourhood_group=queens',
 'neighbourhood_group=staten_island',
 'number_of_reviews',
 'reviews_per_month',
 'room_type=entire_home/apt',
 'room_type=private_room',
 'room_type=shared_room']

In [208]:
y_pred

array([5.16931276, 5.12277502, 5.0008242 , ..., 4.97340293, 4.83555515,
       4.93809752])

In [209]:
y_val_log = np.log1p(y_val)
y_val_log

array([5.62040087, 5.08140436, 5.01727984, ..., 4.30406509, 5.01727984,
       4.7095302 ])

In [210]:
from sklearn.metrics import mean_squared_error

In [211]:
mean_squared_error(y_val_log, y_pred)

0.2552679669025112

In [212]:
y_val[5]

250

In [213]:
np.expm1(y_val_log[5])

249.9999999999999

In [214]:
y_pred

array([5.16931276, 5.12277502, 5.0008242 , ..., 4.97340293, 4.83555515,
       4.93809752])

In [215]:
from sklearn.linear_model import Ridge

In [216]:
train_dicts = df_train[categorical + numerical].to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)

dv.fit(train_dicts)
X_train = dv.transform(train_dicts)

val_dicts = df_val[categorical + numerical].to_dict(orient = 'records')
dv = DictVectorizer(sparse = False)

dv.fit(val_dicts)
X_val = dv.transform(val_dicts)

In [229]:
alpha_ = [0, 0.01, 0.1, 1, 10]

In [233]:
for i in alpha_:
    rr = Ridge(alpha=i)
    rr.fit(X_train, y_train_log) 
    pred_val_rr= rr.predict(X_val)
    print(i , np.sqrt(mean_squared_error(y_val_log,pred_val_rr)).round(3))

0 0.505
0.01 0.505
0.1 0.505
1 0.505
10 0.507


  return linalg.solve(A, Xy, sym_pos=True,


**The answer is the smallest value = 0**

In [226]:
pred_val_rr

array([5.15457261, 5.14464515, 4.99817858, ..., 4.96673898, 4.81059303,
       4.93905848])

In [227]:
y_val_log

array([5.62040087, 5.08140436, 5.01727984, ..., 4.30406509, 5.01727984,
       4.7095302 ])