The original Assignment 3 document from ML_Zoomcamp by Alexey Grigorev of [Data Talks Club](https://datatalks.club/) is available [here](https://github.com/alexeygrigorev/mlbookcamp-code/blob/master/course-zoomcamp/03-classification/homework.md).

The link for the dataset used in the assignment is [here](https://www.kaggle.com/dgomonov/new-york-city-airbnb-open-data).

---
---

# Initialization & Overview

In [179]:
import pandas as pd
import numpy as np
import seaborn as sns
from matplotlib import pyplot as pyplot

from sklearn.model_selection import train_test_split
from sklearn.metrics import mutual_info_score
from sklearn.feature_extraction import DictVectorizer
from sklearn.linear_model import LogisticRegression

%matplotlib inline

### Select desired columns and fill in the missing values with 0.

In [150]:
# Columns kept for the assignment: eight numeric features (including the
# target 'price') followed by the two categorical ones.
selection = [
    'latitude',
    'longitude',
    'price',
    'minimum_nights',
    'number_of_reviews',
    'reviews_per_month',
    'calculated_host_listings_count',
    'availability_365',
    'neighbourhood_group',
    'room_type',
]

# Read the raw file, keep only the selected columns (in that order) and
# replace every missing value with 0.
df = pd.read_csv('AB_NYC_2019.csv').loc[:, selection].fillna(0)

In [151]:
# Show dtypes and non-null counts for the selected columns after fillna(0).
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 48895 entries, 0 to 48894
Data columns (total 10 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   latitude                        48895 non-null  float64
 1   longitude                       48895 non-null  float64
 2   price                           48895 non-null  int64  
 3   minimum_nights                  48895 non-null  int64  
 4   number_of_reviews               48895 non-null  int64  
 5   reviews_per_month               48895 non-null  float64
 6   calculated_host_listings_count  48895 non-null  int64  
 7   availability_365                48895 non-null  int64  
 8   neighbourhood_group             48895 non-null  object 
 9   room_type                       48895 non-null  object 
dtypes: float64(3), int64(5), object(2)
memory usage: 3.7+ MB


In [152]:
# Summary statistics of the numeric columns, rounded to two decimals for display.
df.describe().round(decimals=2)

Unnamed: 0,latitude,longitude,price,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
count,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0,48895.0
mean,40.73,-73.95,152.72,7.03,23.27,1.09,7.14,112.78
std,0.05,0.05,240.15,20.51,44.55,1.6,32.95,131.62
min,40.5,-74.24,0.0,1.0,0.0,0.0,1.0,0.0
25%,40.69,-73.98,69.0,1.0,1.0,0.04,1.0,0.0
50%,40.72,-73.96,106.0,3.0,5.0,0.37,1.0,45.0
75%,40.76,-73.94,175.0,5.0,24.0,1.58,2.0,227.0
max,40.91,-73.71,10000.0,1250.0,629.0,58.5,327.0,365.0


In [153]:
def quick_look(mydf, m=5):
    """Print a short per-column overview of a DataFrame.

    For the frame as a whole the type and shape are printed; then, for each
    column of ``mydf``, the number of unique values, the dtype, the number of
    missing values and the first ``m`` unique values.

    Parameters
    ----------
    mydf : pandas.DataFrame
        Frame to summarise.
    m : int, default 5
        How many unique values to show per column.
    """
    print(f"It is in type of {type(mydf)} and shape of {mydf.shape}")
    # Iterate over the columns of the frame that was passed in, not over the
    # notebook-global `df`, so the helper works for any DataFrame.
    for col in mydf.columns:
        print(str(col).upper(), 
            f"column has {mydf[col].nunique()} unique values (in {mydf[col].dtypes} format) and {mydf[col].isnull().sum()} NaNs")
        print(mydf[col].unique()[:m], "\n")

# Overview of every column, showing the first five unique values of each.
quick_look(df, m=5)

It is in type of <class 'pandas.core.frame.DataFrame'> and shape of (48895, 10)
LATITUDE column has 19048 unique values (in float64 format) and 0 NaNs
[40.64749 40.75362 40.80902 40.68514 40.79851] 

LONGITUDE column has 14718 unique values (in float64 format) and 0 NaNs
[-73.97237 -73.98377 -73.9419  -73.95976 -73.94399] 

PRICE column has 674 unique values (in int64 format) and 0 NaNs
[149 225 150  89  80] 

MINIMUM_NIGHTS column has 109 unique values (in int64 format) and 0 NaNs
[ 1  3 10 45  2] 

NUMBER_OF_REVIEWS column has 394 unique values (in int64 format) and 0 NaNs
[  9  45   0 270  74] 

REVIEWS_PER_MONTH column has 938 unique values (in float64 format) and 0 NaNs
[0.21 0.38 0.   4.64 0.1 ] 

CALCULATED_HOST_LISTINGS_COUNT column has 47 unique values (in int64 format) and 0 NaNs
[6 2 1 4 3] 

AVAILABILITY_365 column has 366 unique values (in int64 format) and 0 NaNs
[365 355 194   0 129] 

NEIGHBOURHOOD_GROUP column has 5 unique values (in object format) and 0 NaNs
['Brookly

# Q1 
## What is the most frequent observation (mode) for the column 'neighbourhood_group'?

In [253]:
# value_counts() sorts by frequency (descending), so the label holding the
# maximum count is the most frequent neighbourhood group.
df['neighbourhood_group'].value_counts().idxmax()

'Manhattan'

---
---
### Split the data
+ Split your data in train/val/test sets, with 60%/20%/20% distribution.
+ Use Scikit-Learn for that (the train_test_split function) and set the seed to 42.
+ Make sure that the target value ('price') is not in your dataframe.

In [155]:
# First hold out 20% of the rows for testing ...
df_temporary_train, df_test = train_test_split(
    df,
    test_size=0.2,
    random_state=42,
)
# ... then take 25% of the remaining 80% for validation (20% of the total),
# which leaves 60% of the rows for training.
df_train, df_val = train_test_split(
    df_temporary_train,
    test_size=0.25,
    random_state=42,
)

In [156]:
# Row counts of the train / validation / test subsets, in that order.
tuple(len(subset) for subset in (df_train, df_val, df_test))

(29337, 9779, 9779)

In [157]:
# Replace the shuffled original row labels with a fresh 0..n-1 index in each subset.
df_train, df_val, df_test = (
    subset.reset_index(drop=True) for subset in (df_train, df_val, df_test)
)

In [158]:
# Move the target out of each feature frame: pop() returns the 'price'
# column and removes it from the frame in one step, so the target can no
# longer be picked up as a feature.
price_train = df_train.pop('price').values
price_val = df_val.pop('price').values
price_test = df_test.pop('price').values

In [159]:
# Correlation matrix of the numeric training features. The frame still holds
# the two categorical (string) columns, which corr() refuses to handle in
# recent pandas versions, so restrict it to numeric columns explicitly.
df_train.select_dtypes(include='number').corr()

Unnamed: 0,latitude,longitude,minimum_nights,number_of_reviews,reviews_per_month,calculated_host_listings_count,availability_365
latitude,1.0,0.080301,0.027441,-0.006246,-0.007159,0.019375,-0.005891
longitude,0.080301,1.0,-0.06066,0.055084,0.134642,-0.117041,0.083666
minimum_nights,0.027441,-0.06066,1.0,-0.07602,-0.120703,0.118647,0.138901
number_of_reviews,-0.006246,0.055084,-0.07602,1.0,0.590374,-0.073167,0.174477
reviews_per_month,-0.007159,0.134642,-0.120703,0.590374,1.0,-0.048767,0.165376
calculated_host_listings_count,0.019375,-0.117041,0.118647,-0.073167,-0.048767,1.0,0.225913
availability_365,-0.005891,0.083666,0.138901,0.174477,0.165376,0.225913,1.0


# Q2
## What are the two features that have the biggest correlation in this dataset?

In [160]:
# Pairwise correlations of the numeric columns, flattened to one
# (feature, feature) -> value series and sorted from strongest positive down.
# The leading 1.0 entries are each column's correlation with itself; the
# first pair below them is the answer. Non-numeric columns are excluded
# explicitly because corr() raises on string columns in recent pandas.
(
    df.select_dtypes(include='number')
    .corr()
    .unstack()
    .sort_values(ascending=False)
    .head(20)
)


latitude                        latitude                          1.000000
longitude                       longitude                         1.000000
calculated_host_listings_count  calculated_host_listings_count    1.000000
reviews_per_month               reviews_per_month                 1.000000
number_of_reviews               number_of_reviews                 1.000000
minimum_nights                  minimum_nights                    1.000000
price                           price                             1.000000
availability_365                availability_365                  1.000000
reviews_per_month               number_of_reviews                 0.589407
number_of_reviews               reviews_per_month                 0.589407
availability_365                calculated_host_listings_count    0.225701
calculated_host_listings_count  availability_365                  0.225701
number_of_reviews               availability_365                  0.172028
availability_365         

---
---

### Make price binary
+ We need to turn the price variable from numeric into binary.
+ Let's create a variable above_average which is 1 if the price is above (or equal to) 152.

In [161]:
# Threshold from the task statement: a listing is "above average" when its
# price is greater than or equal to 152.
price_threshold = 152

# Build fresh 0/1 label arrays instead of writing into the existing arrays in
# place: the originals come from `Series.values`, which can be read-only
# (pandas Copy-on-Write), and a single comparison avoids the order-dependent
# two-step masking.
# NOTE: run this cell once per kernel session - the raw prices are replaced
# by the binary labels, so a second run would compare 0/1 with the threshold.
price_train = (price_train >= price_threshold).astype(int)
price_val = (price_val >= price_threshold).astype(int)
price_test = (price_test >= price_threshold).astype(int)


# Question 3
## Calculate the mutual information score with the (binarized) price for the two categorical variables that we have. Use the training set only.
+ Which of these two variables has bigger score?
+ Round it to 2 decimal digits using round(score, 2)

In [166]:
# Mutual information between the binarized price and each of the two
# categorical features, computed on the training split only.
for categorical in ('neighbourhood_group', 'room_type'):
    print(mutual_info_score(price_train, df_train[categorical]))

0.04650605348506435
0.14322617342090396


---
---

# Question 4
## Now let's train a logistic regression
+ Remember that we have two categorical variables in the data. Include them using one-hot encoding.
    + Fit the model on the training dataset.
    + To make sure the results are reproducible across different versions of Scikit-Learn, fit the model with these parameters:
    + model = LogisticRegression(solver='lbfgs', C=1.0, random_state=42)
+ Calculate the accuracy on the validation dataset and round it to 2 decimal digits.

In [275]:
# One-hot encoding

# Turn each split into a list of {column: value} records for DictVectorizer.
train_dict, val_dict = (
    subset.to_dict(orient='records') for subset in (df_train, df_val)
)

# The vectorizer is fitted on the training records only and then reused,
# unchanged, to encode the validation records.
dv = DictVectorizer(sparse=False)
X_train = dv.fit_transform(train_dict)
X_val = dv.transform(val_dict)

In [276]:
# Training the model
# fit() returns the estimator itself, so construction and fitting can be chained.
model = LogisticRegression(
    solver='lbfgs',
    C=1.0,
    random_state=42,
    max_iter=1000,
).fit(X_train, price_train)
model

LogisticRegression(max_iter=1000, random_state=42)

In [281]:
def accuracy(model, train_val, target_val, treshold=0.5):
    """Return the share of rows whose thresholded prediction matches the target.

    Parameters
    ----------
    model : estimator exposing ``predict_proba``
        Column 1 of its output is used as the positive-class probability.
    train_val : feature matrix passed to ``model.predict_proba``.
    target_val : actual 0/1 labels, one per row of ``train_val``.
    treshold : float, default 0.5
        A row is predicted positive when its probability is >= this value.

    Returns
    -------
    Mean of the per-row "prediction equals actual" flags.
    """
    positive_proba = model.predict_proba(train_val)[:, 1]

    # Collect everything in one frame so the comparison is done row by row.
    report = pd.DataFrame({'probability': positive_proba})
    report['prediction'] = (positive_proba >= treshold).astype(int)
    report['actual'] = target_val
    report['correct'] = report['prediction'] == report['actual']

    return report['correct'].mean()

# Validation accuracy of the model trained on all features (default 0.5 threshold).
print(accuracy(model, train_val=X_val, target_val=price_val))

0.790878412925657


---
---

In [239]:
# Feature columns currently present in the training frame.
[df_train.columns.tolist()]

[['latitude',
  'longitude',
  'minimum_nights',
  'number_of_reviews',
  'reviews_per_month',
  'calculated_host_listings_count',
  'availability_365',
  'neighbourhood_group',
  'room_type']]

In [285]:
feature_list = list(df_train.columns)

# Baseline for the comparison: validation accuracy of the model trained on
# all features. It is computed here because no earlier cell defines
# `result_all_features`, so the loop would otherwise fail on a fresh kernel.
result_all_features = accuracy(model, X_val, price_val)

# Leave-one-feature-out: retrain without each feature in turn and report how
# much the validation accuracy drops (positive) or improves (negative).
for feature in feature_list:
    temporary_list = [col for col in feature_list if col != feature]

    # One-hot encoding, fitted on the reduced training set only.
    dv = DictVectorizer(sparse=False)

    train_dict2 = df_train[temporary_list].to_dict(orient='records')
    X_train2 = dv.fit_transform(train_dict2)

    # Encode validation records from the same reduced column set the
    # vectorizer was fitted on.
    val_dict2 = df_val[temporary_list].to_dict(orient='records')
    X_val2 = dv.transform(val_dict2)

    # Training the model
    model2 = LogisticRegression(solver='lbfgs', C=1.0, random_state=42, max_iter=1000)
    model2.fit(X_train2, price_train)

    result = accuracy(model2, X_val2, price_val)

    print(f"The difference for {feature} is: {(result_all_features - result)}")
  


The difference for latitude is: 0.003988137846405504
The difference for longitude is: 0.003885877901625867
The difference for minimum_nights is: 0.00010225994477952582
The difference for number_of_reviews is: -0.0004090397791185474
The difference for reviews_per_month is: 0.0003067798343387995
The difference for calculated_host_listings_count is: 0.001329379282135168
The difference for availability_365 is: 0.009612434809285197
The difference for neighbourhood_group is: 0.039881378464055595
The difference for room_type is: 0.06227630637079451


---
---