# Seattle Airbnb Data 
The data I used in this notebook refer to over 3800 accommodations that where enlisted to Airbnb in Seattle during 2016. They describe a wide variety of features: not only information about hosts, locations and fares applied to each listing, but also their average ratings and availability period. These datasets are available for free download at https://www.kaggle.com/datasets/airbnb/seattle, where you can find more details and other interesting studies about them.

## Train Regressors

#### Import libraries

In [1]:
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.linear_model import LinearRegression, ElasticNet 
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import r2_score, mean_squared_error
from sqlalchemy import create_engine
import pickle

import nltk
nltk.download(['punkt', 'wordnet', 'omw-1.4'])

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\2100\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\2100\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\2100\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

#### Load data

In [2]:
engine = create_engine('sqlite:///../data/AirbnbRatings.db')
df = pd.read_sql('SELECT * FROM listings', engine)

In [3]:
X = df.drop(['review_scores_rating'], axis=1)
y = df.review_scores_rating

### Write a tokenization function to process text data

In [4]:
def tokenize(text):
    
    tokens = word_tokenize(text)
    lemmatizer = WordNetLemmatizer()
    
    clean_tokens = []
    for tok in tokens:
        clean_tok = lemmatizer.lemmatize(tok).lower().strip()
        clean_tokens.append(clean_tok)

    return clean_tokens

### Build a machine learning pipeline
This machine pipeline should take in as input 
- the `description` column
- the most influential numeric columns in the dataset to predict ratings

and output rating results.

In [5]:
numeric_features = list(X.columns)
text_feature = "description"
numeric_features.remove(text_feature)

In [6]:
preprocess = make_column_transformer(
    (Pipeline([
        ('vect', CountVectorizer(tokenizer=tokenize)),
        ('tfidf', TfidfTransformer())
    ]), text_feature),
    (StandardScaler(), numeric_features)
)

In [7]:
pipeline = Pipeline([
        ('preprocess', preprocess),
        ('reg', DecisionTreeRegressor())
])

### Train pipeline
- Split data into train and test sets
- Train pipeline

In [8]:
X_train, X_test, y_train, y_test = train_test_split(X, y)

In [9]:
pipeline.fit(X_train, y_train)

Pipeline(steps=[('preprocess',
                 ColumnTransformer(transformers=[('pipeline',
                                                  Pipeline(steps=[('vect',
                                                                   CountVectorizer(tokenizer=<function tokenize at 0x00000198294E6CA0>)),
                                                                  ('tfidf',
                                                                   TfidfTransformer())]),
                                                  'description'),
                                                 ('standardscaler',
                                                  StandardScaler(),
                                                  ['host_is_superhost_t',
                                                   'host_response_time_within '
                                                   'an hour',
                                                   'host_identity_verified_t',
                               

In [10]:
pickle.dump(pipeline, open('../models/airbnb_pipeline.pkl', 'wb'))

### Improve the model
Use grid search to find better parameters. 

In [11]:
pipeline.get_params()

{'memory': None,
 'steps': [('preprocess',
   ColumnTransformer(transformers=[('pipeline',
                                    Pipeline(steps=[('vect',
                                                     CountVectorizer(tokenizer=<function tokenize at 0x00000198294E6CA0>)),
                                                    ('tfidf',
                                                     TfidfTransformer())]),
                                    'description'),
                                   ('standardscaler', StandardScaler(),
                                    ['host_is_superhost_t',
                                     'host_response_time_within an hour',
                                     'host_identity_verified_t',
                                     'host_response_time_within a day',
                                     'instant_bookable_t',
                                     'cancellation_policy_flexible',
                                     'room_type_Private room',


In [12]:
parameters = {
        'preprocess__pipeline__vect__ngram_range': ((1, 1), (1, 2)),
        'preprocess__pipeline__vect__max_df': (0.5, 0.75, 1.0),
        'preprocess__pipeline__vect__max_features': (None, 5000, 10000),
        'preprocess__pipeline__tfidf__use_idf': (True, False),
        'reg': [ElasticNet(), DecisionTreeRegressor()]
    }

cv = GridSearchCV(pipeline, param_grid=parameters, verbose=20)

In [13]:
cv.fit(X_train, y_train)

Fitting 5 folds for each of 72 candidates, totalling 360 fits
[CV 1/5; 1/72] START preprocess__pipeline__tfidf__use_idf=True, preprocess__pipeline__vect__max_df=0.5, preprocess__pipeline__vect__max_features=None, preprocess__pipeline__vect__ngram_range=(1, 1), reg=ElasticNet()
[CV 1/5; 1/72] END preprocess__pipeline__tfidf__use_idf=True, preprocess__pipeline__vect__max_df=0.5, preprocess__pipeline__vect__max_features=None, preprocess__pipeline__vect__ngram_range=(1, 1), reg=ElasticNet();, score=0.045 total time=   3.6s
[CV 2/5; 1/72] START preprocess__pipeline__tfidf__use_idf=True, preprocess__pipeline__vect__max_df=0.5, preprocess__pipeline__vect__max_features=None, preprocess__pipeline__vect__ngram_range=(1, 1), reg=ElasticNet()
[CV 2/5; 1/72] END preprocess__pipeline__tfidf__use_idf=True, preprocess__pipeline__vect__max_df=0.5, preprocess__pipeline__vect__max_features=None, preprocess__pipeline__vect__ngram_range=(1, 1), reg=ElasticNet();, score=0.035 total time=   3.5s
[CV 3/5; 1/7

GridSearchCV(estimator=Pipeline(steps=[('preprocess',
                                        ColumnTransformer(transformers=[('pipeline',
                                                                         Pipeline(steps=[('vect',
                                                                                          CountVectorizer(tokenizer=<function tokenize at 0x000001FE95DD5280>)),
                                                                                         ('tfidf',
                                                                                          TfidfTransformer())]),
                                                                         'description'),
                                                                        ('standardscaler',
                                                                         StandardScaler(),
                                                                         ['host_is_superhost_t',
                      

In [14]:
pickle.dump(cv, open('../models/airbnb_gridsearch.pkl', 'wb'))

### Test the model

In [17]:
cv = pickle.load(open('../models/airbnb_gridsearch.pkl', 'rb'))

In [18]:
y_pred_cv = cv.predict(X_test)

In [21]:
r2 = r2_score(y_test, y_pred_cv)
print("The R2 score of the model is {}".format(r2))

The R2 score of the model is 0.41561957028665497


In [20]:
mse = mean_squared_error(y_test, y_pred_cv)
print("The mean squared error of the model is {}".format(mse))

The mean squared error of the model is 42.73268765919611
