# Capstone Workbook 5: Advanced Modelling NLP

In [1]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm

In [2]:
# Import data 
airbnb_ldn = pd.read_csv('airbnb_ldn_pp.csv')

In [3]:
# drop 'Unnamed: 0'
airbnb_ldn = airbnb_ldn.drop(columns = 'Unnamed: 0')

Split the columns to view just those with the object datatype, keeping the target column:

In [4]:
X_obj = airbnb_ldn.select_dtypes(include='object')
X_obj = pd.concat([X_obj, airbnb_ldn['Annual Revenue LTM (Native)']], axis=1)

In [5]:
X_obj

Unnamed: 0,Listing Title,Property Type,Zipcode,Check-in Time,Amenities,guest_controls,Annual Revenue LTM (Native)
0,Cozy 2BR house with a garden view,Entire home,SW15 3,12:00 PM - 12:00 AM,"['Free parking on premises', 'Air conditioning...","{""allows_children"": true, ""allows_infants"": tr...",1619.5
1,GuestReady - Amazing home with a private garden,Entire home,SW15 3,3:00 PM - 12:00 AM,"['Wifi', 'Kitchen', 'Dryer', 'Dedicated worksp...","{""allows_children"": true, ""allows_infants"": tr...",16737.7
2,Cosy cottage on Richmond Park,Entire home,SW15 3,After 3:00 PM,"['Free parking on premises', 'Air conditioning...","{""allows_children"": false, ""allows_infants"": f...",345.2
3,"Entire Flat. Free parking, Garden , Richmond park",Entire rental unit,SW15 3,3:00 PM - 11:00 PM,"['Free parking on premises', 'Wifi', 'Kitchen'...","{""allows_children"": true, ""allows_infants"": tr...",12853.3
4,Maisonette inbetween Richmond Park and Wimbledon,Private room in rental unit,SW15 3,12:00 PM - 10:00 PM,"['Free parking on premises', 'Wifi', 'Breakfas...","{""allows_children"": false, ""allows_infants"": f...",267.8
...,...,...,...,...,...,...,...
32669,Service Apartment- London Thamesmead,Entire condo,SE28 8,3:00 PM - 9:00 PM,"['Free parking on premises', 'Wifi', 'Kitchen'...","{""allows_children"": true, ""allows_infants"": tr...",5371.2
32670,Large Double Room with Free Parking and Garden,Private room in home,SE28 8,2:00 PM - 4:00 PM,"['Free parking on premises', 'Wifi', 'Kitchen'...","{""allows_children"": true, ""allows_infants"": tr...",189.6
32671,Forest view room in welcoming home,Private room in home,E4 6,,"['Wifi', 'Kitchen', 'Hair dryer', 'TV', 'Iron'...","{""allows_children"": true, ""allows_infants"": fa...",6335.8
32672,Spacious Double Room with En-suite bathroom,Private room in home,E4 6,3:00 PM - 10:00 PM,"['Wifi', 'Kitchen', 'Hair dryer', 'TV', 'Iron'...","{""allows_children"": false, ""allows_infants"": f...",4264.4


In [6]:
# confirm datatype of remaining columns:
X_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32674 entries, 0 to 32673
Data columns (total 7 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Listing Title                32674 non-null  object 
 1   Property Type                32674 non-null  object 
 2   Zipcode                      32674 non-null  object 
 3   Check-in Time                30917 non-null  object 
 4   Amenities                    32674 non-null  object 
 5   guest_controls               32674 non-null  object 
 6   Annual Revenue LTM (Native)  32674 non-null  float64
dtypes: float64(1), object(6)
memory usage: 1.7+ MB


Now the object datatype columns have been isolated with the target column, the data will be checked:

In [7]:
# check for nulls
X_obj.isnull().sum()

Listing Title                     0
Property Type                     0
Zipcode                           0
Check-in Time                  1757
Amenities                         0
guest_controls                    0
Annual Revenue LTM (Native)       0
dtype: int64

The 'Check-in Time' column has many nulls. This column isn't essential for the model, so it can be dropped for the time being. 
The 'Listing Title' column has 4 null values. This is insignificant compared to the entire dataset size, so these rows can be dropped.

In [8]:
# dropping Check-in time column
X_obj.drop(columns='Check-in Time', inplace=True)

In [9]:
# dropping the Listing Title null rows:
X_obj.dropna(inplace=True)

In [10]:
# sanity check
X_obj.isnull().sum()

Listing Title                  0
Property Type                  0
Zipcode                        0
Amenities                      0
guest_controls                 0
Annual Revenue LTM (Native)    0
dtype: int64

As seen, there are no null values remaining.

The independent and dependent variables can now be split.

In [11]:
X = X_obj.drop(columns='Annual Revenue LTM (Native)')

In [12]:
y = X_obj['Annual Revenue LTM (Native)']

## Starting With Listing Title

In [13]:
lt_array = X_obj['Listing Title']

In [14]:
lt_array

0                        Cozy 2BR house with a garden view
1          GuestReady - Amazing home with a private garden
2                            Cosy cottage on Richmond Park
3        Entire Flat. Free parking, Garden , Richmond park
4         Maisonette inbetween Richmond Park and Wimbledon
                               ...                        
32669                Service  Apartment- London Thamesmead
32670       Large Double Room with Free Parking and Garden
32671                   Forest view room in welcoming home
32672          Spacious Double Room with En-suite bathroom
32673    Large Room & Bathroom close to Forest and Station
Name: Listing Title, Length: 32674, dtype: object

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

In [16]:
# .1 Instantiate transformer object
bagofwords_lt = CountVectorizer(min_df = 50,
                                stop_words = "english",
                             ngram_range = (1, 4))

# 2. Fit
bagofwords_lt.fit(lt_array)

# 3. Transform
lt_transformed = bagofwords_lt.transform(lt_array)
lt_transformed

<32674x739 sparse matrix of type '<class 'numpy.int64'>'
	with 192522 stored elements in Compressed Sparse Row format>

In [17]:
# View the words that are occuring in the Listing Title:
bagofwords_lt.vocabulary_

{'cozy': 200,
 '2br': 16,
 'house': 353,
 'garden': 302,
 'view': 713,
 'house garden': 356,
 'garden view': 306,
 'guestready': 322,
 'amazing': 26,
 'home': 343,
 'private': 540,
 'private garden': 545,
 'cosy': 186,
 'cottage': 196,
 'park': 516,
 'entire': 245,
 'flat': 267,
 'free': 296,
 'parking': 517,
 'entire flat': 246,
 'flat free': 274,
 'free parking': 297,
 'maisonette': 441,
 'wimbledon': 734,
 'room': 575,
 'private room': 546,
 'friendly': 298,
 'clean': 169,
 'place': 531,
 'people': 525,
 'nice': 491,
 'quiet': 556,
 'double': 216,
 'single': 605,
 'rooms': 592,
 'near': 473,
 'piccadilly': 529,
 'line': 393,
 'house near': 358,
 'lovely': 416,
 'lovely room': 427,
 'beautiful': 64,
 'views': 714,
 'stay': 648,
 'family': 255,
 'east': 234,
 'guests': 323,
 'family home': 257,
 'home garden': 346,
 'large': 379,
 'large room': 385,
 'stunning': 661,
 'london': 403,
 'london home': 411,
 'bedroom': 83,
 'bathroom': 56,
 'private double': 543,
 'double bedroom': 218,
 

In [18]:
# see the quantity of tokens:
len(bagofwords_lt.vocabulary_)

739

In [19]:
listing_title_cv = pd.DataFrame(columns=bagofwords_lt.get_feature_names(), data=lt_transformed.toarray())

In [20]:
# Viewing the listing title dataframe:
display(listing_title_cv)

Unnamed: 0,10,10 mins,15,1bd,1bd flat,1bed,1br,1br flat,1st,20,...,westminster,wharf,wi,wi fi,wifi,wimbledon,wimbledon tennis,wonderful,wood,zone
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,1,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


The same Listing title will now be split using TF-IDF:

In [21]:
# required import
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction import text

In [22]:
# using our custom tokenizer in TfidfVectorizer
tfidf = TfidfVectorizer(min_df=50,
                        ngram_range = (1,3),
                        stop_words = "english")
tfidf.fit(lt_array)

lt_tfidf_transformed = tfidf.transform(lt_array)
listing_title_tfidf = pd.DataFrame(columns=tfidf.get_feature_names(), data=lt_tfidf_transformed.toarray())

In [23]:
display(listing_title_tfidf)

Unnamed: 0,10,10 mins,15,1bd,1bd flat,1bed,1br,1br flat,1st,20,...,westminster,wharf,wi,wi fi,wifi,wimbledon,wimbledon tennis,wonderful,wood,zone
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.585908,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32669,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
32670,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
32671,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0
32672,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,0.0,0.0


In [24]:
listing_title_tfidf.max()

10                  1.000000
10 mins             0.543250
15                  0.743492
1bd                 0.748726
1bd flat            0.713357
                      ...   
wimbledon           1.000000
wimbledon tennis    0.633090
wonderful           0.898538
wood                0.942951
zone                1.000000
Length: 737, dtype: float64

**Export the two process 'Listing title' columns to dataframe:**

In [25]:
# for the listing title count vectorized column
listing_title_cv.to_csv('lt_cv.csv', index=False)

In [26]:
# for the TF-IDF column:
listing_title_tfidf.to_csv('lt_tfidf.csv', index=False)

## Postcode Column

The Zipcode column will be split, so that just the first portion is retained (the district code). This will reduce the number of distinct values within the postcode column. This will group the properties better, giving a clearer idea of which postcode locations are more predictive of a higher annual revenue:

In [27]:
# splitting the zipcode column, to return the 
X_obj['Zipcode'].str.split().str[0]

0        SW15
1        SW15
2        SW15
3        SW15
4        SW15
         ... 
32669    SE28
32670    SE28
32671      E4
32672      E4
32673      E4
Name: Zipcode, Length: 32674, dtype: object

The column within the dataframe will be redefined, to just contain the district code:

In [28]:
X_obj['Zipcode'] = X_obj['Zipcode'].str.split().str[0]

In [29]:
zc_array = X_obj['Zipcode']

This column can now be be processed. As the postcode values do not have any meaning other than to indicate the area they are assigned to, the quantity of their occurance is the important element within this context. Meaning count vectorizing this column will be sufficient 

In [30]:
# .1 Instantiate transformer object
bagofwords_zc = CountVectorizer()

# 2. Fit
bagofwords_zc.fit(zc_array)

# 3. Transform
zc_transformed = bagofwords_zc.transform(zc_array)
zc_transformed

<32674x180 sparse matrix of type '<class 'numpy.int64'>'
	with 32674 stored elements in Compressed Sparse Row format>

In [31]:
zipcode_cv = pd.DataFrame(columns=(bagofwords_zc).get_feature_names(), data=zc_transformed.toarray())

In [32]:
display(zipcode_cv)

Unnamed: 0,e1,e10,e11,e12,e13,e14,e15,e16,e17,e18,...,wc1r,wc1v,wc1x,wc2a,wc2b,wc2e,wc2h,wc2n,wc2r,wd23
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
32669,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32670,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32671,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
32672,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Additional Stuff - To come to later 

In [33]:
# looking at individual text items from the 'Listing Title' column:
([x.split(':')[0] for x in X_obj['Listing Title'][0][1:].split(', ')])

['ozy 2BR house with a garden view']

In [34]:
# determining the number of individual features in the 'guest controls' column:
len([x.split(':')[0] for x in X_obj['guest_controls'][0][1:].split(', ')])

40

In [35]:
# looking at individual text items from the 'guest_controls' column:
([x.split(':')[0] for x in X_obj['guest_controls'][0][1:].split(', ')])

['"allows_children"',
 '"allows_infants"',
 '"allows_pets"',
 '"allows_smoking"',
 '"allows_events"',
 '"id"',
 '"host_check_in_time_message"',
 '"localized_structured_house_rules_with_tips"',
 '"p3_structured_house_rules"',
 '"No pets"',
 '"No parties or events"',
 '"Self check-in with lockbox"]',
 '"structured_house_rules"',
 '"No pets"',
 '"No parties or events"]',
 '"structured_house_rules_with_tips"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '{"key"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '{"key"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '{"key"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '"allows_non_china_users"']

## Looking at 'Amenities' column:

In [36]:
airbnb_obj['Amenities']

NameError: name 'airbnb_obj' is not defined

In [None]:
airbnb_obj['Amenities'].str.split(', ')

In [None]:
# .1 Instantiate transformer object
bagofwords = CountVectorizer(min_df = 1000,
                             ngram_range = (1, 4))

# 2. Fit
bagofwords.fit(X_train['Amenities'])

# 3. Transform
amenities_transformed = bagofwords.transform(X_train['Amenities'])
amenities_transformed

Looking at the features:

In [None]:
bagofwords.vocabulary_

In [None]:
len(bagofwords.vocabulary_)