# Capstone Workbook 5: Advanced Modelling

In [1]:
# Import libraries
import pandas as pd
import numpy as np

import matplotlib.pyplot as plt
import seaborn as sns

from scipy import stats
import statsmodels.api as sm

In [51]:
# Import data 
airbnb_ldn = pd.read_csv('airbnb_ldn_pp.csv')

In [52]:
# drop 'Unnamed: 0'
airbnb_ldn = airbnb_ldn.drop(columns = 'Unnamed: 0')

Split the columns to view just those with the object datatype, keeping the target column:

In [53]:
X_obj = airbnb_ldn.select_dtypes(include='object')
X_obj = pd.concat([X_obj, airbnb_ldn['Annual Revenue LTM (Native)']], axis=1)

In [54]:
X_obj

Unnamed: 0,Listing Title,Property Type,City,Zipcode,Check-in Time,Amenities,guest_controls,Annual Revenue LTM (Native)
0,Cozy 2BR house with a garden view,Entire home,Greater London,SW15 3,12:00 PM - 12:00 AM,"['Free parking on premises', 'Air conditioning...","{""allows_children"": true, ""allows_infants"": tr...",1619.5
1,GuestReady - Amazing home with a private garden,Entire home,Greater London,SW15 3,3:00 PM - 12:00 AM,"['Wifi', 'Kitchen', 'Dryer', 'Dedicated worksp...","{""allows_children"": true, ""allows_infants"": tr...",16737.7
2,Cosy cottage on Richmond Park,Entire home,Greater London,SW15 3,After 3:00 PM,"['Free parking on premises', 'Air conditioning...","{""allows_children"": false, ""allows_infants"": f...",345.2
3,"Entire Flat. Free parking, Garden , Richmond park",Entire rental unit,Greater London,SW15 3,3:00 PM - 11:00 PM,"['Free parking on premises', 'Wifi', 'Kitchen'...","{""allows_children"": true, ""allows_infants"": tr...",12853.3
4,Maisonette inbetween Richmond Park and Wimbledon,Private room in rental unit,Greater London,SW15 3,12:00 PM - 10:00 PM,"['Free parking on premises', 'Wifi', 'Breakfas...","{""allows_children"": false, ""allows_infants"": f...",267.8
...,...,...,...,...,...,...,...,...
32673,Service Apartment- London Thamesmead,Entire condo,Greater London,SE28 8,3:00 PM - 9:00 PM,"['Free parking on premises', 'Wifi', 'Kitchen'...","{""allows_children"": true, ""allows_infants"": tr...",5371.2
32674,Large Double Room with Free Parking and Garden,Private room in home,Greater London,SE28 8,2:00 PM - 4:00 PM,"['Free parking on premises', 'Wifi', 'Kitchen'...","{""allows_children"": true, ""allows_infants"": tr...",189.6
32675,Forest view room in welcoming home,Private room in home,Greater London,E4 6,,"['Wifi', 'Kitchen', 'Hair dryer', 'TV', 'Iron'...","{""allows_children"": true, ""allows_infants"": fa...",6335.8
32676,Spacious Double Room with En-suite bathroom,Private room in home,London,E4 6,3:00 PM - 10:00 PM,"['Wifi', 'Kitchen', 'Hair dryer', 'TV', 'Iron'...","{""allows_children"": false, ""allows_infants"": f...",4264.4


In [55]:
# confirm datatype of remaining columns:
X_obj.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 32678 entries, 0 to 32677
Data columns (total 8 columns):
 #   Column                       Non-Null Count  Dtype  
---  ------                       --------------  -----  
 0   Listing Title                32674 non-null  object 
 1   Property Type                32678 non-null  object 
 2   City                         32678 non-null  object 
 3   Zipcode                      32678 non-null  object 
 4   Check-in Time                30921 non-null  object 
 5   Amenities                    32678 non-null  object 
 6   guest_controls               32678 non-null  object 
 7   Annual Revenue LTM (Native)  32678 non-null  float64
dtypes: float64(1), object(7)
memory usage: 2.0+ MB


Now the object datatype columns have been isolated with the target column, the data will be checked:

In [56]:
# check for nulls
X_obj.isnull().sum()

Listing Title                     4
Property Type                     0
City                              0
Zipcode                           0
Check-in Time                  1757
Amenities                         0
guest_controls                    0
Annual Revenue LTM (Native)       0
dtype: int64

The 'Check-in Time' column has many nulls. This column isn't essential for the model, so it can be dropped for the time being. 
The 'Listing Title' column has 4 null values. This is insignificant compared to the entire dataset size, so these rows can be dropped.

In [57]:
# dropping Check-in time column
X_obj.drop(columns='Check-in Time', inplace=True)

In [58]:
# dropping the Listing Title null rows:
X_obj.dropna(inplace=True)

In [59]:
# sanity check
X_obj.isnull().sum()

Listing Title                  0
Property Type                  0
City                           0
Zipcode                        0
Amenities                      0
guest_controls                 0
Annual Revenue LTM (Native)    0
dtype: int64

As seen, there are no null values remaining.

The independent and dependent variables can now be split.

In [60]:
X = X_obj.drop(columns='Annual Revenue LTM (Native)')

In [61]:
y = X_obj['Annual Revenue LTM (Native)']

### Complete a Train-Test Split for the NLP to Take Place

In [36]:
# required imports 
from sklearn.model_selection import train_test_split

In [62]:
# split independent and target data between train and test sets:
X_train, X_test, y_train, y_test = train_test_split(X_obj, y, test_size = 0.25, random_state=42)

In [63]:
# reset index for train test split data:
X_train.reset_index(inplace=True, drop=True)
X_test.reset_index(inplace=True, drop=True)
y_train.reset_index(inplace=True, drop=True)
y_test.reset_index(inplace=True, drop=True)

## Starting With Listing Title

In [64]:
lt_array = X_train['Listing Title']

In [65]:
lt_array

0                   Amazing 2 bedroom apartment in Chelsea
1                        Double Room for two in Kensington
2                          Huge double room in Hammersmith
3           Superbly located 2 bed flat with garden office
4        Amazing Entire Flat in Heart of London/ Piccad...
                               ...                        
24500    Penthouse Room with Views over Park, near Station
24501    Newly Refurbished HotelStyle Apt - Central London
24502       Lovely central private bedroom and shower room
24503         Large Studio - w/Sofabed - Chelsea SW3 - NGH
24504                     Lovely 1bedroom-Baker St Station
Name: Listing Title, Length: 24505, dtype: object

In [66]:
from sklearn.feature_extraction.text import CountVectorizer

In [69]:
# .1 Instantiate transformer object
bagofwords = CountVectorizer(min_df = 1000,
                             ngram_range = (1, 4))

# 2. Fit
bagofwords.fit(lt_array)

# 3. Transform
amenities_transformed = bagofwords.transform(lt_array)
amenities_transformed

<24505x34 sparse matrix of type '<class 'numpy.int64'>'
	with 76593 stored elements in Compressed Sparse Row format>

In [73]:
# View the words that are occuring in the Listing Title:
bagofwords.vocabulary_

{'bedroom': 5,
 'apartment': 1,
 'in': 18,
 'apartment in': 2,
 'double': 11,
 'room': 27,
 'double room': 12,
 'room in': 28,
 'bed': 4,
 'flat': 13,
 'with': 33,
 'garden': 15,
 'of': 24,
 'london': 19,
 'flat in': 14,
 'the': 31,
 'studio': 30,
 'and': 0,
 'to': 32,
 'beautiful': 3,
 'house': 17,
 'home': 16,
 'central': 8,
 'central london': 9,
 'bright': 7,
 'spacious': 29,
 'modern': 22,
 'lovely': 20,
 'cosy': 10,
 'near': 23,
 'private': 26,
 'park': 25,
 'luxury': 21,
 'bedroom flat': 6}

In [16]:
# looking at individual text items from the 'Listing Title' column:
([x.split(':')[0] for x in X_obj['Listing Title'][0][1:].split(', ')])

['ozy 2BR house with a garden view']

In [10]:
# determining the number of individual features in the 'guest controls' column:
len([x.split(':')[0] for x in airbnb_obj['guest_controls'][0][1:].split(', ')])

40

In [35]:
# looking at individual text items from the 'guest_controls' column:
([x.split(':')[0] for x in airbnb_obj['guest_controls'][0][1:].split(', ')])

['"allows_children"',
 '"allows_infants"',
 '"allows_pets"',
 '"allows_smoking"',
 '"allows_events"',
 '"id"',
 '"host_check_in_time_message"',
 '"localized_structured_house_rules_with_tips"',
 '"p3_structured_house_rules"',
 '"No pets"',
 '"No parties or events"',
 '"Self check-in with lockbox"]',
 '"structured_house_rules"',
 '"No pets"',
 '"No parties or events"]',
 '"structured_house_rules_with_tips"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '{"key"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '{"key"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '{"key"',
 '"long_term_text"',
 '"text"',
 '"tip"',
 '"details"',
 '"airmoji_key"',
 '"allows_non_china_users"']

{'bedroom': 5,
 'apartment': 1,
 'in': 18,
 'apartment in': 2,
 'double': 11,
 'room': 27,
 'double room': 12,
 'room in': 28,
 'bed': 4,
 'flat': 13,
 'with': 33,
 'garden': 15,
 'of': 24,
 'london': 19,
 'flat in': 14,
 'the': 31,
 'studio': 30,
 'and': 0,
 'to': 32,
 'beautiful': 3,
 'house': 17,
 'home': 16,
 'central': 8,
 'central london': 9,
 'bright': 7,
 'spacious': 29,
 'modern': 22,
 'lovely': 20,
 'cosy': 10,
 'near': 23,
 'private': 26,
 'park': 25,
 'luxury': 21,
 'bedroom flat': 6}

## Looking at 'Amenities' column:

In [24]:
airbnb_obj['Amenities']

NameError: name 'airbnb_obj' is not defined

In [40]:
airbnb_obj['Amenities'].str.split(', ')

0        [['Free parking on premises', 'Air conditionin...
1        [['Wifi', 'Kitchen', 'Dryer', 'Dedicated works...
2        [['Free parking on premises', 'Air conditionin...
3        [['Free parking on premises', 'Wifi', 'Kitchen...
4        [['Free parking on premises', 'Wifi', 'Breakfa...
                               ...                        
32673    [['Free parking on premises', 'Wifi', 'Kitchen...
32674    [['Free parking on premises', 'Wifi', 'Kitchen...
32675    [['Wifi', 'Kitchen', 'Hair dryer', 'TV', 'Iron...
32676    [['Wifi', 'Kitchen', 'Hair dryer', 'TV', 'Iron...
32677    [['Free parking on premises', 'Wifi', 'Kitchen...
Name: Amenities, Length: 32678, dtype: object

In [26]:
# .1 Instantiate transformer object
bagofwords = CountVectorizer(min_df = 1000,
                             ngram_range = (1, 4))

# 2. Fit
bagofwords.fit(X_train['Amenities'])

# 3. Transform
amenities_transformed = bagofwords.transform(X_train['Amenities'])
amenities_transformed

<24508x852 sparse matrix of type '<class 'numpy.int64'>'
	with 3974016 stored elements in Compressed Sparse Row format>

Looking at the features:

In [77]:
bagofwords.vocabulary_

{'free': 417,
 'parking': 710,
 'on': 663,
 'premises': 744,
 'air': 8,
 'conditioning': 238,
 'wifi': 986,
 'kitchen': 554,
 'indoor': 517,
 'fireplace': 403,
 'cable': 177,
 'tv': 939,
 'dryer': 311,
 'dedicated': 267,
 'workspace': 1004,
 'hair': 461,
 'shampoo': 799,
 'iron': 520,
 'hangers': 469,
 'washer': 953,
 'heating': 476,
 'essentials': 343,
 'bathtub': 123,
 'body': 166,
 'soap': 847,
 'cleaning': 196,
 'products': 761,
 'conditioner': 228,
 'hot': 501,
 'water': 963,
 'shower': 809,
 'gel': 438,
 'bed': 140,
 'linens': 585,
 'clothing': 209,
 'storage': 873,
 'drying': 327,
 'rack': 773,
 'for': 411,
 'extra': 378,
 'pillows': 727,
 'and': 56,
 'blankets': 153,
 'room': 789,
 'darkening': 263,
 'shades': 794,
 'safe': 793,
 'sound': 855,
 'system': 902,
 'babysitter': 85,
 'recommendations': 778,
 'board': 164,
 'games': 437,
 'children': 191,
 'books': 172,
 'toys': 932,
 'dinnerware': 284,
 'crib': 254,
 'portable': 738,
 'fans': 382,
 'carbon': 182,
 'monoxide': 643,
 

In [78]:
len(bagofwords.vocabulary_)

1014