In [49]:
# Imports, as always...
import pandas as pd
import numpy as np
import re
from textblob import TextBlob
from datetime import datetime as dt
import datetime
import matplotlib.pyplot as plt
import seaborn as sns

In [50]:
# Import custom scripts.
import success_metric

## Import the Data

In [51]:
# Get the raw textual data for the listings (include ID)...
raw_data = pd.read_csv('datasets/listings_edinburgh.csv')
textual_data = raw_data[['id', 'description', 'neighborhood_overview', 'host_about']]
textual_data.sample(5)

Unnamed: 0,id,description,neighborhood_overview,host_about
4623,50780406,Looking for a place to stay in Edinburgh? Then...,"In the elegant New Town, Georgian townhouses l...",
668,7651238,"Bright one-bedroom, 2nd floor flat located in ...",A bustling centre with a glorious network of G...,"I am from Norway but live in Edinburgh, a city..."
3387,35907844,Incredibly located lower ground flat in a UNES...,There is a great gym about 10 minutes walk fro...,
298,3914042,Great location quiet but still handy for lovel...,"A vibrant area, bars cafe, restaurant, superma...",
584,7222993,"Ideal location for city breaks, en-route to th...",Our house is in a quiet residential area in no...,I am semi-retired and have a bit of time to we...


In [53]:
# 'Fix' the raw text.
textual_data.description = textual_data.description.apply(success_metric.fix_string)
textual_data.neighborhood_overview = textual_data.neighborhood_overview	.apply(success_metric.fix_string)
textual_data.host_about = textual_data.host_about.apply(success_metric.fix_string)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textual_data.description = textual_data.description.apply(success_metric.fix_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textual_data.neighborhood_overview = textual_data.neighborhood_overview	.apply(success_metric.fix_string)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  textual_data.h

## NLP Tasks

In [54]:
# Conduct sentiment analysis for each source of textual data. Ignore NaN data...
# Conduct feature extraction for each source of textual data. Ignore NaN data...

# Data structures for storing sentiment anaylses output.
description_polarity = []
description_subjectivity = []
description_keywords = []

neighborhood_overview_polarity = []
neighborhood_overview_subjectivity = []
neighborhood_overview_keywords = []

host_about_polarity = []
host_about_subjectivity = []
host_about_keywords = []

for i in range(len(textual_data)):
    listing = textual_data.iloc[i]

    # Analysis on description.
    if type(listing['description']) == str:
        analysis = TextBlob(listing['description'])
        description_polarity.append(analysis.sentiment.polarity)
        description_subjectivity.append(analysis.sentiment.subjectivity)
        description_keywords.append(list(map(lambda x : x[0][0], analysis.sentiment_assessments.assessments)))
    else:
        description_polarity.append(np.NaN)
        description_subjectivity.append(np.NaN)
        description_keywords.append(np.NaN)

    # Analysis on neighborhood_overview.
    if type(listing['neighborhood_overview']) == str:
        analysis = TextBlob(listing['neighborhood_overview'])
        neighborhood_overview_polarity.append(analysis.sentiment.polarity)
        neighborhood_overview_subjectivity.append(analysis.sentiment.subjectivity)
        neighborhood_overview_keywords.append(list(map(lambda x : x[0][0], analysis.sentiment_assessments.assessments)))
    else:
        neighborhood_overview_polarity.append(np.NaN)
        neighborhood_overview_subjectivity.append(np.NaN)
        neighborhood_overview_keywords.append(np.NaN)

    # Analysis on host_about.
    if type(listing['host_about']) == str:
        analysis = TextBlob(listing['host_about'])
        host_about_polarity.append(analysis.sentiment.polarity)
        host_about_subjectivity.append(analysis.sentiment.subjectivity)
        host_about_keywords.append(list(map(lambda x : x[0][0], analysis.sentiment_assessments.assessments)))
    else:
        host_about_polarity.append(np.NaN)
        host_about_subjectivity.append(np.NaN)
        host_about_keywords.append(np.NaN)

nlp_data = pd.DataFrame({
    'id' : textual_data.id,
    'description_polarity' : description_polarity,
    'description_subjectivity' : description_subjectivity,
    'description_keywords' : description_keywords,
    'neighborhood_overview_polarity' : neighborhood_overview_polarity,
    'neighborhood_overview_subjectivity' : neighborhood_overview_subjectivity,
    'neighborhood_overview_keywords' : neighborhood_overview_keywords,
    'host_about_polarity' : host_about_polarity,
    'host_about_subjectivity' : host_about_subjectivity,
    'host_about_keywords' : host_about_keywords,
})

nlp_data.sample(5)

Unnamed: 0,id,description_polarity,description_subjectivity,description_keywords,neighborhood_overview_polarity,neighborhood_overview_subjectivity,neighborhood_overview_keywords,host_about_polarity,host_about_subjectivity,host_about_keywords
2593,27153709,0.187994,0.404545,"[recently, private, free, better, long, short,...",0.369792,0.521528,"[lovely, quiet, safe, local, flat, very, many,...",0.0,0.0,[]
934,12237129,0.268831,0.532257,"[beautiful, first, very, large, newly, single,...",0.180556,0.488889,"[full, independent, able, easily, open, green]",0.5,0.6,[love]
325,4215098,0.294444,0.472222,"[super, double, suitable]",,,,,,
3361,35726418,0.16553,0.46947,"[fabulous, top, historic, old, direct, free, o...",0.300926,0.581481,"[real, most, exciting, fine, inventive, creati...",0.253571,0.532143,"[love, small, free, love, love, different, fut..."
6151,664941084724346088,0.384964,0.618824,"[charming, top, quiet, elegant, new, perfect, ...",0.139141,0.325379,"[short, many, less, iconic, historic, old, qui...",0.414474,0.543421,"[love, wonderful, enjoy, great, vibrant, histo..."


In [55]:
# Compute success scores for the data and add them to the dataframe.
success_scores = success_metric.compute_scores('datasets/listings_edinburgh.csv', 'datasets/reviews_edinburgh.csv')
nlp_data = nlp_data.merge(success_scores, on='id')

Unnamed: 0,id,description_polarity,description_subjectivity,description_keywords,neighborhood_overview_polarity,neighborhood_overview_subjectivity,neighborhood_overview_keywords,host_about_polarity,host_about_subjectivity,host_about_keywords,log_price,rental_probability,weighted_average_sentiment,success_score
0,15420,0.301429,0.592759,"[stunning, own, historic, completely, own, com...",0.289394,0.471591,"[historic, new, own, lovely, independent, more]",-0.050000,0.850000,[passionate],4.736198,0.833333,0.422155,1.666176
1,707097,,,,0.500000,0.650000,"[perfect, short]",0.700000,0.550000,"[best, comfortable]",6.393591,1.000000,0.231195,1.478163
2,728199,0.308384,0.471296,"[fantastic, main, flat, easy, right, great, pr...",0.415789,0.568421,"[great, excellent, italian, chinese, free, exc...",0.539323,0.625000,"[love, great, very, much, little, fitting, per...",4.094345,0.257230,0.374478,0.394395
3,732008,0.329877,0.625805,"[beautiful, third, unique, flat, bright, open,...",,,,0.000000,0.000000,[],5.087596,0.446347,0.434169,0.985926
4,744710,0.573485,0.767172,"[stunning, new, perfectly, most, popular, orig...",0.275000,0.466667,"[central, very, many]",0.422727,0.613258,"[own, recently, great, able, love, new]",4.927254,0.053272,0.419052,0.109996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6714,362653,0.046667,0.280000,"[quiet, much, hardly, free, double, toilet, ot...",,,,0.450000,0.601852,"[old, married, last, own, own, more, enjoy, ha...",4.553877,0.000000,0.331990,0.000000
6715,363642,0.300778,0.475556,"[lovely, double, available, available, very, d...",0.216111,0.544444,"[green, very, quick, reasonably, public, free]",0.342700,0.617080,"[hidden, really, local, own, own, filled, deli...",3.688879,0.000000,0.394620,0.000000
6716,376937,0.164586,0.418837,"[new, modern, old, central, fantastic, great, ...",,,,0.450000,0.601852,"[old, married, last, own, own, more, enjoy, ha...",4.442651,0.000000,0.341096,0.000000
6717,388297,0.253704,0.726157,"[beautiful, perfect, base, great, free, perfec...",,,,0.241604,0.464815,"[friendly, female, live, great, near, love, va...",5.442418,0.093607,0.474388,0.241677


In [101]:
# Score each extracted keyword keyword based on the success scores of their respective listings...

description_keyword_scores = {}
neighborhood_overview_keyword_scores = {}
host_about_keyword_scores = {}
for i in range(len(nlp_data)):
    entry = nlp_data.iloc[i]

    # Ignore NaN values...

    if type(entry.description_keywords) == list:
        for keyword in entry.description_keywords:
            if keyword in description_keyword_scores.keys():
                description_keyword_scores[keyword].append(entry.success_score)
            else:
                description_keyword_scores[keyword] = [entry.success_score]
    
    if type(entry.neighborhood_overview_keywords) == list:
        for keyword in entry.neighborhood_overview_keywords:
            if keyword in neighborhood_overview_keyword_scores.keys():
                neighborhood_overview_keyword_scores[keyword].append(entry.success_score)
            else:
                neighborhood_overview_keyword_scores[keyword] = [entry.success_score]

    if type(entry.host_about_keywords) == list:
        for keyword in entry.host_about_keywords:
            if keyword in host_about_keyword_scores.keys():
                host_about_keyword_scores[keyword].append(entry.success_score)
            else:
                host_about_keyword_scores[keyword] = [entry.success_score]
    

for keyword in description_keyword_scores.keys():
    description_keyword_scores[keyword] = np.mean(description_keyword_scores[keyword])

for keyword in neighborhood_overview_keyword_scores.keys():
    neighborhood_overview_keyword_scores[keyword] = np.mean(neighborhood_overview_keyword_scores[keyword])

for keyword in host_about_keyword_scores.keys():
    host_about_keyword_scores[keyword] = np.mean(host_about_keyword_scores[keyword])

In [100]:
sorted(description_keyword_scores.items(), key=lambda x:x[1], reverse=True)

[(':[', 2.081363056579589),
 ('strongly', 2.063118596744704),
 ('modestly', 2.0313896043106916),
 ('loyal', 1.8668490441458385),
 ('guarded', 1.8668490441458385),
 ('imaginatively', 1.8592079434210065),
 ('creatively', 1.8592079434210065),
 ('becoming', 1.8023138107539058),
 ('visually', 1.6880265321958203),
 ('subsequently', 1.6451481100253913),
 ('respectively', 1.6432610011750708),
 ('thrilled', 1.4248822934526835),
 ('advanced', 1.4195429784789555),
 ('ridiculously', 1.3901288663562947),
 ('rank', 1.359127470490676),
 ('respectable', 1.2761174714053014),
 ('wow', 1.26586943846301),
 ('impressed', 1.26586943846301),
 ('challenging', 1.2554254514845264),
 ('flawless', 1.2153423727455288),
 ('seamlessly', 1.1072624156670994),
 ('fortunate', 1.0559264742909868),
 ('worthy', 1.0430072667952845),
 ('bored', 1.0401622912016968),
 ('fabulously', 1.0309486820232832),
 ('sensational', 1.0010938352763286),
 ('drunk', 0.9955590340334246),
 ('annoying', 0.9891492777930826),
 ('worthwhile', 0.97

In [102]:
sorted(neighborhood_overview_keyword_scores.items(), key=lambda x:x[1], reverse=True)

[('ultimate', 2.0835828184902874),
 ('yarn', 1.7354610239998607),
 ('reminiscent', 1.3871803584044815),
 ('firm', 1.3727522352891368),
 ('loyal', 1.3660753957681675),
 ('guarded', 1.3660753957681675),
 ('ordinary', 1.3488448832390425),
 ('bloody', 1.2917267790286155),
 ('solid', 1.2893730254436835),
 ('gruesome', 1.2880179394872515),
 ('colorful', 1.2168406549468391),
 ('potentially', 1.2167887773454629),
 ('joy', 1.1670665288114435),
 ('priceless', 1.1559850106389686),
 ('yellow', 1.1534404685091024),
 ('popularly', 1.1456190534449917),
 ('earlier', 1.102585191062363),
 ('appreciated', 1.0822528607597481),
 ('relative', 1.0714786702562982),
 ('pleasantly', 1.063898301739056),
 ('effectively', 1.0638539538066027),
 ('partial', 1.0622938120341034),
 ('secondhand', 1.0398449445602969),
 ('political', 1.0229635088763103),
 ('dull', 1.0229635088763103),
 ('exhausting', 1.020716640891371),
 ('alive', 0.996096813761544),
 ('magic', 0.9904714453454),
 ('crafty', 0.9897110017224233),
 ('happil

In [103]:
sorted(host_about_keyword_scores.items(), key=lambda x:x[1], reverse=True)

[('badly', 1.6284889552510666),
 ('bass', 1.6284889552510666),
 ('lasting', 1.4865390177193902),
 ('linguistic', 1.464957645592232),
 ('monkey', 1.3074586243972521),
 ('wow', 1.2909412940804548),
 ('abundant', 1.2759398052915598),
 ('competently', 1.266269725915537),
 ('objective', 1.233829045805819),
 ('massive', 1.223547544221395),
 ('uncommon', 1.2177103036672596),
 ('extraordinary', 1.2140392241940337),
 ('larger', 1.1840287520329116),
 ('initial', 1.1840287520329116),
 ('primarily', 1.1829729240567264),
 ('drag', 1.1407215816527767),
 ('silly', 1.1396333219512134),
 ('3rd', 1.0626363930158258),
 (':o)', 1.0237874004428666),
 ('briefly', 1.0229635088763103),
 ('exact', 1.0229635088763103),
 ('brutally', 1.0229635088763103),
 ('toilet', 1.022863855922232),
 ('handy', 1.0091351729921514),
 ('talented', 0.9930669412892107),
 ('visual', 0.9833191673417941),
 ('norwegian', 0.9661384608713105),
 ('dark', 0.9614468951095049),
 ('liked', 0.956466044097154),
 ('8)', 0.9212698950031677),
 ('