In [2]:
import spacy
from spacytextblob.spacytextblob import SpacyTextBlob

nlp = spacy.load('en_core_web_sm')
nlp.add_pipe('spacytextblob')
text = "Terry's Hotel Alterntv in Boston was a perfect place to stay for myself and my partner.  We mixed our trip with business and pleasure and found the room perfectly appointed for our needs and affordable.  A great stay!"
doc = nlp(text)
print(doc._.polarity)
print(doc._.subjectivity)
print(doc._.assessments)

0.75
0.75
[(['perfect'], 1.0, 1.0, None), (['mixed'], 0.0, 0.25, None), (['perfectly'], 1.0, 1.0, None), (['great', '!'], 1.0, 0.75, None)]


In [None]:
doc._.polarity      # Polarity: -0.125
doc._.subjectivity  # Sujectivity: 0.9
doc._.assessments   # Assessments: [(['really', 'horrible'], -1.0, 1.0, None), (['worst', '!'], -1.0, 1.0, None), (['really', 'good'], 0.7, 0.6000000000000001, None), (['happy'], 0.8, 1.0, None)]


In [4]:
# import reviews_clone.csv
import pandas as pd
df = pd.read_csv('reviews_clone.csv')
df.head()


Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,polarity,subjectivity,assessments
0,5506,1021,2009-03-21,8903,Jenny,Terry's Hotel Alterntv in Boston was a perfect...,0.75,0.75,"[(['perfect'], 1.0, 1.0, None), (['mixed'], 0...."
1,5506,1953,2009-05-01,12970,Clint,Warm and accommodating host. Beautiful and wel...,0.725,0.8,"[(['warm'], 0.6, 0.6, None), (['beautiful'], 0..."
2,5506,10089,2009-09-14,14033,Nicolas,We stay 3 nights at the Hotel Alternative and ...,0.606,0.79,"[(['complete'], 0.1, 0.4, None), (['sweet'], 0..."
3,5506,18678,2009-11-29,51993,Sarah,We stayed in the private room. Terry provided...,0.12,0.485714,"[(['private'], 0.0, 0.375, None), (['soft'], 0..."
4,5506,24467,2010-01-19,68674,LaJuan,Stayed here recently during a conference and t...,0.316389,0.4475,"[(['recently'], 0.0, 0.25, None), (['fantastic..."


In [None]:
# add polarity and subjectivity to reviews
df['polarity'] = df['comments'].apply(lambda x: nlp(str(x))._.polarity)
df['subjectivity'] = df['comments'].apply(lambda x: nlp(str(x))._.subjectivity)
df['assessments'] = df['comments'].apply(lambda x: nlp(str(x))._.assessments)


In [None]:
# count number of reviews where polarity is positive 
df[df['polarity'] > 0.0].count()

# plot polarity as distribution
import matplotlib.pyplot as plt
plt.hist(df['polarity'], bins=20)
plt.show()

In [None]:
# export to csv
df.to_csv('reviews_clone_with_senti.csv', index=False)

In [None]:
# 　do TF-IDF over reviews
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer(min_df=0.01, max_df=0.9, ngram_range=(1,2))
x2 = tfidf.fit_transform(df['comments'].values.astype('U'))  ## Even astype(str) would work


In [None]:
# 　do LDA over reviews
from sklearn.decomposition import LatentDirichletAllocation
lda = LatentDirichletAllocation(n_components=10, learning_method='online', random_state=0)
x3 = lda.fit_transform(x2)

In [None]:
x3[1]

In [None]:
# put x3 into the dataframe
df['lda'] = x3

In [None]:
# remove assessments
df.drop(['assessments'], axis=1, inplace=True)

In [None]:
'''
- location , price, communication, value.... using cosine similariity 
- gender , male female, pos neg, review length, words most used
- 

'''

In [None]:
import os
import openai
import pandas as pd
from IPython.display import clear_output

# load csv into dataframe
df = pd.read_csv('reviews_clone_with_senti.csv')
# drop assessments
df.drop(['assessments'], axis=1, inplace=True)
df.head()

In [None]:

openai.api_key = 'xxx'

restart_sequence = "\n"

new_col = []
# iterate over comments
for i, name in enumerate(df['reviewer_name']):
    try:
        # strip spaces from name
        name = name.strip()
        # if name is empty, skip
        if name == "":
            new_col.append(name)
            continue
        clear_output(wait=True)
        
        response = openai.Completion.create(
            engine="ada",
            prompt=f"Claire:female\nJohn:male\nPeter:male\n{name}:",
            temperature=0,
            max_tokens=1,
            top_p=1,
            frequency_penalty=0,
            presence_penalty=0,
            stop=["\n"]
        )
        to_add=response.choices[0].text.strip()
        new_col.append(to_add)
        print(f"{i+1}/{len(df)}, {to_add}")
    except:
        new_col.append('')

with open("output.txt", "w") as txt_file:
    for i in new_col:
        txt_file.write(str(i))
        txt_file.write("\n")

# read output.txt into dataframe
df['reviewer_gender'] = new_col





In [None]:
# make sure reviewer_gender is either male or female
def check_in_male_female(gender):
    gender = gender.strip()
    if gender == "male" or gender == "female":
        return gender
    return ''

df['reviewer_gender'].apply(check_in_male_female)



In [None]:
# count reviewer_gender
df.head()

In [22]:
import spacy
nlp = spacy.load("en_core_web_lg")

doc1 = nlp('''My fiance and I could  not have asked for more from our stay at the Fort Hill Inn.  Terry, our host, was very welcoming as well as helpful, leaving us maps and suggestions of things to do.   The room was beautiful, cozy and clean.  The neighborhood was quiet and a great location; a very easy, short trip to downtown. <br/>We will not only  be staying here for future trips to Boston but will also be recommending it to any family or friends planning to visit the area.''')
doc2=nlp('location')
doc3=nlp('hospitality')
doc4=nlp('communication')
doc5=nlp('value')
doc6=nlp('area')
doc6=nlp('transportation')

print(doc1.similarity(doc2))
print(doc1.similarity(doc3))
print(doc1.similarity(doc4))
print(doc1.similarity(doc5))
print(doc1.similarity(doc6))


# # apply to dataframe
# df['location'] = df['comments'].apply(lambda x: nlp(str(x)).similarity(nlp('location')))
# df['hospitality'] = df['comments'].apply(lambda x: nlp(str(x)).similarity(nlp('price')))
# df['communication'] = df['comments'].apply(lambda x: nlp(str(x)).similarity(nlp('communication')))
# df['value'] = df['comments'].apply(lambda x: nlp(str(x)).similarity(nlp('value')))


0.5257461683778064
0.4031611358510172
0.3906865421436786
0.4593304679055108
0.4171291489534105


In [None]:
import os
import openai
import spacy
from collections import Counter
nlp = spacy.load('en_core_web_sm')

def nlpDocToWords(doc):
    words = [token.lemma_.lower() for token in doc if token.is_stop != True and token.is_punct != True and token.pos_ !='X' and token.pos_ !='SPACE' and token.dep_ !='dep']
    return ' '.join(words)
openai.api_key = 'xxx'

restart_sequence = "\n"

out = []

for i, comment in enumerate(df['comments']):
  try:
    response = openai.Completion.create(
      engine="davinci-instruct-beta-v3",
      prompt=f"label the following airbnb review in a few categories:\n\"\"\"\n{comment}\n\"\"\"",
      temperature=0.09,
      max_tokens=64,
      top_p=1,
      frequency_penalty=0,
      presence_penalty=0,
      stop=["\"\"\""]
    )
    out.append(nlpDocToWords(nlp(response.choices[0].text)))
    # print(response.choices[0].text)
    print(f"{i+1}/{len(df)}")
    # if (i == 20):
    #   break
  except:
    out.append('')


In [6]:
import pandas as pd
df = pd.read_csv('reviews_clone_with_senti.csv')
# drop assessments
df.drop(['assessments'], axis=1, inplace=True)
# text file lines to array


In [11]:
with open("output.txt", "r") as txt_file:
    out = txt_file.readlines()
# remove \n from output
out = [x.strip() for x in out]

In [14]:
# put output into dataframe
df['reviewer_gender'] = out
# export dataframe
df.to_csv('reviews_clone_with_senti_and_gen.csv', index=False)

In [1]:
import os
import pandas as pd
import openai
import spacy
from collections import Counter
nlp = spacy.load('en_core_web_lg')

df = pd.read_csv('reviews_clone_with_senti_and_gen.csv')


In [2]:

# for each comment, get the adjectives
out_adj = []
out_noun =[]
for i, comment in enumerate(df['comments']):
    try:
        if i==100:
            break

        doc = nlp(comment)
        # get all adjectives
        adj = [token.lemma_.lower() for token in doc if token.pos_ == 'ADJ']
        # get all nouns
        # noun = [token.lemma_.lower() for token in doc if token.pos_ == 'NOUN']
        noun = [token.lemma_.lower() for token in doc if token.dep_ == 'nsubj']
        # remove stop words
        adj = [x for x in adj if x not in spacy.lang.en.STOP_WORDS]
        noun = [x for x in noun if x not in spacy.lang.en.STOP_WORDS]
        # remove duplicates
        adj = list(set(adj))
        noun = list(set(noun))
        # add to output

        out_adj.append(' '.join(adj))
        out_noun.append(' '.join(noun))
        print(f"{i+1}/{len(df)},{' '.join(adj)}")
        print(f"Noun:,{' '.join(noun)}")

    except:
        out_adj.append('')
        out_noun.append('')

# save to text file
with open("adj.txt", "w") as txt_file:
    for i in out_adj:
        txt_file.write(str(i))
        txt_file.write("\n")

with open("n.txt", "w") as txt_file:
    for i in out_noun:
        txt_file.write(str(i))
        txt_file.write("\n")



1/98288,great affordable perfect
Noun:,alterntv room
2/98288,beautiful warm
Noun:,
3/98288,sweet nice ideal complete
Noun:,<
4/98288,soft visible good mini 3rd blustery private cozy fluffy
Noun:,t boston terry
5/98288,accessible fantastic overpriced great hospitable
Noun:,location accomodation terry
6/98288,able safe great easy happy comfortable close
Noun:,space partner room
7/98288,small cosy useful wonderful helpful gorgeous easy comfortable warm
Noun:,condo terry staircase niggle
8/98288,great small clean
Noun:,room
9/98288,small good large nice wonderful extra beautiful clean close
Noun:,location place
10/98288,friendly good clean charming happy informative quick ready
Noun:,room terry
11/98288,difficult good wonderful steep helpful old clean
Noun:,room fault apartment terry
12/98288,nice
Noun:,reason room terry breakfast
13/98288,accessible sure nice helpful center perfect late comfortable fresh clean
Noun:,< room station br/>-we location br/>i
14/98288,great local helpful chille

In [4]:
out_nouns_aggregate = ' '.join(out_noun)
out_adj_aggregate = ' '.join(out_adj)

# count nouns
noun_count = Counter(out_nouns_aggregate.split())
# sorted
noun_count = sorted(noun_count.items(), key=lambda x: x[1], reverse=True)


In [5]:
noun_count

[('place', 15793),
 ('location', 10018),
 ('apartment', 9144),
 ('room', 6887),
 ('host', 5134),
 ('bed', 4550),
 ('house', 4195),
 ('space', 3117),
 ('neighborhood', 2562),
 ('home', 2123),
 ('bathroom', 1939),
 ('stay', 1836),
 ('<', 1828),
 ('communication', 1791),
 ('parking', 1638),
 ('kitchen', 1583),
 ('area', 1333),
 ('thing', 1163),
 ('unit', 1083),
 ('studio', 1054),
 ('bedroom', 1041),
 ('check', 988),
 ('people', 830),
 ('jonathan', 731),
 ('boston', 721),
 ('picture', 676),
 ('guest', 675),
 ('family', 653),
 ('building', 629),
 ('experience', 621),
 ('airbnb', 591),
 ('shower', 571),
 ('amenity', 562),
 ('br/>i', 544),
 ('husband', 522),
 ('bill', 509),
 ('owner', 496),
 ('station', 464),
 ('br/>we', 454),
 ('issue', 447),
 ('spot', 436),
 ('wall', 423),
 ('accommodation', 416),
 ('alex', 416),
 ('friend', 414),
 ('view', 406),
 ('michelle', 395),
 ('lori', 386),
 ('street', 380),
 ('t', 373),
 ('walk', 373),
 ('wifi', 369),
 ('boris', 369),
 ('minute', 368),
 ('tiffany',

In [6]:
# add to dataframe
df['nouns'] = out_noun
df['adjectives'] = out_adj

In [14]:
x=92222
print(df.iloc[x].comments)
print(df.iloc[x].nouns)
print(df.iloc[x].adjectives)



This is an awesome place, location and price- can’t beat it! Host was super attentive and got the maintenance guy out there ASAP to fix a small issue.<br/><br/>Traffic was fine, it’s white noise.  Lots of outlets in the room is a plus! A loud 15 minutes one night because of loud people in the hall. <br/><br/>Bathrooms were 99% available, easy to share.<br/><br/>Overall, terrific time, would recommend!
lot host br/><br/>bathroom
attentive small loud awesome easy white available terrific fine


In [15]:
df.to_csv('reviews_clone_fin.csv', index=False)