In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Apply matplotlib's 'ggplot' style sheet to any figures drawn later in the notebook.
plt.style.use('ggplot')

import nltk

In [2]:
# Read the reviews dataset; the path is relative to the notebook's working directory.
df = pd.read_csv('./Datasets/reviews.csv')

In [3]:
# Peek at the text of the first review.
df['comments'].iloc[0]

'Cute and cozy place. Perfect location to everything! '

In [4]:
# Report the (rows, columns) size of the loaded dataset.
print(df.shape)

(84849, 6)


In [5]:
# Preview the first five rows to check the column layout.
df.head()

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...


In [6]:
# Pick one review (row label 5001) to use as the running example in the cells below.
example = df.loc[5001, 'comments']
print(example)

Everything went very smoothly and room was as advertised.  Bed was very comfortable and rate was fair. 


In [7]:
# Download the 'punkt_tab' NLTK resource (reported as already up to date when present),
# then split the example review into word and punctuation tokens.
nltk.download('punkt_tab')
tokens = nltk.word_tokenize(example)
print(tokens)

['Everything', 'went', 'very', 'smoothly', 'and', 'room', 'was', 'as', 'advertised', '.', 'Bed', 'was', 'very', 'comfortable', 'and', 'rate', 'was', 'fair', '.']


[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\mende\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [8]:
# Download the English perceptron tagger data, then display a part-of-speech
# tag for every token of the example review.
nltk.download('averaged_perceptron_tagger_eng')
nltk.pos_tag(tokens)

[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\mende\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


[('Everything', 'VBG'),
 ('went', 'VBD'),
 ('very', 'RB'),
 ('smoothly', 'RB'),
 ('and', 'CC'),
 ('room', 'NN'),
 ('was', 'VBD'),
 ('as', 'IN'),
 ('advertised', 'JJ'),
 ('.', '.'),
 ('Bed', 'NNP'),
 ('was', 'VBD'),
 ('very', 'RB'),
 ('comfortable', 'JJ'),
 ('and', 'CC'),
 ('rate', 'NN'),
 ('was', 'VBD'),
 ('fair', 'JJ'),
 ('.', '.')]

In [9]:
# Keep the (token, tag) pairs for the entity chunker and show the first ten of them.
tagged = nltk.pos_tag(tokens)
tagged[:10]

[('Everything', 'VBG'),
 ('went', 'VBD'),
 ('very', 'RB'),
 ('smoothly', 'RB'),
 ('and', 'CC'),
 ('room', 'NN'),
 ('was', 'VBD'),
 ('as', 'IN'),
 ('advertised', 'JJ'),
 ('.', '.')]

In [10]:
# Download the named-entity chunker data and the 'words' corpus, then group the
# tagged tokens into named-entity chunks and pretty-print the resulting tree.
# NOTE(review): in the recorded output "Bed" is labelled PERSON, so the chunker's
# entities should not be taken at face value on this kind of review text.
nltk.download('maxent_ne_chunker_tab')
nltk.download('words')
entities = nltk.chunk.ne_chunk(tagged)
entities.pprint()

[nltk_data] Downloading package maxent_ne_chunker_tab to
[nltk_data]     C:\Users\mende\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker_tab is already up-to-date!
[nltk_data] Downloading package words to
[nltk_data]     C:\Users\mende\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


(S
  Everything/VBG
  went/VBD
  very/RB
  smoothly/RB
  and/CC
  room/NN
  was/VBD
  as/IN
  advertised/JJ
  ./.
  (PERSON Bed/NNP)
  was/VBD
  very/RB
  comfortable/JJ
  and/CC
  rate/NN
  was/VBD
  fair/JJ
  ./.)


In [11]:
#VADER Sentiment Analysis
from nltk.sentiment import SentimentIntensityAnalyzer
from tqdm.notebook import tqdm

# Download the VADER lexicon and create the analyzer instance reused by the cells below.
nltk.download('vader_lexicon')
sia = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     C:\Users\mende\AppData\Roaming\nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [12]:
# Sanity-check VADER on a hand-written sentence.
# NOTE(review): the recorded output gives this sentence a slightly negative
# compound score (-0.079), presumably because of "no complaints" -- keep this
# limitation in mind when reading the VADER results.
sia.polarity_scores("I have no complaints and our stay was very comfortable. ")

{'neg': 0.338, 'neu': 0.414, 'pos': 0.248, 'compound': -0.079}

In [13]:
# Score the example review selected earlier.
sia.polarity_scores(example)

{'neg': 0.0, 'neu': 0.718, 'pos': 0.282, 'compound': 0.7089}

In [14]:
#Run polarity scores on the entire dataset

# Replace missing comments with empty strings BEFORE casting to str:
# astype(str) turns NaN into the literal text 'nan', after which fillna('')
# has nothing left to fill and 'nan' would be scored as if it were a review.
df['comments'] = df['comments'].fillna('').astype(str)

# Maps review id -> dict of VADER scores for that review's comment.
res = {}
for i, row in tqdm(df.iterrows(), total = len(df)):  # total=len(df) sizes the progress bar
    comments = row['comments']
    myid = row['id']
    res[myid] = sia.polarity_scores(comments)

  0%|          | 0/84849 [00:00<?, ?it/s]

In [15]:
# Turn the {id: scores} mapping into a DataFrame with one row per review
# (.T flips ids from columns to rows), expose the id as a regular column,
# and left-join the review metadata from df onto it.
vaders = (
    pd.DataFrame(res)
    .T
    .reset_index()
    .rename(columns={'index': 'id'})
    .merge(df, how='left')
)

In [16]:
# Preview the VADER scores joined with the review metadata.
vaders.head()

Unnamed: 0,id,neg,neu,pos,compound,listing_id,date,reviewer_id,reviewer_name,comments
0,38917982,0.0,0.462,0.538,0.7901,7202016,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,39087409,0.0,0.609,0.391,0.9872,7202016,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,39820030,0.043,0.772,0.185,0.8718,7202016,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."
3,40813543,0.035,0.765,0.2,0.8313,7202016,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...
4,41986501,0.0,0.655,0.345,0.9783,7202016,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...


In [111]:
# Label each review's sentiment from its VADER scores and save the results to Excel
def determine_sentiment(row, threshold=0.05):
    """Label one review as "Positive", "Negative" or "Neutral" from its VADER scores.

    The label is derived from the ``compound`` score: at or above
    ``threshold`` is positive, at or below ``-threshold`` is negative, and
    anything in between is neutral. Comparing ``pos``/``neg`` against ``neu``
    is not a usable rule, because ``neu`` is usually the largest proportion:
    a review scored neu=0.609, pos=0.391, compound=0.9872 would come out
    as "Neutral".

    Parameters
    ----------
    row : mapping or pandas.Series
        Must contain ``'compound'``; if it does not, ``'pos'``, ``'neg'`` and
        ``'neu'`` are required and the proportion comparison is used instead
        so that callers without a compound score keep working.
    threshold : float, default 0.05
        Absolute compound score at which a review stops being neutral.

    Returns
    -------
    str
        One of "Positive", "Negative", "Neutral".
    """
    if 'compound' in row:
        compound = row['compound']
        if compound >= threshold:
            return "Positive"
        if compound <= -threshold:
            return "Negative"
        return "Neutral"

    # Fallback for rows that carry only the three proportions.
    if row['pos'] > row['neg'] and row['pos'] > row['neu']:
        return "Positive"
    elif row['neg'] > row['pos'] and row['neg'] > row['neu']:
        return "Negative"
    else:
        return "Neutral"
        
# Attach a sentiment label to every review row.
vaders['sentiment'] = vaders.apply(determine_sentiment, axis=1)

# Columns written to the spreadsheet, in output order.
columns_to_save = [
    'listing_id',
    'sentiment',
    'compound',
    'id',
    'date',
    'reviewer_id',
    'reviewer_name',
    'comments',
]
final_df = vaders.loc[:, columns_to_save]

# Write the labelled reviews to Excel without the DataFrame index.
final_df.to_excel('sentiment_analysis_results.xlsx', index=False)

In [71]:
import torch
from transformers import AutoTokenizer, AutoModelForSequenceClassification
import torch.nn.functional as F

# Load the tokenizer and the sequence-classification model from the same checkpoint.
checkpoint = "cardiffnlp/twitter-roberta-base-sentiment-latest"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = AutoModelForSequenceClassification.from_pretrained(checkpoint)

Some weights of the model checkpoint at cardiffnlp/twitter-roberta-base-sentiment-latest were not used when initializing RobertaForSequenceClassification: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
- This IS expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing RobertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [72]:
def process_with_sliding_window(text, tokenizer, model, max_length=512, stride=128):
    """Classify ``text`` chunk by chunk and return averaged class probabilities.

    The tokenizer is asked to truncate to ``max_length`` tokens and to return
    the overflowing tokens as additional chunks, with ``stride`` forwarded
    unchanged. Each chunk is run through ``model`` separately, the per-chunk
    logits are averaged, and a softmax is applied to the averaged logits.

    Parameters
    ----------
    text : str
        Text to classify.
    tokenizer : callable
        Called with ``text`` and the chunking options; its result must provide
        ``'input_ids'`` and ``'attention_mask'`` with one row per chunk.
    model : callable
        Called with ``input_ids`` and ``attention_mask`` keyword arguments; its
        result must expose a ``logits`` tensor.
    max_length : int, default 512
        Maximum chunk length passed to the tokenizer.
    stride : int, default 128
        Stride passed to the tokenizer.

    Returns
    -------
    torch.Tensor
        Softmax over the chunk-averaged logits.
    """
    encoded = tokenizer(
        text,
        return_tensors="pt",
        max_length=max_length,
        truncation=True,
        stride=stride,
        return_overflowing_tokens=True,
        padding=True,
    )

    # One forward pass per chunk; gradients are not needed for inference.
    with torch.no_grad():
        chunk_logits = [
            model(input_ids=ids[None, :], attention_mask=mask[None, :]).logits
            for ids, mask in zip(encoded['input_ids'], encoded['attention_mask'])
        ]

    # Average logits across chunks first, then normalise once.
    mean_logits = torch.cat(chunk_logits, dim=0).mean(dim=0)
    return F.softmax(mean_logits, dim=-1)


In [75]:
# Replace missing comments with empty strings BEFORE casting to str:
# astype(str) turns NaN into the literal text 'nan', after which fillna('')
# has nothing left to fill and 'nan' would be classified as if it were a review.
df['comments'] = df['comments'].fillna('').astype(str)

# Maps review id -> class-probability tensor returned by process_with_sliding_window.
roberta = {}
for i, row in tqdm(df.iterrows(), total=len(df)):  # total=len(df) sizes the progress bar
    comments = row['comments']
    myid = row['id']
    roberta[myid] = process_with_sliding_window(comments, tokenizer, model)

  0%|          | 0/84849 [00:00<?, ?it/s]

In [86]:
def get_prediction(tensor):
    """Return the sentiment label for the highest-scoring entry of ``tensor``.

    Index 0 maps to 'Negative', 1 to 'Neutral' and 2 to 'Positive'.
    """
    labels = ('Negative', 'Neutral', 'Positive')
    best = int(torch.argmax(tensor))
    return labels[best]

In [98]:
# One row per review id, with the label of its highest-probability class.
predictions_df = pd.DataFrame({
    "id": list(roberta),
    "prediction": [get_prediction(probs) for probs in roberta.values()],
})

In [99]:
# Left-join the predicted labels onto the reviews via their shared column(s).
roberta_final = pd.merge(df, predictions_df, how='left')

In [100]:
# Display the reviews with their predicted label (the rendered frame is truncated).
roberta_final

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments,prediction
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...,Positive
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...,Positive
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb...",Positive
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...,Positive
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...,Positive
...,...,...,...,...,...,...,...
84844,3624990,50436321,2015-10-12,37419458,Ryan,The description and pictures of the apartment ...,Positive
84845,3624990,51024875,2015-10-17,6933252,Linda,We had an excellent stay. It was clean and com...,Positive
84846,3624990,51511988,2015-10-20,19543701,Jaime,"Gran ubicación, cerca de todo lo atractivo del...",Positive
84847,3624990,52814482,2015-11-02,24445024,Jørgen,"Very good apartement, clean and well sized. Si...",Positive


In [96]:
# Display the review DataFrame (the rendered frame is truncated).
df

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,7202016,38917982,2015-07-19,28943674,Bianca,Cute and cozy place. Perfect location to every...
1,7202016,39087409,2015-07-20,32440555,Frank,Kelly has a great room in a very central locat...
2,7202016,39820030,2015-07-26,37722850,Ian,"Very spacious apartment, and in a great neighb..."
3,7202016,40813543,2015-08-02,33671805,George,Close to Seattle Center and all it has to offe...
4,7202016,41986501,2015-08-10,34959538,Ming,Kelly was a great host and very accommodating ...
...,...,...,...,...,...,...
84844,3624990,50436321,2015-10-12,37419458,Ryan,The description and pictures of the apartment ...
84845,3624990,51024875,2015-10-17,6933252,Linda,We had an excellent stay. It was clean and com...
84846,3624990,51511988,2015-10-20,19543701,Jaime,"Gran ubicación, cerca de todo lo atractivo del..."
84847,3624990,52814482,2015-11-02,24445024,Jørgen,"Very good apartement, clean and well sized. Si..."


In [103]:
# Write the reviews with their predicted labels to CSV, omitting the DataFrame index.
output_file = "roberta_final_predictions.csv"
roberta_final.to_csv(output_file, index=False)