### Explore reviews, detect languages and perform sentiment analysis

In [1]:
# exploration of reviews
import numpy as np
import pandas as pd
import os
import matplotlib.pyplot as plt
#pd.set_option('display.max_rows', 5000000)
#pd.set_option('display.max_columns', 5000000)

In [2]:
# Build the path with os.path.join so the notebook also runs on
# macOS/Linux, where a backslash is not a path separator. On Windows this
# resolves to the same Data\raw\reviews.csv location as before.
reviews = pd.read_csv(os.path.join("Data", "raw", "reviews.csv"))

In [3]:
# Preview the first five rows to check that the columns loaded as expected.
reviews.head(n=5)

Unnamed: 0,listing_id,id,date,reviewer_id,reviewer_name,comments
0,3884,847944,2012-01-07,1400020,Steve,We had a very comfortable 6 night stay in Regi...
1,3884,1697446,2012-07-13,2021169,Morrin,We ended up staying in Regina's own apartment ...
2,23163,101588,2010-09-20,227165,Nathan,Incredible apartment in an ideal location. The...
3,498646,1445317,2012-06-09,2476967,Lea,We had a great time in Romans flat. The appart...
4,23163,157152,2010-12-22,286036,Hugh,"The apartment was huge, we felt like we were s..."


In [4]:
# Summarise dtypes and per-column non-null counts to spot missing values.
reviews.info(show_counts=True, verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 566103 entries, 0 to 566102
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     566103 non-null  int64 
 1   id             566103 non-null  int64 
 2   date           566103 non-null  object
 3   reviewer_id    566103 non-null  int64 
 4   reviewer_name  566103 non-null  object
 5   comments       566080 non-null  object
dtypes: int64(3), object(3)
memory usage: 25.9+ MB


In [5]:
# Keep only the reviews that have comment text (23 rows have none).
has_comment = reviews['comments'].notna()
reviews_cleaned = reviews[has_comment]

In [6]:
# Count distinct listing ids and distinct values of the date column in the
# raw reviews. dropna=False keeps the counts identical to len(series.unique()).
listings = reviews['listing_id'].nunique(dropna=False)
days = reviews['date'].nunique(dropna=False)

# "There are" fixes the "The are" typo in the printed summary.
print(f'There are {listings} unique listings over {days} days.')

print(f'The listings start on {reviews.date.min()} and end {reviews.date.max()}')

The are 8066 unique listings over 4379 days.
The listings start on 2010-05-07 and end 2023-09-17


In [7]:
### Sentiment analysis ###
from langdetect import detect, DetectorFactory
# Pin langdetect's seed before any detect() call; per the langdetect README
# this is what makes repeated detections return the same result.
DetectorFactory.seed = 0

In [8]:
# The full review set is large, so language detection runs on a fixed,
# reproducible random sample of the comments instead.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 1
sentiment_df = reviews_cleaned['comments'].sample(
    n=SAMPLE_SIZE,
    random_state=SAMPLE_SEED,
)

In [9]:
# Detect language for the sample
def try_detect(text):
    """Return the language code that ``detect`` reports for ``text``.

    Detection is best-effort: any ordinary error raised by ``detect`` is
    turned into the placeholder ``"unknown"`` so that one bad comment does
    not abort a whole ``.apply`` run.

    Args:
        text: The comment text to classify.

    Returns:
        str: The detected language code, or ``"unknown"`` if detection failed.
    """
    try:
        return detect(text)
    except Exception:
        # Deliberately not a bare ``except:``: KeyboardInterrupt and
        # SystemExit must still propagate so a long run can be interrupted.
        return "unknown"
    
def _language_of(comment):
    """Label whitespace-only comments "unknown"; otherwise defer to try_detect."""
    if comment.strip() == "":
        return "unknown"
    return try_detect(comment)


detected_languages = sentiment_df.apply(_language_of)

In [10]:
# How many sampled comments fall under each detected language, most common first
detected_languages.value_counts(ascending=False)

comments
en         653
de          67
fr          58
cs          49
es          35
it          28
sk          17
ru          17
ko          16
nl          12
zh-cn        5
pt           5
ca           4
da           3
unknown      3
af           3
ja           2
pl           2
tr           2
hu           2
id           2
tl           2
ro           2
sw           1
sl           1
lt           1
fi           1
so           1
zh-tw        1
et           1
hr           1
uk           1
bg           1
sv           1
Name: count, dtype: int64

In [11]:
# Pair each sampled comment with its detected language.
# The frame is rebuilt from reviews_cleaned (selecting the sampled rows via
# detected_languages' index) instead of from sentiment_df itself, so this cell
# can be re-run: calling .to_frame() on an already-converted DataFrame would
# raise AttributeError.
sentiment_df = (
    reviews_cleaned.loc[detected_languages.index, ['comments']]
    .assign(language=detected_languages)
)
sentiment_df.head()

Unnamed: 0,comments,language
160522,"Huge space, plenty of room and a lovely bathro...",en
519952,"Beautiful apartment with perfect location, nea...",en
320869,The place is much better than the pictures. Th...,en
365182,Es waren schöne 3 Tage in Prag.<br/>Die Wohnun...,de
557270,"The place was nice and cozy, had everything we...",en


In [12]:
from transformers import pipeline, DistilBertTokenizer

# Checkpoint used for scoring the reviews
SENTIMENT_MODEL = "nlptown/bert-base-multilingual-uncased-sentiment"
sentiment_pipeline = pipeline(model=SENTIMENT_MODEL)




In [13]:
# Pull the sampled comments out as a plain Python list for the pipeline
comments_column = sentiment_df.loc[:, 'comments']
data = comments_column.to_list()

In [14]:
# Load the tokenizer used to pre-truncate the comments.
# NOTE(review): this checkpoint is not the one the sentiment pipeline loads
# (nlptown/bert-base-multilingual-uncased-sentiment), so token counts may not
# match the model's own tokenization — confirm this is intended.
TOKENIZER_CHECKPOINT = "distilbert-base-multilingual-cased"
tokenizer = DistilBertTokenizer.from_pretrained(TOKENIZER_CHECKPOINT)

In [15]:
# Function to truncate texts to the maximum sequence length
def truncate_texts(texts, max_length=300):
    """Shorten each text to at most ``max_length`` tokens and return strings.

    Every text is tokenized with the module-level ``tokenizer``, truncated to
    ``max_length`` token ids (special tokens included), and the surviving ids
    are decoded back to text with the special tokens removed.

    Args:
        texts: Iterable of strings to shorten.
        max_length: Maximum number of token ids kept per text, counting the
            special tokens the tokenizer adds.

    Returns:
        list[str]: One decoded string per input text, in input order.
    """
    truncated_texts = []
    for text in texts:
        # Call the tokenizer directly: encode_plus is deprecated in favour of
        # __call__, and no tensors are needed because the ids are only decoded.
        encoded_text = tokenizer(
            text,
            max_length=max_length,
            truncation=True,
            add_special_tokens=True,  # Adds [CLS] and [SEP] tokens
        )
        # Convert the (already truncated) token ids back to a string
        truncated_texts.append(
            tokenizer.decode(encoded_text['input_ids'], skip_special_tokens=True)
        )
    return truncated_texts

# Truncate texts to fit within the model's limit. The input is taken from
# sentiment_df rather than from `data` itself, so re-running this cell does not
# feed already-truncated, already-decoded text back through the tokenizer.
data = truncate_texts(sentiment_df['comments'].tolist())

In [16]:
# Perform sentiment analysis.
# truncation=True lets the pipeline's own tokenizer cut any input that is
# still too long for the model: `data` was shortened with a different
# tokenizer, so its token count under the model's vocabulary is not guaranteed.
results = sentiment_pipeline(data, truncation=True)

In [17]:
# Split every pipeline result into its label and its score
sentiment_labels = []
sentiment_scores = []
for prediction in results:
    sentiment_labels.append(prediction['label'])
    sentiment_scores.append(prediction['score'])

# Store both lists on the DataFrame, one new column each
sentiment_df['sentiment_label'] = sentiment_labels
sentiment_df['sentiment_score'] = sentiment_scores

In [18]:
# Show the sampled comments with their detected language, sentiment label and score
sentiment_df

Unnamed: 0,comments,language,sentiment_label,sentiment_score
160522,"Huge space, plenty of room and a lovely bathro...",en,5 stars,0.896758
519952,"Beautiful apartment with perfect location, nea...",en,5 stars,0.914622
320869,The place is much better than the pictures. Th...,en,5 stars,0.751109
365182,Es waren schöne 3 Tage in Prag.<br/>Die Wohnun...,de,4 stars,0.458200
557270,"The place was nice and cozy, had everything we...",en,4 stars,0.639269
...,...,...,...,...
164799,súper conformtable <br/>incredible apartment,en,5 stars,0.797104
514785,De lokatie van het appartement was super voor ...,nl,3 stars,0.364248
183533,"Ottima posizione, appartamento comodo e carino",it,5 stars,0.575432
110500,a convenient and quiet place - we enjoy our st...,en,5 stars,0.547836
