In [1]:
from ftlangdetect import detect
from germansentiment import SentimentModel
import pandas as pd

In [2]:
# Create the sentiment model once at module level; predict_sentiments reuses
# this single instance for every call.
model = SentimentModel()

def predict_sentiments(text):
    """Classify one text and return its label and per-class probabilities.

    Args:
        text: The string to classify.

    Returns:
        A pandas Series with the entries 'sentiment' (the predicted class
        label) and 'positive_sentiment', 'negative_sentiment',
        'neutral_sentiment' (the probability reported for each class).
    """
    classes, probabilities = model.predict_sentiment([text], output_probabilities = True)
    # The model reports each text's probabilities as [label, probability]
    # pairs; look them up by label instead of relying on their position.
    by_label = dict(probabilities[0])
    return pd.Series({
        'sentiment': classes[0],
        'positive_sentiment': by_label['positive'],
        'negative_sentiment': by_label['negative'],
        'neutral_sentiment': by_label['neutral'],
    })
    
def detect_language(text):
    """Return the detected language of ``text``, or None if detection fails.

    Args:
        text: The string to analyse. Non-string values (e.g. NaN from a
            missing cell) yield None.

    Returns:
        The 'lang' entry of the ftlangdetect result, or None when the input
        is not a string or the detector raises.
    """
    if not isinstance(text, str):
        return None
    # fastText's predict handles one line at a time and raises ValueError on
    # any '\n', so multi-line texts are flattened before detection instead
    # of silently ending up without a language.
    single_line = text.replace("\n", " ")
    try:
        result = detect(text=single_line, low_memory=False)
        return result['lang']
    except Exception:
        # Deliberate best-effort: one undetectable text must not abort a
        # whole DataFrame.apply run.
        return None

In [11]:
# Other languages are predicted incorrectly: the first two samples are
# Russian and Spanish, the last two are German.
sample_texts = [
    "В квартире было грязно и воняло.",
    "El apartamento era muy bonito.",
    "Die Wohnung war dreckig und hat gestunken.",
    "Die Wohnung war sauber.",
]
model.predict_sentiment(sample_texts, output_probabilities = True)

(['negative', 'negative', 'neutral', 'positive'],
 [[['positive', 0.126670241355896],
   ['negative', 0.8723633289337158],
   ['neutral', 0.0009664089884608984]],
  [['positive', 0.28107449412345886],
   ['negative', 0.6973143219947815],
   ['neutral', 0.021611185744404793]],
  [['positive', 0.0007139815716072917],
   ['negative', 0.2380208820104599],
   ['neutral', 0.761265218257904]],
  [['positive', 0.9703390002250671],
   ['negative', 0.028706924989819527],
   ['neutral', 0.0009540801402181387]]])

# Title Sentiments

In [41]:
# Load the March 2024 listings and print the column overview.
listings_csv = '../data/airbnb/March_2024/listings.csv'
df_airbnb = pd.read_csv(listings_csv)
df_airbnb.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 13362 entries, 0 to 13361
Data columns (total 18 columns):
 #   Column                          Non-Null Count  Dtype  
---  ------                          --------------  -----  
 0   id                              13362 non-null  int64  
 1   name                            13362 non-null  object 
 2   host_id                         13362 non-null  int64  
 3   host_name                       13353 non-null  object 
 4   neighbourhood_group             13362 non-null  object 
 5   neighbourhood                   13362 non-null  object 
 6   latitude                        13362 non-null  float64
 7   longitude                       13362 non-null  float64
 8   room_type                       13362 non-null  object 
 9   price                           8400 non-null   float64
 10  minimum_nights                  13362 non-null  int64  
 11  number_of_reviews               13362 non-null  int64  
 12  last_review                     

In [None]:
# Score every listing title, then attach the label and the three class
# probabilities to the listings frame as new columns.
title_scores = df_airbnb['name'].apply(predict_sentiments)
df_airbnb[['sentiment', 'positive_sentiment', 'negative_sentiment', 'neutral_sentiment']] = title_scores

In [45]:
# Persist the title sentiment scores keyed by listing id.
title_export_columns = ['id', 'sentiment', 'positive_sentiment', 'negative_sentiment', 'neutral_sentiment']
df_airbnb[title_export_columns].to_csv("../data/airbnb/March_2024/sentiments.csv", index=False)

# Review Sentiments

In [13]:
# Load the March 2024 listings and reviews, then print the review columns.
march_2024_dir = '../data/airbnb/March_2024'
df_airbnb = pd.read_csv(march_2024_dir + '/listings.csv')
df_reviews = pd.read_csv(march_2024_dir + '/reviews.csv')
df_reviews.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 503153 entries, 0 to 503152
Data columns (total 6 columns):
 #   Column         Non-Null Count   Dtype 
---  ------         --------------   ----- 
 0   listing_id     503153 non-null  int64 
 1   id             503153 non-null  int64 
 2   date           503153 non-null  object
 3   reviewer_id    503153 non-null  int64 
 4   reviewer_name  503153 non-null  object
 5   comments       503116 non-null  object
dtypes: int64(3), object(3)
memory usage: 23.0+ MB


In [12]:
# Keep only reviews that belong to a known listing and have a comment, then
# tag each comment with its detected language. Built as a single chain ending
# in .assign so the new column is added to a fresh frame instead of being set
# on a filtered slice (which triggers pandas' SettingWithCopyWarning).
has_known_listing = df_reviews['listing_id'].isin(df_airbnb['id'])
filtered_reviews = (
    df_reviews[has_known_listing]
    .dropna(subset=["comments"])
    .assign(language=lambda reviews: reviews['comments'].apply(detect_language))
)

In [None]:
# Score every review comment, then attach the label and the three class
# probabilities to the filtered reviews as new columns.
review_scores = filtered_reviews['comments'].apply(predict_sentiments)
filtered_reviews[['sentiment', 'positive_sentiment', 'negative_sentiment', 'neutral_sentiment']] = review_scores

In [None]:
# Persist the review sentiment scores together with the review id, date,
# listing id and detected language.
review_export_columns = [
    'id',
    'sentiment',
    'positive_sentiment',
    'negative_sentiment',
    'neutral_sentiment',
    'date',
    'listing_id',
    'language',
]
filtered_reviews[review_export_columns].to_csv("../data/airbnb/review_sentiments.csv", index=False)

In [14]:
# Reload the exported review sentiments and preview the first five rows.
review_sentiments_csv = '../data/airbnb/review_sentiments.csv'
df_review_sentiments = pd.read_csv(review_sentiments_csv)
df_review_sentiments.head()

Unnamed: 0,id,listing_id,date,sentiment,positive_sentiment,negative_sentiment,neutral_sentiment
0,4283,3176,2009-06-20,positive,0.991684,0.008207,0.000109
1,14159242,265408,2014-06-13,positive,0.99965,0.00034,9e-06
2,27230771,265408,2015-02-28,negative,0.051247,0.948726,2.7e-05
3,134722,3176,2010-11-07,positive,0.987947,0.011883,0.000169
4,144064,3176,2010-11-24,neutral,0.001159,0.002875,0.995966
