In [19]:
#Load Data
import pandas as pd

df_yelp = pd.read_csv('./sentiment labelled sentences/yelp_labelled.txt', names=['sentence', 'label'], sep='\t')
df_amazon = pd.read_csv('./sentiment labelled sentences/amazon_cells_labelled.txt', names=['sentence', 'label'], sep='\t')
df_imdb = pd.read_csv('./sentiment labelled sentences/imdb_labelled.txt', names=['sentence', 'label'], sep='\t')

print("Yelp: ", df_yelp)
print("Amazon: ", df_amazon)
print("IMDB: ", df_imdb)

df_all = pd.concat([df_yelp, df_amazon, df_imdb])
print("All: ", df_all)

Yelp:                                                sentence  label
0                             Wow... Loved this place.      1
1                                   Crust is not good.      0
2            Not tasty and the texture was just nasty.      0
3    Stopped by during the late May bank holiday of...      1
4    The selection on the menu was great and so wer...      1
..                                                 ...    ...
995  I think food should have flavor and texture an...      0
996                           Appetite instantly gone.      0
997  Overall I was not impressed and would not go b...      0
998  The whole experience was underwhelming, and I ...      0
999  Then, as if I hadn't wasted enough of my life ...      0

[1000 rows x 2 columns]
Amazon:                                                sentence  label
0    So there is no way for me to plug it in here i...      0
1                          Good case, Excellent value.      1
2                            

In [None]:
#Explore Data
import re, numpy as np, pandas as pd
from collections import Counter
from sklearn.feature_extraction.text import ENGLISH_STOP_WORDS

percent = df_yelp['label'].value_counts(normalize=True, dropna=False).sort_index() * 100
print("Yelp Verteilung (%):\n", percent.round(2))

percent = df_amazon['label'].value_counts(normalize=True, dropna=False).sort_index() * 100
print("Amazon Verteilung (%):\n", percent.round(2))

percent = df_imdb['label'].value_counts(normalize=True, dropna=False).sort_index() * 100
print("IMDB Verteilung (%):\n", percent.round(2))

df = df_all.copy()

#Tokenizer
_token_re = re.compile(r"[A-Za-z]+(?:'[A-Za-z]+)?")
def tok(s): 
    if not isinstance(s, str): return []
    return _token_re.findall(s.lower())

#Grundüberblick
print("Shape:", df.shape)
print("Missing:", df.isna().sum().to_dict())
print("\nLabel-Verteilung (Count/%)")
counts = df['label'].value_counts().sort_index()
pct = (df['label'].value_counts(normalize=True).sort_index()*100).round(2)
print(pd.DataFrame({'count': counts, 'percent': pct}))

#Kern-Metriken je Text
def stats(text):
    s = text if isinstance(text, str) else ""
    t = tok(s)
    n = len(t)
    return pd.Series({
        'char_len': len(s),
        'word_len': n,
        'avg_word_len': (sum(len(x) for x in t)/n) if n else 0.0,
        'stop_ratio': (sum(x in ENGLISH_STOP_WORDS for x in t)/(n or 1)),
        'ttr': (len(set(t))/(n or 1)),
    })

fe = df['sentence'].apply(stats)
df_e = pd.concat([df[['label']], fe], axis=1)

print("\nGesamt-Deskriptoren:")
print(df_e.describe().round(3).T)

print("\nMittelwerte je Label:")
print(df_e.groupby('label').mean(numeric_only=True).round(3))

#N-Grams (Top 10)
def top_ngrams(texts, n=1, k=10, drop_stop=True):
    c = Counter()
    for s in texts:
        t = tok(s)
        if drop_stop: t = [w for w in t if w not in ENGLISH_STOP_WORDS]
        grams = t if n==1 else [" ".join(t[i:i+n]) for i in range(len(t)-n+1)]
        c.update(grams)
    return pd.DataFrame(c.most_common(k), columns=[f'{n}-gram','count'])

print("\nTop Unigrams (gesamt):")
print(top_ngrams(df['sentence'], n=1, k=10))

print("\nTop Bigrams (gesamt):")
print(top_ngrams(df['sentence'], n=2, k=10))

for lbl in sorted(df['label'].unique()):
    subset = df.loc[df['label']==lbl, 'sentence']
    print(f"\nTop Unigrams (Label={lbl}):")
    print(top_ngrams(subset, n=1, k=10))
    print(f"\nTop Bigrams (Label={lbl}):")
    print(top_ngrams(subset, n=2, k=10))

#Vokabulargröße (ohne Stopwords)
def vocab_size(texts):
    v=set()
    for s in texts: v.update([w for w in tok(s) if w not in ENGLISH_STOP_WORDS])
    return len(v)

print("\nVokabular gesamt (ohne Stopwords):", vocab_size(df['sentence']))
print("Vokabular je Label (ohne Stopwords):")
print(df.groupby('label')['sentence'].apply(vocab_size))

Shape: (2748, 2)
Missing: {'sentence': 0, 'label': 0}

Label-Verteilung (Count/%)
       count  percent
label                
0       1362    49.56
1       1386    50.44

Gesamt-Deskriptoren:
               count    mean      std  min     25%     50%     75%     max
label         2748.0   0.504    0.500  0.0   0.000   1.000   1.000     1.0
char_len      2748.0  71.528  201.987  7.0  32.000  55.000  87.000  7944.0
word_len      2748.0  12.896   33.489  0.0   6.000  10.000  16.000  1297.0
avg_word_len  2748.0   4.498    0.974  0.0   3.923   4.364   4.909    13.0
stop_ratio    2748.0   0.494    0.181  0.0   0.417   0.500   0.609     1.0
ttr           2748.0   0.956    0.073  0.0   0.917   1.000   1.000     1.0

Mittelwerte je Label:
       char_len  word_len  avg_word_len  stop_ratio    ttr
label                                                     
0        74.302    13.467         4.462       0.515  0.955
1        68.802    12.334         4.533       0.474  0.956

Top Unigrams (gesamt):


Explore Data

Reviews with a score of 4 and 5 were considered to be positive, and scores of 1 and 2 to be negative.

Amazon [1000 rows x 2 columns]: contains reviews and scores for products sold on amazon.com in the cell phones and accessories category.

IMDb [748 rows x 2 columns]: refers to the IMDb movie review sentiment dataset originally introduced by Maas et al. as a benchmark for sentiment analysis.

Yelp [1000 rows x 2 columns]: refers to the dataset from the Yelp dataset challenge from which we extracted the restaurant reviews.
