#**Sentimentanalyse-Modell**

##**Bibliotheken**


In [None]:
# Installation der notwendigen Bibliotheken
!pip install scikit-learn pandas nltk

import re
import time
import numpy as np

import pandas as pd

from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem import PorterStemmer



##**Datensatz**

###**Datenimport**

In [None]:
# Fetch the Amazon reviews dataset through kagglehub and report where the files were placed
import kagglehub

path = kagglehub.dataset_download("kritanjalijain/amazon-reviews")
print(f"Path to dataset files: {path}")

Path to dataset files: /kaggle/input/amazon-reviews


###**Datenübersicht**

**Übersicht: Trainingsdatensatz**

In [None]:
# Load the training CSV from the downloaded Kaggle dataset.
# NOTE(review): the file appears to ship without a header row — with the default
# header=0 the first review was consumed as column labels, leaving one row fewer
# than the balanced class counts imply. header=None keeps that row as data and
# names= assigns the column labels directly. Confirm against the raw CSV.
df_train = pd.read_csv(
    path + '/train.csv',
    header=None,
    names=['Sentiment', 'Titel', 'Inhalt'],
)

# Convert the original labels 1 and 2 into binary values
# 0 = negative, 1 = positive
df_train['Sentiment'] = df_train['Sentiment'].replace({1: 0, 2: 1})


**Übersicht: Testdatensatz**

In [None]:
# Load the test CSV from the downloaded Kaggle dataset.
# NOTE(review): the file appears to ship without a header row — with the default
# header=0 the first review was consumed as column labels, leaving one row fewer
# than the balanced class counts imply. header=None keeps that row as data and
# names= assigns the column labels directly. Confirm against the raw CSV.
df_test = pd.read_csv(
    path + '/test.csv',
    header=None,
    names=['Sentiment', 'Titel', 'Inhalt'],
)

# Convert the original labels 1 and 2 into binary values
# 0 = negative, 1 = positive
df_test['Sentiment'] = df_test['Sentiment'].replace({1: 0, 2: 1})


**Allgemeine Informationen**

In [None]:
# DataFrame.info() writes its summary straight to stdout and returns None, so it is
# called on its own; wrapping it in print() appended a stray "None" after each summary.

# Summary of the training dataset
print('Informationen √ºber den Trainingsdatensatz\n')
df_train.info()

print('\n\n')

# Summary of the test dataset
print('Informationen √ºber den Testdatensatz\n')
df_test.info()



Informationen √ºber den Trainingsdatensatz

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3599999 entries, 0 to 3599998
Data columns (total 3 columns):
 #   Column     Dtype 
---  ------     ----- 
 0   Sentiment  int64 
 1   Titel      object
 2   Inhalt     object
dtypes: int64(1), object(2)
memory usage: 82.4+ MB
None



Informationen √ºber den Testdatensatz

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 399999 entries, 0 to 399998
Data columns (total 3 columns):
 #   Column     Non-Null Count   Dtype 
---  ------     --------------   ----- 
 0   Sentiment  399999 non-null  int64 
 1   Titel      399975 non-null  object
 2   Inhalt     399999 non-null  object
dtypes: int64(1), object(2)
memory usage: 9.2+ MB
None


**Leere Einträge**

In [None]:
# Count the missing values per column for both datasets
missing_train = df_train.isnull().sum()
missing_test = df_test.isnull().sum()

# Each print emits the heading and the counts on separate lines
print('Fehlende Werte des Trainingsdatensatzes', missing_train, sep='\n')

print('\n\n')

print('Fehlende Werte des Testdatensatzes', missing_test, sep='\n')

Fehlende Werte des Trainingsdatensatzes
Sentiment      0
Titel        207
Inhalt         0
dtype: int64



Fehlende Werte des Testdatensatzes
Sentiment     0
Titel        24
Inhalt        0
dtype: int64


**Verteilung der Klassen**

In [None]:
# Determine how often each sentiment class occurs in the two datasets
train_counts = df_train['Sentiment'].value_counts()
test_counts = df_test['Sentiment'].value_counts()

# Heading and counts per dataset, separated by the same blank lines as before
print('Trainingsdatensatz:', train_counts, '\n', 'Testdatensatz:', test_counts, sep='\n')



Trainingsdatensatz:
Sentiment
0    1800000
1    1799999
Name: count, dtype: int64


Testdatensatz:
Sentiment
0    200000
1    199999
Name: count, dtype: int64


###**Datenbereinigung**

In [None]:
# Columns kept for the analysis, in the order they should appear
relevant_columns = ['Inhalt', 'Titel', 'Sentiment']

# Per dataset: keep the relevant columns, drop rows where any of them is missing,
# then drop rows that are exact duplicates of an earlier row
df_train = df_train[relevant_columns].dropna(subset=relevant_columns).drop_duplicates()
df_test = df_test[relevant_columns].dropna(subset=relevant_columns).drop_duplicates()


## **Vorbereitung**

###**Textverarbeitung**

####**NLTK**

In [None]:
# Download the NLTK resources used by the tokenizer, stop-word list and lemmatizer
for nltk_resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(nltk_resource)


# Text-cleaning helpers, built once instead of on every call.
# The URL alternatives are matched case-insensitively via the scoped (?i:...) group so that
# links such as "Http://..." or "WWW...." are removed as a whole; the letter filter itself
# stays case-sensitive ASCII, exactly as before.
_CLEANUP_PATTERN = re.compile(r'(?i:http\S+|www\S+)|[^a-zA-Z\s]')
_LEMMATIZER = WordNetLemmatizer()


def preprocess_text(text):
    """Clean a raw review text and return it as a normalized, space-joined string.

    Steps:
      1. Remove links (http..., www...) and every character that is not an
         ASCII letter or whitespace (digits, punctuation, special characters).
      2. Lowercase the text.
      3. Tokenize with NLTK's ``word_tokenize``.
      4. Lemmatize every token with the WordNet lemmatizer.

    Stop words are intentionally not removed here (that is left to the
    vectorization step), and no stemming is applied.

    Args:
        text: The raw text as a ``str``.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Strip links, digits and special characters
    text = _CLEANUP_PATTERN.sub('', text)

    # Normalize to lowercase
    text = text.lower()

    # Split the text into tokens
    tokens = word_tokenize(text)

    # Reduce each token to its lemma and convert back to a single string
    return ' '.join(_LEMMATIZER.lemmatize(token) for token in tokens)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


####**SpaCy**

```
import spacy

# Laden des englischen spacy Sprachmodells
nlp = spacy.load("en_core_web_sm")

# Textbereinigung und -vorverarbeitung
def preprocess_text(text):

    # Entfernen von Sonderzeichen, Links und Zahlen
    text = re.sub(r'http\S+|www\S+|[^a-zA-Z\s]', '', text)

    # Umwandeln der Texte in Kleinbuchstaben
    text = text.lower()

    # Verarbeitung der Texte
    doc = nlp(text)

    # Lemmatisierung und Entfernen von Stoppw√∂rter
    tokens = [token.lemma_ for token in doc if not token.is_stop]

    # Zusammenf√ºgen der tokens zu einem String
    return ' '.join(tokens)

```



####**Verarbeitung: Datensätze**

In [None]:
# Installation der swifter-Bibliothek, um die Verarbeitung zu beschleunigen
!pip install swifter
import swifter

# Auswahl einer zuf√§llige Stichprobe von 10000 Zeilen aus dem Trainings-Datensatz (mit festgelegtem Seed f√ºr die Reproduzierbarkeit)
df_train = df_train.sample(10000, random_state=64)

# Textvorverarbeitung auf die Spalten 'Titel' und 'Inhalt' des Trainings-Datensatzes anwenden
# Das Ergebnis wird in der neuen Spalte 'Cleaned_Text_Train' gespeichert
df_train['Cleaned_Text_Train'] = df_train['Titel'].swifter.apply(preprocess_text) + " " + df_train['Inhalt'].swifter.apply(preprocess_text)


# Auswahl einer zuf√§llige Stichprobe von 2000 Zeilen aus dem Test-Datensatz (mit festgelegtem Seed f√ºr die Reproduzierbarkeit)
df_test = df_test.sample(2000, random_state=64)

# Textvorverarbeitung auf die Spalten 'Titel' und 'Inhalt' des Test-Datensatzes anwenden
# Das Ergebnis wird in der neuen Spalte 'Cleaned_Text_Test' gespeichert
df_test['Cleaned_Text_Test'] = df_test['Titel'].swifter.apply(preprocess_text) + " " + df_test['Inhalt'].swifter.apply(preprocess_text)




Pandas Apply:   0%|          | 0/10000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/10000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/2000 [00:00<?, ?it/s]

###**Vektorisierung**

####**TF-IDF**

**Zielvariablen**

In [None]:
# Target variable (sentiment label) for the training and the test data
y_train, y_test = df_train['Sentiment'], df_test['Sentiment']

**Grid-Search**


```
# Festlegen der Parameter und ihrer annehmbaren Werte
max_features_values = [10000, 20000, 50000]
ngram_ranges = [(1,1), (1,2), (1,3)]
stop_words_options = [None, 'english']
min_df_values = [1, 2, 5]
max_df_values = [0.9, 0.95, 1.0]
use_idf_options = [True, False]
smooth_idf_options = [True, False]
sublinear_tf_options = [True, False]
norm_options = ['l1', 'l2', None]

# Variablen zur Speicherung der besten Parameterkombination und der zugeh√∂rigen Genauigkeit
best_score = 0
best_params = {}

# Grid-Search f√ºr alle Parameterkombinationen
for max_features in max_features_values:
    for ngram_range in ngram_ranges:
        for stop_words in stop_words_options:
            for min_df in min_df_values:
                for max_df in max_df_values:
                    for use_idf in use_idf_options:
                        for smooth_idf in smooth_idf_options:
                            for sublinear_tf in sublinear_tf_options:
                                for norm in norm_options:

                                    # Durchf√ºhren der Vektorisierung mit aktueller Parameterkombination
                                    vectorizer = TfidfVectorizer(
                                        max_features=max_features,
                                        ngram_range=ngram_range,
                                        stop_words=stop_words,
                                        min_df=min_df,
                                        max_df=max_df,
                                        use_idf=use_idf,
                                        smooth_idf=smooth_idf,
                                        sublinear_tf=sublinear_tf,
                                        norm=norm
                                    )

                                    # Umwandeln der Textdaten in TF-IDF-Vektoren
                                    X_train_vec = vectorizer.fit_transform(df_train['Cleaned_Text_Train'])
                                    X_test_vec = vectorizer.transform(df_test['Cleaned_Text_Test'])

                                    # Training des Modells
                                    model = MultinomialNB()
                                    model.fit(X_train_vec, y_train)

                                    # Berechnen der Modellgenauigkeit auf Trainings- und Testdaten
                                    train_accuracy = model.score(X_train_vec, y_train)
                                    test_accuracy = model.score(X_test_vec, y_test)

                                     # Ausgabe der aktuellen Parameterkombination und zugeh√∂rigen Genauigkeit
                                    print(f"Max Features: {max_features}, N-Grams: {ngram_range}, Stopwords: {stop_words}, "
                                          f"min_df: {min_df}, max_df: {max_df}, use_idf: {use_idf}, smooth_idf: {smooth_idf}, "
                                          f"sublinear_tf: {sublinear_tf}, norm: {norm}")
                                    print(f"Trainingsgenauigkeit: {train_accuracy:.4f}, Testgenauigkeit: {test_accuracy:.4f}")

                                    # Speichern der besten Parameterkombination
                                    if test_accuracy > best_score:
                                        best_score = test_accuracy
                                        best_params = {
                                            "max_features": max_features,
                                            "ngram_range": ngram_range,
                                            "stop_words": stop_words,
                                            "min_df": min_df,
                                            "max_df": max_df,
                                            "use_idf": use_idf,
                                            "smooth_idf": smooth_idf,
                                            "sublinear_tf": sublinear_tf,
                                            "norm": norm
                                        }

# Ausgabe der besten Kombination aller Parameter
print("Beste TF-IDF-Kombination:", best_params)

```


*Beste TF-IDF-Kombination basierend auf GridSearch:*

*Max Features: 50000, N-Grams: (1, 3), Stopwords: None, min_df: 2, max_df: 0.9, use_idf: True, smooth_idf: False, sublinear_tf: True, norm: l2*

**Kreuzvalidierung**

```
from sklearn.model_selection import cross_val_score
import itertools

# Festlegen der Parameter und ihrer annehmbaren Werte
max_features_values = [10000, 20000, 50000]
ngram_ranges = [(1,1), (1,2), (1,3)]
stop_words_options = [None, 'english']
min_df_values = [1, 2, 5]
max_df_values = [0.9, 0.95, 1.0]
use_idf_options = [True, False]
smooth_idf_options = [True, False]
sublinear_tf_options = [True, False]
norm_options = ['l1', 'l2', None]

# Variablen zur Speicherung der besten Parameterkombination und der zugeh√∂rigen Genauigkeit
best_score = 0
best_params = {}

# Erstellen einer Liste aller m√∂glichen Parameterkombinationen
param_combinations = itertools.product(
    max_features_values, ngram_ranges, stop_words_options, min_df_values, max_df_values,
    use_idf_options, smooth_idf_options, sublinear_tf_options, norm_options
)

for params in param_combinations:
    max_features, ngram_range, stop_words, min_df, max_df, use_idf, smooth_idf, sublinear_tf, norm = params

    # Durchf√ºhren der Vektorisierung mit aktueller Parameterkombination
    vectorizer = TfidfVectorizer(
        max_features=max_features,
        ngram_range=ngram_range,
        stop_words=stop_words,
        min_df=min_df,
        max_df=max_df,
        use_idf=use_idf,
        smooth_idf=smooth_idf,
        sublinear_tf=sublinear_tf,
        norm=norm
    )

    # Umwandeln der Textdaten in TF-IDF-Vektoren
    X_train_vec = vectorizer.fit_transform(df_train['Cleaned_Text_Train'])

    # Training des Modells
    model = MultinomialNB()

    # Berechnen der Modellgenauigkeit mit Kreuzvalidierung
    scores = cross_val_score(model, X_train_vec, y_train, cv=5, scoring='accuracy')
    mean_score = np.mean(scores)

    # Ausgabe der aktuellen Parameterkombination und zugeh√∂rigen Genauigkeit
    print(f"Params: {params} -> Kreuzvalidierungsgenauigkeit: {mean_score:.4f}")

    # Speichern der besten Parameterkombination
    if mean_score > best_score:
        best_score = mean_score
        best_params = {
            "max_features": max_features,
            "ngram_range": ngram_range,
            "stop_words": stop_words,
            "min_df": min_df,
            "max_df": max_df,
            "use_idf": use_idf,
            "smooth_idf": smooth_idf,
            "sublinear_tf": sublinear_tf,
            "norm": norm
        }


# Ausgabe der besten Kombination aller Parameter
print("Beste TF-IDF-Kombination basierend auf Kreuzvalidierung:", best_params)

```

*Beste TF-IDF-Kombination basierend auf Kreuzvalidierung:*

*Max Features: 50000, N-Grams: (1, 3), Stopwords: None, min_df: 2, max_df: 0.9, use_idf: False, smooth_idf: True, sublinear_tf: True, norm: None*


**Parameter einfügen**

In [None]:
# Build the vectorizer with the parameter combination selected by the cross-validation search
vectorizer = TfidfVectorizer(
    max_features=50000,
    ngram_range=(1, 3),
    stop_words=None,
    min_df=2,
    max_df=0.9,
    use_idf=False,
    smooth_idf=True,
    sublinear_tf=True,
    norm=None,
)

# Learn the vocabulary on the cleaned training texts and vectorize them
X_train_vectorized = vectorizer.fit_transform(df_train['Cleaned_Text_Train'])

# Vectorize the cleaned test texts with the vocabulary learned on the training data
X_test_vectorized = vectorizer.transform(df_test['Cleaned_Text_Test'])

## **Modell**

**Grid-Search**

```
# Festlegen der Alpha-Werte, die getestet werden sollen
alpha_values = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]

# Variablen zur Speicherung des besten Alpha-Werts und der zugeh√∂rigen Genauigkeit
best_accuracy = 0
best_params = {}

# Grid-Search √ºber verschiedene Alpha-Werte
for alpha in alpha_values:
    # Trainieren des Modells mit dem aktuellen Alpha-Wert
    nb_model = MultinomialNB(alpha=alpha)
    nb_model.fit(X_train_vectorized, y_train)

    # Vorhersage auf den Testdaten
    y_pred = nb_model.predict(X_test_vectorized)

    # Berechnen der Genauigkeit auf den Testdaten
    accuracy = accuracy_score(y_test, y_pred)

    # Speichern des besten Alpha-Wert
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_params = {'alpha': alpha}

    #Ausgabe der aktuellen Modellgenauigkeit f√ºr den jeweiligen Alpha-Wert
    print(f"Alpha: {alpha} -> Accuracy: {accuracy:.4f}")

# Ausgabe des beste Alpha-Werts und der zugeh√∂rigen Genauigkeit
print("Bester Alpha-Wert:", best_params)
print("Beste Genauigkeit:", best_accuracy)

```

*Bester Alpha-Wert basierend auf Grid-Search:*

*alpha: 5.0*

**Kreuzvalidierung**

```
# Festlegen der Alpha-Werte, die getestet werden sollen
alpha_values = [0.01, 0.1, 0.5, 1.0, 5.0, 10.0]

# Variablen zur Speicherung des besten Alpha-Werts und der zugeh√∂rigen Genauigkeit
best_alpha = None
best_score = 0

# Durchsuchen der Alpha-Werte mit Kreuzvalidierung
for alpha in alpha_values:

    # Trainieren des Modells mit dem aktuellen Alpha-Wert
    nb_model = MultinomialNB(alpha=alpha)

    # 5-fache Kreuzvalidierung auf dem Trainingsdaten
    scores = cross_val_score(nb_model, X_train_vectorized, y_train, cv=5, scoring='accuracy')
    mean_score = np.mean(scores)

    # Ausgabe der durchschnittlichen Genauigkeit von der Kreuzvalidierung
    print(f"Alpha: {alpha} -> Durchschnittliche Genauigkeit: {mean_score:.4f}")

    # Speichern des besten Alpha-Wert
    if mean_score > best_score:
        best_score = mean_score
        best_alpha = alpha

# Trainieren des Modells mit dem besten Alpha-Wert
final_model = MultinomialNB(alpha=best_alpha)
final_model.fit(X_train_vectorized, y_train)

# Vorhersage treffen auf den Testdaten
y_pred = final_model.predict(X_test_vectorized)

# Ausgabe des besten Alpha-Wert und der zugeh√∂rigen Kreuzvalidierungsgenauigkeit
print("\nBestes Alpha:", best_alpha)
print("Beste Kreuzvalidierungsgenauigkeit:", best_score)

```

*Bester Alpha-Wert basierend auf Kreuzvalidierung:*

*alpha: 1.0*

**Modell**

In [None]:
# --- Model training and measurement of the training time ---
# time.perf_counter() is used for all timings: it is the high-resolution clock intended
# for measuring short durations and is not affected by system clock adjustments.
start_training = time.perf_counter()

# Initialize the Naive Bayes model
# Parameter taken from the cross-validation search:
nb_model = MultinomialNB(alpha=1.0)

# Train the model on the vectorized training data
nb_model.fit(X_train_vectorized, y_train)

# Duration of the model training in seconds
training_time = time.perf_counter() - start_training


# --- Prediction and measurement of the prediction time ---
start_prediction = time.perf_counter()

# Predict the sentiment for the test dataset
y_pred = nb_model.predict(X_test_vectorized)

# Duration of the prediction in seconds
prediction_time = time.perf_counter() - start_prediction


# --- Model evaluation ---
# Precision, recall and F1-score per class as a nested dict
report = classification_report(y_test, y_pred, output_dict=True)

# labels=[0, 1] pins the matrix to 2x2 in the order negative/positive, so the
# four-value unpacking below stays valid regardless of which classes occur
conf_matrix = confusion_matrix(y_test, y_pred, labels=[0, 1])

# Flattened row by row: true negatives, false positives, false negatives, true positives
tn, fp, fn, tp = conf_matrix.ravel()


# --- Test samples and their predictions ---
# Example texts for the sentiment analysis
samples = [
    "The product quality is amazing, and the design is great. Highly recommend it!",  # positive
    "I had an excellent experience, fast delivery, and outstanding customer service!",  # positive
    "The product was defective and customer service was unhelpful. Terrible experience.",  # negative
    "This is the worst purchase I've ever made. Do not buy this product!"  # negative
]

# Predicted sentiment and class probabilities for every example text
sample_results = []
for sample in samples:
    # Clean and vectorize the example exactly like the training data
    cleaned_sample_text = preprocess_text(sample)
    sample_vectorized = vectorizer.transform([cleaned_sample_text])

    # Predicted class and the per-class probabilities
    prediction = nb_model.predict(sample_vectorized)[0]
    prediction_proba = nb_model.predict_proba(sample_vectorized)[0]

    # Stored as (prediction, probability positive, probability negative)
    sample_results.append((prediction, prediction_proba[1], prediction_proba[0]))


# --- Output of the evaluation metrics ---
print("Genauigkeit:", accuracy_score(y_test, y_pred))
print("F1-Score Positiv:", report['1']['f1-score'])
print("F1-Score Negativ:", report['0']['f1-score'])
print("Pr√§zision Positiv:", report['1']['precision'])
print("Pr√§zision Negativ:", report['0']['precision'])
print("Recall Positiv:", report['1']['recall'])
print("Recall Negativ:", report['0']['recall'])
print("True Positives (TP):", tp)
print("True Negatives (TN):", tn)
print("False Positives (FP):", fp)
print("False Negatives (FN):", fn)
print("Trainingszeit (Sek.):", training_time)
print("Vorhersagezeit (Sek.):", prediction_time)

# Output of the predictions and probabilities for the example texts
for i, result in enumerate(sample_results, 1):
    prediction, prob_positive, prob_negative = result
    sentiment = "Positiv" if prediction == 1 else "Negativ"
    print(f"Sample {i} Vorhersage: {sentiment}, Wahrscheinlichkeit Positiv: {prob_positive:.4f}, Wahrscheinlichkeit Negativ: {prob_negative:.4f}")


Genauigkeit: 0.879
F1-Score Positiv: 0.8801980198019802
F1-Score Negativ: 0.8777777777777778
Pr√§zision Positiv: 0.8907815631262525
Pr√§zision Negativ: 0.8672654690618763
Recall Positiv: 0.8698630136986302
Recall Negativ: 0.8885480572597138
True Positives (TP): 889
True Negatives (TN): 869
False Positives (FP): 109
False Negatives (FN): 133
Trainingszeit (Sek.): 0.08388566970825195
Vorhersagezeit (Sek.): 0.01555943489074707
Sample 1 Vorhersage: Positiv, Wahrscheinlichkeit Positiv: 1.0000, Wahrscheinlichkeit Negativ: 0.0000
Sample 2 Vorhersage: Positiv, Wahrscheinlichkeit Positiv: 0.9694, Wahrscheinlichkeit Negativ: 0.0306
Sample 3 Vorhersage: Negativ, Wahrscheinlichkeit Positiv: 0.0000, Wahrscheinlichkeit Negativ: 1.0000
Sample 4 Vorhersage: Negativ, Wahrscheinlichkeit Positiv: 0.0000, Wahrscheinlichkeit Negativ: 1.0000


**Filter der Bewertungen**

In [None]:
# Keywords for reviews that refer to the shop, the service or the delivery
keywords = [
    "shipping", "delivery", "shipment", "package", "tracking", "courier", "order",
    "customer service", "support", "seller", "store", "shop",
    "ordering", "replacement", "exchange", "policy", "warranty",
    "payment", "checkout", "invoice", "delay", "wrong item",
    "processing", "cancellation", "response", "email", "assistance"
]

# Pre-compiled alternation of all keywords. The leading \b requires a keyword to start
# at a word boundary, so "order" no longer fires inside "border" and "shop" not inside
# "photoshop", while inflected forms such as "orders" or "delayed" still match.
# The pattern is built once from `keywords`; rebuild it if the list is changed afterwards.
_KEYWORD_PATTERN = re.compile(
    r'\b(?:' + '|'.join(re.escape(keyword) for keyword in keywords) + r')'
)


def ist_irrelevant(text):
    """Return True if the text mentions one of the shop/delivery keywords.

    Matching is case-insensitive (the text is lowercased first) and a keyword
    only counts when it starts at a word boundary.

    Args:
        text: The review text. Non-string values (e.g. NaN for a missing
            review) are treated as containing no keyword.

    Returns:
        True if at least one keyword occurs in the text, otherwise False.
    """
    if not isinstance(text, str):
        return False
    return _KEYWORD_PATTERN.search(text.lower()) is not None


## **Anwendung**

**Reale Bewertungen von Amazon**

In [None]:
# Real reviews, ordered by star rating.
# The list holds 30 English review texts: six for each rating from 1 to 5 stars.
# The star rating is only recorded in the section comments below, not in the data itself.

sample_texts = [
    # 1-star reviews
    "We bought the Air Tags for our 3 outdoor cats and the result is more than disappointing. Although our cats only spend time around the house, neither the search function nor the ‚Äúplay sound‚Äù call signal work. It shows me where they should currently be, but that too is only very vague and is only updated very rarely. Then it's last seen at this and that time and that was in some cases already several hours ago. What else can you say... definitely not recommended for the high price.",
    "The first layer of insoles had come off before use. Since I have to wear orthopedic shoe insoles, they should come out anyway. They were so stuck that it took me 20 minutes. And the material spread everywhere almost by itself. Like blue powder in shoes as well as on carpet.",
    "I was expecting these sheets to be of decent quality after reading other people's reviews. However, I found the fabric to be very thin compared and there were quality issues in the manufacturing because the stitching was open in some parts of the zippers. Definitively not worth for this price!!! The more basic option from IKEA is cheaper and way better quality than this product.",
    "After 2 hours of assembly with skilled tinkers, checks after checks, there was an incompatible part on the trampoline ring... so I had to pack everything up and return the trampoline at my expense. Big pain and huge disappointment for the children...",
    "I'm halfway through the book, and I keep finding questions that are double (but have different answers). Many questions answer i doubt, and some are worded so poorly that it is impossible to answer them correctly. I have been browsing the internet for a year now, studying for my exam, and the more I read this book, the more I feel that many of these questions are just copied and pasted from different websites.",
    "24 Red Bull carriers. And there are simply 3 doses missing. Just like that. Delivery delivered incorrectly twice and has not been brought to the right place. Then a large box in a small box where the Red Bull carrier was opened and 3 cans are simply missing. Never again!!",

    # 2-star reviews
    "Quality is good for the price, but the fitted sheet is larger than described. Otherwise, the dimensions always fit, but not with this product, it is already much too big and does not fit properly.",
    "Unfortunately, the on/off controller is right next to the handle of the lint collection container.. every time you turn it off, you open the collection container at a flat rate and all lint and ground matter is spread around the room.",
    "I bought this as a gift but would be embarrassed to give it. There has been some water damage or it has been stored in a damp environment, as the cover and pages are ‚Äòwavy‚Äô In addition, there was a large, greasy hand print across the matt-finished front cover.",
    "Obviously I didn't expect quality comparable to other more famous brands, but they are worse than I thought. Poor quality of the fabric and especially of the velcro closure, which after 3/4 uses was already very worn out. Failed.",
    "The instructions are wrong in several places, and the pieces and bolts don‚Äôt fit together easily AT ALL. Ive been working on it for two days now and it‚Äôs always something. How to put together needs to be changed because this is ridiculous.",
    "The lid does not close properly on the thread, so you have to press on it again from above. The sieve sometimes falls through and is not suitable for the container.",

    # 3-star reviews
    "The solubility of this product is average. Almost 20% is not dissolved in the water.",
    "Not the very good quality but medium value. Environmentally friendly but not pleasant for everyone's skin. Felt here they had saved a bit of everything and everything",
    "I think it's a great scale. Because it has many functions and can be connected to a mobile app. Body fat, muscle mass... and much is visible. It also has a nice design and is not high so it fits well under my bathroom cabinet. It's actually a shame to shove them under the cupboard. But I don't have that much space in the bathroom.",
    "The part itself is well made and makes a sturdy impression. The feeling on the scalp is very good. Unfortunately, I noticed that the wood splinters on the handle, which fits so well in the hand. I would therefore not buy the product again.",
    "Unfortunately, it is easy to overlook the fact that this large pack comes without a handle. It probably costs a maximum of 5 cents to produce. It is annoying to search everywhere to see if you still have such a plastic piece lying somewhere.",
    "The taste is really good and when it comes to espresso macchiato, you don't have many to choose from. Unfortunately, 1-3 of the bottles had already tipped over with almost every delivery. I can't decipher the expiration date, but I think the locking system is not optimal.",

    # 4-star reviews
    "That was my first matcha tea - simply delicious. At the same time, I bought another matcha tea from my trusted Asian store. There are serious differences. However, I have to say that I won't order again for the price and will have to subtract one star.",
    "Great handy jump rope The length could be easily adjusted. When stored in the included bag, the rope must be ‚Äústraightened‚Äù slightly before use",
    "The cable does what it should, but what bothers me is the price per meter. For 11 euros, I already get 5 meters of cable. Well everything else is great!",
    "I bought the microphone for recording ASMR and sound music. Visually, it makes a good impression, there are several adapters, so it can be connected to any device with USB or micro USB, etc. Now about sound quality: even if you have a few options directly on the microphone to ‚Äúadjust‚Äù it, it is not optimal sound quality. Echo function is good, 'mute' mode is also okay. When setting the microphone sensitivity, the recordings have an unpleasant noise, the more sensitive you adjust, the louder it rushes. For normal podcasts or interviews, for people who are still ‚Äúat the beginning‚Äù, the price/performance ratio is okay. However, anyone who values sound quality can and should go for a more expensive, better-made product.",
    "Visually, the knife block looks good. The knives also look high-quality and fit well in the hand. However, they could definitely be sharper. However, they are sufficient for everyday use.",
    "The product works really well. The software is easy to use The software is actually the most important and it is very good. Temperature measurement is also very well tested with a third-party device for comparison. What is less good is the connection to the smartphone, the range was certainly not reached by me. But it can also be due to other circumstances.",

    # 5-star reviews
    "We already have cups from this brand and because we are very satisfied with them, we ordered some more. For me, it is particularly positive that they can be completely washed in the dishwasher. With virtually all other thermal mugs, the mug itself cannot be put in the dishwasher. Usually just the additional parts, if any. In addition, the cups also look very chic and, for my taste, keep warm enough. In addition, the lid works perfectly and is absolutely tight. Unfortunately, the function of the new comfort lock is not fully understood to me, except that it is an intermediate space that cannot be cleaned properly. Otherwise I love the cups and would buy them again.",
    "The device does exactly what it is supposed to do. That is exactly what I expect, without tinkling. Some authors describe that the device is loud and takes a long time, but I don't feel that way. I can also read the scale of the fill height, in contrast to others who complain about this. There is no difference compared to the old device. The metal case gets really warm on the outside, was to be expected. The cable is long enough, the lid opens so that you can easily clean. Good price/quality ratio.",
    "Had originally ordered a Ninja double chamber air fryer. 3√ó from different vendors and dented and damaged each time, no quality, why just this hype? I therefore chose the Cosori hot air fryer and I am extremely satisfied. Great fryer, very good workmanship and quality!! Contrary to various reviews, the function keys at the top are very easy to read and adjust. I can only recommend it with a clear conscience üòÄ",
    "I bought the cylinder on sale, the shipping was super fast and the goods delivered to the house the next day and were super packaged. The cylinder came in an original box, which we have now also placed on the shelf, as this is our spare cylinder and you can easily remove the cartridge from there. The additional packaging was very safe and arrived undamaged.",
    "The Bonsenkitchen electric milk frother is a real game changer for my morning coffee! The handheld milk frother is lightweight, handy and battery-operated, which makes it super practical for everyday use. In just a few seconds, it creates a wonderfully creamy milk froth ‚Äî perfect for my cappuccino or latte macchiato. It is also ideal for matcha or other drinks. The stainless steel whisk looks stable and the speed is just right for a great result. I particularly like the fact that the milk frother is easy to clean. After use, simply rinse it briefly under running water and it is ready for use again. Absolutely unbeatable for the price and a must for every coffee lover!",
    "The NESCAF√â DOLCE GUSTO Caf√© au Lait Decaffeinato is now my absolute favorite among coffee capsules! The taste is wonderfully mild and full-bodied, and best of all: It is low in carbohydrates and caffeine-free, which is perfect to enjoy at any time of day. Unfortunately, the caffeine-free version is rarely found in retail stores, which is why I am all the happier to get it via Amazon. Thanks to Prime, the 3-pack was delivered very quickly, which is another plus point. The 48 portions are enough for me for a while, and the quality is consistently convincing. For anyone looking for caffeine-free coffee with a full flavor, a clear recommendation!",

]


# Classify every real review: flag shop/delivery reviews, then predict the sentiment
for i, text in enumerate(sample_texts, start=1):

    # A review is a product review unless it mentions one of the filter keywords
    relevant = not ist_irrelevant(text)
    if relevant:
        status = "Produkt Bewertung"
    else:
        status = "Sonstige Bewertung"
    print(f"{i}. {status}: {text}")

    # Preprocess the review and vectorize it as a one-document batch
    cleaned_text = preprocess_text(text)
    sample_vectorized = vectorizer.transform([cleaned_text])

    # Predicted class and the probability of each class for that single document
    prediction = nb_model.predict(sample_vectorized)
    prediction_proba = nb_model.predict_proba(sample_vectorized)

    # Report the label, then column 0 as negative and column 1 as positive
    print("Sentiment:", ("Negativ", "Positiv")[int(prediction[0] == 1)])
    print("Negativ:", prediction_proba[0][0])
    print("Positiv:", prediction_proba[0][1])
    print("-")



1. Produkt Bewertung: We bought the Air Tags for our 3 outdoor cats and the result is more than disappointing. Although our cats only spend time around the house, neither the search function nor the ‚Äúplay sound‚Äù call signal work. It shows me where they should currently be, but that too is only very vague and is only updated very rarely. Then it's last seen at this and that time and that was in some cases already several hours ago. What else can you say... definitely not recommended for the high price.
Sentiment: Negativ
Negativ: 0.99999989723438
Positiv: 1.0276565479974189e-07
-
2. Produkt Bewertung: The first layer of insoles had come off before use. Since I have to wear orthopedic shoe insoles, they should come out anyway. They were so stuck that it took me 20 minutes. And the material spread everywhere almost by itself. Like blue powder in shoes as well as on carpet.
Sentiment: Negativ
Negativ: 0.9060461005703869
Positiv: 0.09395389942959675
-
3. Produkt Bewertung: I was expecti

**Reale Bewertungen anderer Shops**

In [None]:
# Nine English review texts from other shops, three per shop.
# NOTE: this rebinds `sample_texts`, replacing the Amazon review list assigned in the previous cell.
sample_texts = [
    # Reviews from Otto
    "Exactly what I was looking for. The garment has to fit and be very comfortable, especially in my free time at home. This is the case. Nice and soft inside. The advice from other customers to order a little smaller was correct. I usually tend to go for a L, but took an M here. I have since bought two more.",
    "A package for 120 euros was not delivered to me and nobody took care of it, I do not recommend it.",
    "I ordered it when it arrived it was too short. But I didn't care at that moment because this belt is too thin and sensitive to pressure. Don't buy it I have a lot of experience with belts and that's not possible.",

    # Reviews from MediaMarkt
    "Cons: Changeover due to missing home button. The Iphone 14 has replaced an Iphone 8 and there really is a world of difference. Initially, the familiar home button is missing, but you soon get used to it. The new Iphone has a great display and a really good camera. Otherwise, it's just familiar Apple quality.",
    "I use the appliance every day, especially the Crema coffee in the morning. My son loves this coffee too. The appliance cleans itself really well before starting. The operation and description are self-explanatory. It's a real eye-catcher in the kitchen. Grinder runs great (about 3.30 in the morningüòâ). Unfortunately, we had to push the small water hose in the appliance a little because it had a kink, so that the water ran through drop by drop or very slowly. The milk frother also works great so that nothing stands in the way of a latte. Otherwise, this is a coffee machine that can only make a coffee lover happy.",
    "I ordered my PS5 in a bundle with GOW on 1.12.2022 directly at 10 o'clock in the morning in a Media Markt and have not received anything until today. What is this? Where is my PS5? I will never buy anything from Media Markt again.",

    # Reviews from IKEA
    "I like the design. Form and function go hand in hand. Installation in old buildings is challenging. Washers are a good addition. + Handle well integrated into the shape + Smooth-running + Good price-performance ratio + Plenty of storage space, not just for shoes - Material for the mechanism not robust enough - Assembly requires high precision (may be difficult in old buildings) - Dimensional accuracy not sufficient for precise assembly of several units",
    "Unfortunately, the drill holes are so faulty that I could only screw in half of the screws. As a result, the shelf is so unstable that I can't put it on the balcony as planned. It will fall over with the next little breeze! What a pity. It would be ideal in terms of size!",
    "I ordered the basket a few years ago and now wanted another one for my Kallax but the quality is no longer the same... what a shame, the opening to pull it out is also much smaller.",

]


# Classify every review from the other shops: flag shop/delivery reviews, then predict the sentiment
for i, text in enumerate(sample_texts, start=1):

    # A review is a product review unless it mentions one of the filter keywords
    relevant = not ist_irrelevant(text)
    if relevant:
        status = "Produkt Bewertung"
    else:
        status = "Sonstige Bewertung"
    print(f"{i}. {status}: {text}")

    # Preprocess the review and vectorize it as a one-document batch
    cleaned_text = preprocess_text(text)
    sample_vectorized = vectorizer.transform([cleaned_text])

    # Predicted class and the probability of each class for that single document
    prediction = nb_model.predict(sample_vectorized)
    prediction_proba = nb_model.predict_proba(sample_vectorized)

    # Report the label, then column 0 as negative and column 1 as positive
    print("Sentiment:", ("Negativ", "Positiv")[int(prediction[0] == 1)])
    print("Negativ:", prediction_proba[0][0])
    print("Positiv:", prediction_proba[0][1])
    print("-")



1. Sonstige Bewertung: Exactly what I was looking for. The garment has to fit and be very comfortable, especially in my free time at home. This is the case. Nice and soft inside. The advice from other customers to order a little smaller was correct. I usually tend to go for a L, but took an M here. I have since bought two more.
Sentiment: Positiv
Negativ: 0.000899960377872641
Positiv: 0.9991000396220926
-
2. Sonstige Bewertung: A package for 120 euros was not delivered to me and nobody took care of it, I do not recommend it.
Sentiment: Negativ
Negativ: 0.9999629887867882
Positiv: 3.7011213214328214e-05
-
3. Sonstige Bewertung: I ordered it when it arrived it was too short. But I didn't care at that moment because this belt is too thin and sensitive to pressure. Don't buy it I have a lot of experience with belts and that's not possible.
Sentiment: Negativ
Negativ: 0.9999999999636202
Positiv: 3.643632113503624e-11
-
4. Produkt Bewertung: Cons: Changeover due to missing home button. The I