In [None]:
%pip install numpy tensorflow

In [3]:
import pandas as pd

In [None]:
%pip install nltk
%pip install seaborn

In [None]:
%pip install scikit-learn

In [6]:
import seaborn as sns
import matplotlib.pyplot as plt
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelBinarizer
from nltk.corpus import stopwords, wordnet
from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize,sent_tokenize
from nltk.tokenize.toktok import ToktokTokenizer
from nltk.stem import LancasterStemmer,WordNetLemmatizer
from nltk import pos_tag
from sklearn.linear_model import LogisticRegression,SGDClassifier
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC
from sklearn.metrics import classification_report,confusion_matrix,accuracy_score

In [7]:
# Load the review data set; later cells read its "comment" and "sentiment" columns.
df = pd.read_csv("movie_reviews.csv")

# Vorverarbeitung der Texte

In [None]:
# English stop words (fetch the NLTK corpus before reading it)
nltk.download('stopwords')
stopword_list = stopwords.words('english')

# Tokenizer instance shared by the preprocessing helpers
tokenizer = nltk.ToktokTokenizer()

In [9]:
# Tokenize a text and drop stop words
def tokenize(text):
    """Split ``text`` into tokens and remove stop words.

    Uses the module-level ``tokenizer`` and ``stopword_list``. Tokens are
    stripped of surrounding whitespace; the stop-word check is
    case-insensitive, but the returned tokens keep their original casing.

    Returns:
        list of the remaining tokens, in their original order.
    """
    kept = []
    for raw_token in tokenizer.tokenize(text):
        candidate = raw_token.strip()
        if candidate.lower() in stopword_list:
            continue
        kept.append(candidate)
    return kept

In [10]:
# Apply the function to the "comment" column.
# NOTE: this overwrites the raw text in place (each value becomes a list of
# tokens), so the cell is not idempotent -- reload df before re-running it.
df['comment']=df['comment'].apply(tokenize)

In [None]:
# Inspect the first two tokenized comments
print(df['comment'].loc[0:1])

In [None]:
# Lemmatization
# Download the required NLTK data sets
for resource in (
    'wordnet',
    'omw-1.4',
    'averaged_perceptron_tagger_eng',  # for POS tagging
):
    nltk.download(resource)

In [13]:
# Single WordNet lemmatizer instance, used by lemmatize() below
lemmatizer = WordNetLemmatizer()
# Map NLTK part-of-speech (POS) tags to WordNet POS constants.
# Passing the POS to the lemmatizer improves accuracy because each word is
# lemmatized according to its grammatical role.
def get_wordnet_pos(tag):
    """Return the WordNet POS constant matching an NLTK POS tag.

    Tags starting with ``J``, ``V``, ``N`` or ``R`` map to adjective, verb,
    noun and adverb respectively; every other tag falls back to noun.
    """
    prefix_to_pos = (
        ('J', wordnet.ADJ),
        ('V', wordnet.VERB),
        ('N', wordnet.NOUN),
        ('R', wordnet.ADV),
    )
    for prefix, pos in prefix_to_pos:
        if tag.startswith(prefix):
            return pos
    return wordnet.NOUN

def lemmatize(text):
    """POS-tag a sequence of tokens, lemmatize each one and join the result.

    Args:
        text: Sequence of tokens (the output of ``tokenize``), not a raw string.

    Returns:
        A single string with the lemmatized tokens separated by spaces.
    """
    lemmas = []
    for token, nltk_tag in pos_tag(text):
        lemmas.append(lemmatizer.lemmatize(token, get_wordnet_pos(nltk_tag)))
    return ' '.join(lemmas)

# Lemmatize every tokenized comment; each value becomes a plain string again
df['comment'] = df['comment'].apply(lemmatize)



In [None]:
# Inspect the lemmatized comments
print(df.comment)

In [15]:
import re
# Remove special characters and years
def remove_things(text):
    """Strip punctuation, four-digit numbers and non-alphanumeric characters.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Whitespace is left untouched, so removed items
        can leave consecutive or trailing spaces behind.
    """
    # Remove punctuation (anything that is neither a word character nor whitespace)
    text = re.sub(r"[^\w\s]", "", text)

    # Remove years (stand-alone four-digit numbers)
    text = re.sub(r"\b\d{4}\b", "", text)

    # Remove special characters. The range must be A-Z: the former "A-z" also
    # covered the ASCII symbols between "Z" and "a", which let "_" slip through.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text


# Clean every lemmatized comment (punctuation, years, special characters)
df['comment']=df['comment'].apply(remove_things)

In [None]:
# Show the first cleaned comment
df['comment'].loc[0:0]

Term Frequency-Inverse Document Frequency model (TF-IDF)

Es wird verwendet, um Textdokumente in eine Matrix von tf-idf-Merkmalen umzuwandeln.

In [17]:
#Tfidf vectorizer

def tfidf_vectorize(text, tfidf = None):
    """Transform documents into a TF-IDF feature matrix.

    Args:
        text: Iterable of preprocessed document strings to vectorize.
        tfidf: An already fitted ``TfidfVectorizer``. When ``None``, a new
            vectorizer (1- to 3-grams, at most 100000 features) is created
            and fitted on ``text``.

    Returns:
        A ``(matrix, vectorizer)`` tuple: the TF-IDF matrix for ``text`` and
        the fitted vectorizer, so the same vocabulary can be reused for new
        input later.
    """
    if tfidf is None:
        tfidf = TfidfVectorizer(min_df=0.0, max_df=1.0, use_idf=True,
                                ngram_range=(1, 3), max_features=100000)
        # Fit on the documents that were passed in instead of reaching for the
        # global ``df.comment``; the function no longer depends on notebook state.
        tfidf.fit(text)

    new_text = tfidf.transform(text)
    print(f"Vocabulary size: {len(tfidf.get_feature_names_out())}")

    return new_text, tfidf


In [None]:
# Vectorize all comments and keep the fitted vectorizer for new input later.
# NOTE(review): the vectorizer is fitted on the full data set before the
# train/test split below, so test documents influence the vocabulary/IDF
# weights -- consider fitting on the training split only.
text_count_matrix, fitted_tfidf = tfidf_vectorize(df.comment)

In [None]:
# Print the TF-IDF matrix returned by tfidf_vectorize
print(text_count_matrix)

In [20]:
# Feature matrix and target labels for the classifier
X = text_count_matrix
y = df['sentiment']

In [21]:
# Split into training and test data (70/30, stratified by label)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
    stratify=y,
)

## Neuronales Netz

In [None]:
%pip install tensorflow==2.12

In [23]:
import numpy as np
import tensorflow as tf
import keras
from keras import models

In [24]:
from keras.utils import to_categorical
from keras.layers import Dense, Dropout

In [25]:
# Convert the training features to a dense array
X_train_dense = X_train.toarray()
# One-hot encode the labels: each sample has exactly one class (multi-class,
# not multi-label), which matches the softmax / categorical_crossentropy model.
y_train_onehot = to_categorical(y_train)

In [26]:
# Dense features and one-hot labels of the held-out TEST split. These were
# previously built from X_train / y_train, so the evaluation and the confusion
# matrix were computed on training data.
X_test_dense = X_test.toarray()
y_test_onehot = to_categorical(y_test)

In [27]:
# Split a validation set off the training data (15 %, stratified by label)
X_train_final, X_val, y_train_final, y_val = train_test_split(
    X_train_dense,
    y_train_onehot,
    test_size=0.15,
    stratify=y_train_onehot,
    random_state=42,
)

In [28]:
# Feed-forward classifier: three tanh hidden layers, each followed by dropout
seq_model = keras.Sequential([
    # Input layer (input width = number of columns of the dense feature matrix)
    Dense(128, activation='tanh', input_shape=(X_train_dense.shape[1],)),
    Dropout(0.2),
    Dense(64, activation='tanh'),
    Dropout(0.3),
    Dense(32, activation='tanh'),
    Dropout(0.4),
    # Output layer: softmax yields one probability per class (3 classes)
    Dense(3, activation='softmax'),
])

seq_model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['accuracy'],
)

In [None]:
# Train with a batch size of 32 for 10 epochs; the returned history records
# the training and validation loss per epoch.
history = seq_model.fit(
    X_train_final,
    y_train_final,
    batch_size=32,
    epochs=10,
    validation_data=(X_val, y_val),
)

In [None]:
# Evaluate the model.
# The model is compiled with metrics=['accuracy'], so scores[1] is the
# accuracy (scores[0] is the loss).
scores = seq_model.evaluate(X_test_dense, y_test_onehot, verbose=0)
print("Accuracy: %.2f%%" % (scores[1]*100))

In [None]:
# Plot training vs. validation loss per epoch
for history_key, curve_label in (('loss', 'Training Loss'),
                                 ('val_loss', 'Validation Loss')):
    plt.plot(history.history[history_key], label=curve_label)
plt.legend()
plt.show()

In [28]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Confusion Matrix

In [None]:
# 1: Generate predictions (class index with the highest probability)
y_pred_prob = seq_model.predict(X_test_dense)
y_pred = y_pred_prob.argmax(axis=1)

# 2: Convert the one-hot encoded true labels back to class indices
y_test_classes = y_test_onehot.argmax(axis=1)

# 3: Build the confusion matrix
cm = confusion_matrix(y_test_classes, y_pred)

# 4: Visualize it
disp = ConfusionMatrixDisplay(cm, display_labels=["Negative", "Positive", "Neutral"])
disp.plot(cmap=plt.cm.Blues)
plt.show()

# Test Input

In [83]:
def run_NLP_pipeline(text, tfidf):
    """Run the full preprocessing pipeline on new text and vectorize it.

    Args:
        text: A single raw comment (``str``) or an iterable of raw comments.
        tfidf: The fitted vectorizer returned by ``tfidf_vectorize``.

    Returns:
        Dense 2-D array with one row of TF-IDF features per input comment.
    """
    # tokenize() works on one text at a time, so a collection of comments is
    # processed element by element instead of being handed over as a whole.
    raw_texts = [text] if isinstance(text, str) else list(text)

    cleaned = [remove_things(lemmatize(tokenize(raw))) for raw in raw_texts]
    vectors, _ = tfidf_vectorize(cleaned, tfidf = tfidf)
    return vectors.toarray()

In [None]:
# Write your own comment here. Pass a plain string: the tokenize() step works
# on a single text, not on a list.
# NOTE(review): NLTK's English stop-word list appears to contain negations such
# as "not", so they are dropped before vectorizing -- confirm this is intended.
sentence = "This is not good"
sentence_1 = run_NLP_pipeline(sentence, fitted_tfidf)

print(f"Shape of input for model: {sentence_1.shape}")


In [None]:
predictions = seq_model.predict(sentence_1)

# Predicted class = index of the highest probability
predicted_class = np.argmax(predictions, axis=-1)

# Map the class index to its label
class_labels = ["Negative", "Positive", "Neutral"]  # 0, 1, 2
predicted_label = class_labels[predicted_class[0]]

# Format the probabilities for display
formatted_predictions = dict(
    zip(class_labels, (round(prob, 4) for prob in predictions[0]))
)
print(f"Probabilities: {formatted_predictions}")

print(f"Predicted Class Index: {predicted_class[0]}")
print(f"Predicted Label: {predicted_label}")

# Ein paar Visualisierungen

In [None]:
%pip install wordcloud

In [None]:
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt

In [None]:
# Stop words: wordcloud's defaults plus domain-specific words
domain_words = [
    "movie", "film", "scene", "character",
    "plot", "make", "one", "actor", "see",
    "watch", "action", "drama",
]
custom_stopwords = set(STOPWORDS)
custom_stopwords.update(domain_words)
print(custom_stopwords)

WC = WordCloud(
    width=1000,
    height=500,
    max_words=500,
    min_font_size=5,
    stopwords=custom_stopwords,
)

In [None]:
# Word cloud of the comments labelled positive (sentiment == 1)
plt.figure(figsize=(10,10))
positive_text=df[df.sentiment == 1].comment
positive_string = ", ".join(positive_text)
positive_words=WC.generate(positive_string)
plt.imshow(positive_words,interpolation='bilinear')
# plt.show has to be called; the bare name only echoed the function object.
plt.show()

In [None]:
# Word cloud of the comments labelled negative (sentiment == 0)
plt.figure(figsize=(10,10))
negative_text=df[df.sentiment == 0].comment
negative_string = ", ".join(negative_text)
negative_words=WC.generate(negative_string)
plt.imshow(negative_words,interpolation='bilinear')
# plt.show has to be called; the bare name only echoed the function object.
plt.show()

In [None]:
# Word cloud of the comments labelled neutral (sentiment == 2)
plt.figure(figsize=(10,10))
neutral_text=df[df.sentiment == 2].comment
neutral_string = ", ".join(neutral_text)
neutral_words=WC.generate(neutral_string)
plt.imshow(neutral_words,interpolation='bilinear')
# plt.show has to be called; the bare name only echoed the function object.
plt.show()