In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score
from sklearn.feature_extraction.text import TfidfVectorizer
from transformers import pipeline
import joblib
import os

# Load the cleaned CSV file
df = pd.read_csv("df.csv")

# Step 1: add sentiment analysis
print("Starte Sentimentanalyse...")

# Hugging Face Transformers pipeline that performs the sentiment analysis.
# NOTE(review): no model is named here, so whatever the installed library
# picks as the default for this task is used — consider pinning one.
sentiment_analyzer = pipeline("sentiment-analysis")

# Make sure every entry is a string: missing values become "" before the cast
filtered_content = df['Filtered_Content'].fillna("")
df['Filtered_Content'] = filtered_content.astype(str)

# Compute the sentiment of a single text
def compute_sentiment(text):
    """Return ``(label, score)`` for *text* using ``sentiment_analyzer``.

    ``sentiment_analyzer`` is the module-level pipeline. Any value that is
    not a non-empty string yields ``("NEUTRAL", 0.0)`` without invoking it.
    """
    if not isinstance(text, str) or not text:
        return "NEUTRAL", 0.0  # default values for invalid entries

    # The 512-character cap bounds the work per row, but the model's input
    # limit is counted in tokens, and a 512-character string can tokenize to
    # more than that. truncation=True lets the tokenizer clip such input
    # instead of the pipeline failing on it.
    result = sentiment_analyzer(text[:512], truncation=True)[0]
    return result['label'], result['score']

# Compute the sentiment label and score for every row
sentiment_pairs = [compute_sentiment(entry) for entry in df['Filtered_Content']]
# Unzip the (label, score) pairs into one tuple per target column
sentiment_labels, sentiment_scores = zip(*sentiment_pairs)
df['Sentiment'] = sentiment_labels
df['Sentiment_Score'] = sentiment_scores


# Write the results to a file
df.to_csv("sentiment_results.csv", index=False)
print("Sentimentanalyse abgeschlossen und gespeichert.")

# Step 2: TF-IDF vectorization (combined with Step 3: train/test split)
print("Starte TF-IDF Vektorisierung...")
y = df['NegoOutcomeLabel']

# Step 3: train/test split
# Split the raw texts BEFORE fitting the vectorizer so that vocabulary
# selection and IDF weights are learned from the training rows only; fitting
# on all rows would leak information from the held-out set into training.
# test_size and random_state are the same as before, so the row partition
# is expected to be unchanged.
texts_train, texts_test, y_train, y_test = train_test_split(
    df['Filtered_Content'], y, test_size=0.2, random_state=42
)

# Limit to 1000 features for better performance
tfidf_vectorizer = TfidfVectorizer(max_features=1000)
X_train = tfidf_vectorizer.fit_transform(texts_train)
X_test = tfidf_vectorizer.transform(texts_test)

# Matrix for the full corpus, kept under its original name for any later
# code that reads it; it is produced with the training-fitted vectorizer.
X_tfidf = tfidf_vectorizer.transform(df['Filtered_Content'])

# Persist the fitted TF-IDF vectorizer
joblib.dump(tfidf_vectorizer, "tfidf_vectorizer.pkl")

# Step 4: train a more complex model
print("Starte Training eines RandomForest-Klassifikators...")
rf_classifier = RandomForestClassifier(n_estimators=100, random_state=42)
rf_classifier.fit(X_train, y_train)

# Predict on the held-out split
y_pred = rf_classifier.predict(X_test)

# Evaluate: overall accuracy followed by the per-class text report
print("Modellergebnisse:")
model_accuracy = accuracy_score(y_test, y_pred)
print("Model Accuracy:", model_accuracy)
report_text = classification_report(y_test, y_pred)
print("\nClassification Report:\n", report_text)

# Persist the trained model
joblib.dump(rf_classifier, "random_forest_model.pkl")

# Step 5: save the evaluation results
# With output_dict=True the report is a nested mapping: one entry per class
# label and per average holding the metric values, plus an overall
# "accuracy" entry.
report_dict = classification_report(y_test, y_pred, output_dict=True)
results = {
    "accuracy": accuracy_score(y_test, y_pred),
    "classification_report": report_dict,
}

# Lay the report out as a table with one row per label/average and one column
# per metric. The index carries the row labels, so it must be written to the
# file; otherwise the rows cannot be told apart.
results_df = pd.DataFrame(report_dict).transpose()
results_df.to_csv("model_results.csv", index_label="label")

print("Alle Ergebnisse gespeichert.")

# Document the top TF-IDF features
feature_names = tfidf_vectorizer.get_feature_names_out()
feature_importances = rf_classifier.feature_importances_

# Pair each importance with its feature name, order the pairs from largest to
# smallest, and keep the first 50.
ranked_features = list(zip(feature_importances, feature_names))
ranked_features.sort(reverse=True)
important_features = ranked_features[:50]

important_features_df = pd.DataFrame(
    important_features,
    columns=["Importance", "Feature"],
)
important_features_df.to_csv("top_tfidf_features.csv", index=False)

print("Top TF-IDF Features gespeichert.")

# Step 6: relate sentiment to negotiation outcome
print("Berechne Korrelation zwischen Sentiment und Erfolg...")
# Count rows per (sentiment, outcome) pair, then pivot the outcome level into
# columns; combinations that never occur are filled with 0.
pair_counts = df.groupby(['Sentiment', 'NegoOutcomeLabel']).size()
sentiment_success = pair_counts.unstack(fill_value=0)
sentiment_success.to_csv("sentiment_success_correlation.csv")

print("Korrelationsergebnisse gespeichert.")
