# In [None]:  (notebook cell prompt left over from the export)
# 📦 Pakete importieren
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score

# 🔧 Alternative stop-word list (usable offline)
# Written as whitespace-separated words and split into a set of lowercase tokens.
basic_stopwords = set(
    "the and to of a in that is for on with "
    "as was at by an be this have from or it "
    "are not has but had they you he she we will "
    "their his her about would there what when which "
    "who were can them all been one if".split()
)

# 📁 Load the CSV files and label them: rows from True.csv get 0, rows from Fake.csv get 1
true_df = pd.read_csv("/content/True.csv").assign(label=0)
fake_df = pd.read_csv("/content/Fake.csv").assign(label=1)

# 🧾 Merge both frames into one table with a fresh, continuous index
combined_df = pd.concat((true_df, fake_df), ignore_index=True)

# 📊 Exploratory data analysis
# Word count per article. A missing text counts as 0 words; without fillna,
# str(NaN) is the one-word string "nan" and would be counted as 1.
combined_df["text_length"] = (
    combined_df["text"].fillna("").astype(str).str.split().str.len()
)

# Map the numeric class to a readable name and let seaborn build the legend
# from it. Overriding the entries with plt.legend(labels=[...]) attaches the
# labels to the artists in drawing order, which does not follow the 0/1 hue
# order, so the two classes can end up carrying each other's name.
# The Series name ("Label") is what seaborn shows as the legend title.
label_names = combined_df["label"].map({0: "Real (0)", 1: "Fake (1)"}).rename("Label")

plt.figure(figsize=(10, 6))
sns.histplot(
    data=combined_df,
    x="text_length",
    hue=label_names,
    hue_order=["Real (0)", "Fake (1)"],  # keeps the original 0-then-1 color order
    bins=100,
    kde=True,
    element="step",
)
plt.title("Verteilung der Textlängen in echten und gefälschten Artikeln")
plt.xlabel("Wortanzahl im Artikel")
plt.ylabel("Anzahl Artikel")
plt.grid(True)
plt.tight_layout()
plt.show()

# Word cloud helper
def create_wordcloud(text, title):
    """Build a word cloud from *text* and display it with *title* as heading.

    Args:
        text: The text the word cloud is generated from.
        title: Heading shown above the rendered image.

    Words listed in ``basic_stopwords`` are passed to ``WordCloud`` as stop
    words, and collocations are disabled. The figure is shown immediately via
    ``plt.show()``; nothing is returned.
    """
    cloud_options = {
        "width": 800,
        "height": 400,
        "background_color": "white",
        "stopwords": basic_stopwords,
        "collocations": False,
    }
    cloud_image = WordCloud(**cloud_options).generate(text)

    plt.figure(figsize=(10, 5))
    plt.imshow(cloud_image, interpolation="bilinear")
    plt.axis("off")  # hide the axes around the image
    plt.title(title, fontsize=16)
    plt.tight_layout()
    plt.show()

# Generate the word clouds: fake articles (label 1) first, then real ones (label 0)
for class_id, cloud_title in ((1, "WordCloud – Fake News"), (0, "WordCloud – Real News")):
    # Missing texts are dropped before joining all articles of the class
    class_texts = combined_df.loc[combined_df["label"] == class_id, "text"].dropna()
    create_wordcloud(" ".join(class_texts), cloud_title)

# 🤖 Machine learning: classification
# Missing articles become empty documents. astype(str) alone would turn NaN
# into the literal string "nan", which the vectorizer would treat as a word;
# the word-cloud step already leaves missing texts out.
X = combined_df["text"].fillna("").astype(str)
y = combined_df["label"]

# Hold out 20 % of the rows for evaluation; fixed seed for a repeatable split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# The vectorizer is fitted on the training split only and then applied
# unchanged to the test split.
tfidf_vectorizer = TfidfVectorizer(max_features=5000, stop_words="english")
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
X_test_tfidf = tfidf_vectorizer.transform(X_test)

rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model.fit(X_train_tfidf, y_train)

y_pred = rf_model.predict(X_test_tfidf)

# 📈 Results
# target_names follows the label coding used when the data was loaded: 0 = Real, 1 = Fake
print("🔎 Accuracy:", accuracy_score(y_test, y_pred))
print("\n📊 Classification Report:\n", classification_report(y_test, y_pred, target_names=["Real", "Fake"]))
print("🧩 Confusion Matrix:\n", confusion_matrix(y_test, y_pred))


# Test