<a href="https://colab.research.google.com/github/Amit-sheikh/Amit-sheikh/blob/main/ml_dl_nlp.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [7]:
# --- Fake vs Real News detection from News-_dataset.zip ---
import zipfile
import os
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import classification_report, accuracy_score

# === 1Ô∏è‚É£ ZIP ‡¶´‡¶æ‡¶á‡¶≤ extract ‡¶ï‡¶∞‡¶æ ===
zip_path = "News-_dataset.zip"
extract_dir = "news_dataset_extracted"

if not os.path.exists(extract_dir):
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(extract_dir)
    print("‚úÖ ZIP extracted successfully!")
else:
    print("‚ö° Already extracted.")

# === 2Ô∏è‚É£ CSV ‡¶´‡¶æ‡¶á‡¶≤‡¶ó‡ßÅ‡¶≤‡ßã ‡¶≤‡ßã‡¶° ‡¶ï‡¶∞‡¶æ ===
fake_path = os.path.join(extract_dir, "Fake.csv")
true_path = os.path.join(extract_dir, "True.csv")

fake = pd.read_csv(fake_path, on_bad_lines='skip', encoding='utf-8')
true = pd.read_csv(true_path, on_bad_lines='skip', encoding='utf-8')

# ‡¶ï‡ßã‡¶® ‡¶ï‡¶≤‡¶æ‡¶Æ‡ßá ‡¶ü‡ßá‡¶ï‡ßç‡¶∏‡¶ü ‡¶Ü‡¶õ‡ßá ‡¶∏‡ßá‡¶ü‡¶æ ‡¶ñ‡ßÅ‡¶Å‡¶ú‡ßá ‡¶¨‡ßá‡¶∞ ‡¶ï‡¶∞‡¶æ
def find_text_column(df):
    possible_cols = ['text', 'content', 'title', 'headline', 'description']
    for col in df.columns:
        if col.lower() in possible_cols:
            return col
    for col in df.columns:
        if df[col].dtype == object:
            return col
    return df.columns[0]

fake_text_col = find_text_column(fake)
true_text_col = find_text_column(true)

# ‡¶≤‡ßá‡¶¨‡ßá‡¶≤ ‡¶§‡ßà‡¶∞‡¶ø ‡¶ï‡¶∞‡¶æ
fake_df = pd.DataFrame({'text': fake[fake_text_col].astype(str), 'label': 0})
true_df = pd.DataFrame({'text': true[true_text_col].astype(str), 'label': 1})

# ‡¶Æ‡¶ø‡¶∂‡¶ø‡ßü‡ßá ‡¶¶‡ßá‡¶ì‡ßü‡¶æ (shuffle)
df = pd.concat([fake_df, true_df], ignore_index=True).sample(frac=1, random_state=42).reset_index(drop=True)

print(f"üìä Total samples: {len(df)}")
print(df.head())

# === 3Ô∏è‚É£ Train/Test Split ===
X = df['text']
y = df['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# === 4Ô∏è‚É£ TF-IDF Vectorization ===
tfidf = TfidfVectorizer(max_features=20000, ngram_range=(1,2), stop_words='english')
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

# === 5Ô∏è‚É£ Logistic Regression ===
print("\nüöÄ Training Logistic Regression...")
lr = LogisticRegression(max_iter=1000)
lr.fit(X_train_tfidf, y_train)
y_pred_lr = lr.predict(X_test_tfidf)

print("\nüìà Logistic Regression Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_lr))
print(classification_report(y_test, y_pred_lr, target_names=['Fake','True']))

# === 6Ô∏è‚É£ Naive Bayes ===
print("\nüöÄ Training Naive Bayes...")
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred_nb = nb.predict(X_test_tfidf)

print("\nüìà Naive Bayes Results:")
print("Accuracy:", accuracy_score(y_test, y_pred_nb))
print(classification_report(y_test, y_pred_nb, target_names=['Fake','True']))

print("\n‚úÖ Done!")


‚úÖ ZIP extracted successfully!
üìä Total samples: 44898
                                                text  label
0  Ben Stein Calls Out 9th Circuit Court: Committ...      0
1  Trump drops Steve Bannon from National Securit...      1
2  Puerto Rico expects U.S. to lift Jones Act shi...      1
3   OOPS: Trump Just Accidentally Confirmed He Le...      0
4  Donald Trump heads for Scotland to reopen a go...      1

üöÄ Training Logistic Regression...

üìà Logistic Regression Results:
Accuracy: 0.9465478841870824
              precision    recall  f1-score   support

        Fake       0.96      0.94      0.95      4696
        True       0.93      0.95      0.94      4284

    accuracy                           0.95      8980
   macro avg       0.95      0.95      0.95      8980
weighted avg       0.95      0.95      0.95      8980


üöÄ Training Naive Bayes...

üìà Naive Bayes Results:
Accuracy: 0.9413140311804009
              precision    recall  f1-score   support

        Fake

FileNotFoundError: [Errno 2] No such file or directory: 'combined_news.csv'