In [7]:
import numpy as np
import pandas as pd
import re
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
import gradio as gr
import os
from datetime import datetime

# Log function
def log_metrics(message):
    """Append one timestamped line to the metrics log file.

    Args:
        message: Text to record; it is interpolated into the line as-is.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    # Explicit UTF-8: messages embed directory listings and report text, and
    # the platform default codec can fail on characters outside its code page.
    with open(r'D:\Fact_check\model_metrics.txt', 'a', encoding='utf-8') as f:
        f.write(f"[{timestamp}] {message}\n")

# Check files
# List the data directory once and send the same line to stdout and the log.
files = os.listdir('D:\\Fact_check')
files_message = f"Files in D:\\Fact_check: {files}"
print(files_message)
log_metrics(files_message)

# Load and combine datasets
# news.csv is mandatory; the headline file is an optional supplement.
try:
    df_news = pd.read_csv(r'D:\Fact_check\news.csv', index_col=0)
except FileNotFoundError as err:
    # Chain the original error so the failing path stays visible in the traceback.
    raise FileNotFoundError("news.csv not found in D:\\Fact_check") from err

try:
    # Keep only the read inside the try so unrelated errors are not masked.
    df_toi = pd.read_csv(r'D:\Fact_check\india-news-headlines.csv')
except FileNotFoundError:
    print("india-news-headlines.csv not found. Using only news.csv.")
    df_toi = pd.DataFrame()
    log_metrics("india-news-headlines.csv not found. Using only news.csv.")
else:
    df_toi = df_toi.rename(columns={'headline_text': 'text', 'headline_category': 'subject'})
    # Derive a title from the headline itself, truncated to 80 characters.
    # Slice str(x) (not x) so a non-string cell can never hit x[:80].
    df_toi['title'] = df_toi['text'].apply(
        lambda x: (str(x)[:80] + '...') if len(str(x)) > 80 else str(x)
    )
    # NOTE(review): every headline is labelled REAL without verification, and
    # headlines are far shorter than full articles -- confirm this does not
    # teach the model a length/style shortcut.
    df_toi['label'] = 'REAL'
    df_toi = df_toi[['title', 'text', 'label']]
    # Cap the sample at the number of available rows; sample(1000) raises
    # ValueError on a file with fewer than 1000 headlines.
    df_toi = df_toi.sample(n=min(1000, len(df_toi)), random_state=7)

# Stack the news.csv rows first, then the sampled headlines, with a fresh index.
news_subset = df_news[['title', 'text', 'label']]
df_combined = pd.concat([news_subset, df_toi], ignore_index=True)

# Report the size and label distribution on stdout and in the metrics log.
label_counts = df_combined['label'].value_counts()
print(f"Combined dataset shape: {df_combined.shape}")
print(f"Label counts:\n{label_counts}")
log_metrics(f"Combined dataset shape: {df_combined.shape}, Label counts: {label_counts.to_dict()}")

# Check for dataset imbalance
# Warn when REAL rows make up less than 20% or more than 80% of the data.
real_ratio = df_combined['label'].value_counts(normalize=True).get('REAL', 0)
if real_ratio < 0.2 or real_ratio > 0.8:
    print("Warning: Dataset is imbalanced. REAL articles ratio:", f"{real_ratio:.2%}")
    log_metrics(f"Warning: Dataset imbalance. REAL ratio: {real_ratio:.2%}")

# Clean text function
def clean_text(text):
    """Normalize text for vectorization.

    Non-string input is converted with ``str()``. The result is lower-cased,
    stripped of every character that is neither a word character nor
    whitespace, and has whitespace runs collapsed to single spaces with no
    leading or trailing space.
    """
    lowered = (text if isinstance(text, str) else str(text)).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return re.sub(r'\s+', ' ', without_punctuation).strip()

# Combine title and text
# Clean both columns separately, then join them with a single space.
cleaned_titles = df_combined['title'].apply(clean_text)
cleaned_bodies = df_combined['text'].apply(clean_text)
df_combined['combined_text'] = cleaned_titles + ' ' + cleaned_bodies
labels = df_combined['label']

# Filter Indian/international articles
# NOTE(review): the pattern is matched as a plain substring, so e.g. 'asia'
# also hits 'euthanasia' and 'modi' hits 'commodity' -- confirm whether
# word-boundary matching is wanted before relying on this count.
india_keywords = 'india|pakistan|kashmir|modi|diplomacy|asia|united nations|africa|iran|china|ethiopia'
india_mask = df_combined['combined_text'].str.contains(india_keywords, case=False, na=False)
df_india = df_combined[india_mask]
print(f"Indian/international articles: {len(df_india)}")
if len(df_india) > 0:
    # Show a small preview of the matching rows.
    preview_columns = ['title', 'label']
    print(df_india[preview_columns].head())
log_metrics(f"Indian/international articles: {len(df_india)}")

# Split dataset
# Hold out 20% of the rows for testing; the fixed seed keeps the split repeatable.
split_options = {'test_size': 0.2, 'random_state': 7}
x_train, x_test, y_train, y_test = train_test_split(
    df_combined['combined_text'],
    labels,
    **split_options,
)

# Initialize TfidfVectorizer
# The vocabulary is learned from the training split only; the test split is
# transformed with that fitted vocabulary.
tfidf_vectorizer = TfidfVectorizer(
    stop_words='english',
    max_df=0.7,
    min_df=2,
    ngram_range=(1, 2),
)
tfidf_train = tfidf_vectorizer.fit_transform(x_train)
tfidf_test = tfidf_vectorizer.transform(x_test)

# Tune PassiveAggressiveClassifier
# 5-fold grid search over C, scored by accuracy on the training matrix.
pac = PassiveAggressiveClassifier(
    max_iter=100,
    random_state=7,
    early_stopping=True,
    validation_fraction=0.1,
)
param_grid = {'C': [0.01, 0.1, 1.0]}
grid_search = GridSearchCV(estimator=pac, param_grid=param_grid, cv=5, scoring='accuracy')
grid_search.fit(tfidf_train, y_train)

# Keep the best estimator found by the search for evaluation and serving.
best_pac = grid_search.best_estimator_
best_c_message = f"Best C: {grid_search.best_params_['C']}"
print(best_c_message)
log_metrics(best_c_message)

# Evaluate model
# Score the tuned classifier on the held-out TF-IDF test matrix.
report_labels = ['FAKE', 'REAL']
y_pred = best_pac.predict(tfidf_test)
accuracy = accuracy_score(y_test, y_pred) * 100
conf_matrix = confusion_matrix(y_test, y_pred, labels=report_labels)
class_report = classification_report(y_test, y_pred, labels=report_labels)

# Print each metric, then write all of them to the log as a single entry.
for report_line in (
    f'Accuracy: {accuracy:.2f}%',
    f'Confusion Matrix:\n{conf_matrix}',
    f'Classification Report:\n{class_report}',
):
    print(report_line)
log_metrics(f"Accuracy: {accuracy:.2f}%, Confusion Matrix: {conf_matrix.tolist()}, Classification Report: {class_report}")

# Test known articles
# Take the first REAL and first FAKE row of the combined data as a smoke test.
# NOTE(review): these rows are drawn from the full dataset, so they may have
# been part of the training split -- treat this as a sanity check only.
is_real = df_combined['label'] == 'REAL'
is_fake = df_combined['label'] == 'FAKE'
real_article = df_combined.loc[is_real, 'combined_text'].iloc[0]
fake_article = df_combined.loc[is_fake, 'combined_text'].iloc[0]
real_tfidf = tfidf_vectorizer.transform([real_article])
fake_tfidf = tfidf_vectorizer.transform([fake_article])

# Predict each sample once and reuse the label for both stdout and the log.
real_prediction = best_pac.predict(real_tfidf)[0]
fake_prediction = best_pac.predict(fake_tfidf)[0]
print(f"Prediction for a known REAL article: {real_prediction}")
print(f"Prediction for a known FAKE article: {fake_prediction}")
log_metrics(f"Known REAL prediction: {real_prediction}, Known FAKE prediction: {fake_prediction}")

# Prediction history
# Module-level list of per-prediction summary dicts, in insertion order.
prediction_history = list()

# Prediction function for Gradio
def predict_news(title, text, source=None):
    """Classify one article as REAL or FAKE and return an HTML summary.

    Args:
        title: Article headline; must be a non-blank string.
        text: Article body; must be a non-blank string.
        source: Optional publisher name. It is only echoed back in the result
            and stored in the history; it is not part of the model input.

    Returns:
        An HTML fragment with the predicted label, a confidence figure and the
        keyword-based India/international note. When title or text is missing,
        blank or not a string, a plain error string is returned instead and
        nothing is added to the history.
    """
    import html  # local import: only needed to escape the echoed source

    # Reject non-strings and blank input; whitespace-only fields carry no
    # content for the model, so they are treated like empty ones.
    if (
        not isinstance(title, str)
        or not isinstance(text, str)
        or not title.strip()
        or not text.strip()
    ):
        return "Error: Please provide valid title and text (non-empty strings)."

    combined = clean_text(title + ' ' + text)
    vectorized_input = tfidf_vectorizer.transform([combined])
    prediction = best_pac.predict(vectorized_input)[0]

    # decision_function returns a signed margin, not a probability. It is
    # squashed with a sigmoid oriented towards the predicted label, so the
    # figure is an uncalibrated heuristic rather than a true probability.
    probability = best_pac.decision_function(vectorized_input)[0]
    confidence = (1 / (1 + np.exp(-probability))) if prediction == 'REAL' else (1 / (1 + np.exp(probability)))

    # Check for Indian/international relevance
    is_india_related = bool(re.search(india_keywords, combined, re.IGNORECASE))
    india_note = "Relevant to Indian/international affairs." if is_india_related else "Not detected as Indian/international affairs."

    # Store a truncated summary of the request for the history view.
    prediction_history.append({
        "Title": title[:50] + ("..." if len(title) > 50 else ""),
        "Text": text[:50] + ("..." if len(text) > 50 else ""),
        "Source": source[:50] if source else "None",
        "Prediction": prediction,
        "Confidence": f"{confidence:.2%}",
        "India/Int'l": "Yes" if is_india_related else "No"
    })

    color = "green" if prediction == 'REAL' else "red"
    result = f"<span style='color: {color}; font-weight: bold;'>This news is predicted to be: {prediction}</span> (Confidence: {confidence:.2%})"
    if source:
        # The source is free-form user input rendered as HTML, so escape it
        # to keep markup or script in that field from being injected.
        result += f"<br>Source (not used in prediction): {html.escape(source)}"
    result += f"<br>{india_note}"
    return result

# Clear history function
def clear_history():
    """Empty the shared history list in place.

    Returns:
        A pair of (status message, freshly rendered history view).
    """
    del prediction_history[:]
    status_message = "Prediction history cleared."
    return status_message, get_prediction_history()

# History function
def get_prediction_history():
    """Render the stored predictions as an HTML table.

    Returns:
        The table markup, or a placeholder sentence when nothing has been
        recorded yet.
    """
    if prediction_history:
        history_frame = pd.DataFrame(prediction_history)
        return history_frame.to_html(index=False, classes="table table-striped")
    return "No predictions yet."

# Example articles (prioritize Indian/international)
def _first_row_with_label(label):
    """Return the first row with *label*, preferring India/international rows."""
    india_rows = df_india[df_india['label'] == label]
    if not india_rows.empty:
        return india_rows.iloc[0]
    # No keyword-matched row carries this label: fall back to the full dataset.
    return df_combined[df_combined['label'] == label].iloc[0]


def _example_source(row):
    """Return "Times of India" only for rows that came from the headline file."""
    # df_combined is news.csv rows followed by the headline sample under a
    # fresh 0..n-1 index, so positions at or beyond len(df_news) are headlines.
    return "Times of India" if row.name >= len(df_news) else None


real_example = _first_row_with_label('REAL')
fake_example = _first_row_with_label('FAKE')
examples = [
    # Previously the REAL example was always attributed to Times of India,
    # even when the selected row originated from news.csv.
    [real_example['title'], real_example['text'], _example_source(real_example)],
    [fake_example['title'], fake_example['text'], None]
]

# Enhanced CSS
# Raw CSS rules only: the string is handed to gr.Blocks(css=...), which expects
# stylesheet text rather than an HTML <style> element, so no wrapper tags here.
# The .table rules style the history table produced with
# classes="table table-striped".
custom_css = """
    .gr-button { background-color: #007BFF; color: white; border-radius: 5px; padding: 10px; }
    .gr-button:hover { background-color: #0056b3; }
    .gr-input, .gr-textbox { border-radius: 5px; border: 1px solid #ccc; padding: 8px; font-size: 16px; }
    .gr-text { font-size: 16px; color: #333; }
    .table { width: 100%; border-collapse: collapse; margin-top: 10px; }
    .table th, .table td { border: 1px solid #ddd; padding: 10px; text-align: left; font-size: 14px; }
    .table th { background-color: #007BFF; color: white; }
    .table tr:nth-child(even) { background-color: #f9f9f9; }
    h1 { color: #007BFF; font-size: 28px; }
    h3 { color: #333; font-size: 20px; }
    .gr-panel { border: 1px solid #ddd; border-radius: 5px; padding: 15px; margin-bottom: 15px; }
"""

# Gradio UI
# Builds a two-tab Blocks app ("Predict" and "History") using the Soft theme
# and the custom_css string, wires the two buttons, then launches it.
# NOTE(review): prediction_history is a single module-level list, so the
# history is presumably shared by everyone using the app -- confirm this is
# acceptable before exposing it beyond a single user.
with gr.Blocks(theme=gr.themes.Soft(), css=custom_css) as interface:
    gr.Markdown("# Fake News Detector")
    gr.Markdown("Enter a news article's title, text, and optional source to predict if it's **REAL** or **FAKE**. Handles various news topics, including Indian and international affairs.")
    
    with gr.Tabs():
        with gr.Tab("Predict"):
            with gr.Row():
                # Left column (scale=2): input fields, action buttons, result.
                with gr.Column(scale=2):
                    gr.Markdown("### Input Article")
                    title_input = gr.Textbox(lines=2, label="News Title", placeholder="Enter the news article title...")
                    text_input = gr.Textbox(lines=8, label="News Text", placeholder="Enter the news article text...")
                    source_input = gr.Textbox(lines=1, label="Source (Optional)", placeholder="Enter source (e.g., Times of India, BBC)")
                    with gr.Row():
                        predict_button = gr.Button("Predict")
                        clear_button = gr.Button("Clear History")
                    output = gr.HTML(label="Prediction Result")
                
                # Right column (scale=1): sample rows that fill the three inputs.
                with gr.Column(scale=1):
                    gr.Markdown("### Example Articles")
                    gr.Examples(
                        examples=examples,
                        inputs=[title_input, text_input, source_input],
                        label="Test with sample news articles"
                    )
        
        with gr.Tab("History"):
            gr.Markdown("### Prediction History")
            # Initial content is whatever get_prediction_history() returns at
            # build time; later refreshes come from the button handlers below.
            history_output = gr.HTML(get_prediction_history())
    
    # Predict: run predict_news into the result area, then re-render the
    # history table so the new entry shows up in the History tab.
    predict_button.click(
        fn=predict_news,
        inputs=[title_input, text_input, source_input],
        outputs=output
    ).then(
        fn=get_prediction_history,
        inputs=None,
        outputs=history_output
    )
    
    # Clear: clear_history returns (status message, history view), which map
    # to the result area and the history table in that order.
    clear_button.click(
        fn=clear_history,
        inputs=None,
        outputs=[output, history_output]
    )

# Start the app.
interface.launch()

Files in D:\Fact_check: ['india-news-headlines.csv', 'model_metrics.txt', 'news.csv']
Combined dataset shape: (7335, 3)
Label counts:
label
REAL    4171
FAKE    3164
Name: count, dtype: int64
Indian/international articles: 1806
                                                title label
2         Kerry to go to Paris in gesture of sympathy  REAL
5                                         Tehran, USA  FAKE
8   Fact check: Trump and Clinton at the 'commande...  REAL
9   Iran reportedly makes new push for uranium con...  REAL
18  What's in that Iran bill that Obama doesn't like?  REAL
Best C: 0.1
Accuracy: 92.57%
Confusion Matrix:
[[584  55]
 [ 54 774]]
Classification Report:
              precision    recall  f1-score   support

        FAKE       0.92      0.91      0.91       639
        REAL       0.93      0.93      0.93       828

    accuracy                           0.93      1467
   macro avg       0.92      0.92      0.92      1467
weighted avg       0.93      0.93      0.93      1467

