# In [None]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import silhouette_score
import matplotlib.pyplot as plt
import seaborn as sns
import re
import pickle
import joblib
import os
from collections import Counter

# Step 1: Read the scraped articles into a DataFrame
print("Loading data...")
df = pd.read_csv('webscraped_dataset.csv')

# Report the size and schema of what was loaded
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")

# Count the rows that have no article body and express that as a share of all rows
missing_content = df['article_content'].isna().sum()
missing_share = missing_content / len(df)
print(f"Missing article_content values: {missing_content} ({missing_share:.2%})")

# Only rebuild the frame when there is actually something to drop
if missing_content > 0:
    df = df.dropna(subset=['article_content'])
    print(f"Dataset shape after removing rows with missing content: {df.shape}")

# Step 2: Text preprocessing function
def clean_text(text):
    """Normalize raw article text for vectorization.

    Strips http(s) URLs, replaces every character that is neither a word
    character nor whitespace with a space, collapses whitespace runs to a
    single space, then lowercases and trims the result.

    Args:
        text: The raw text. Anything that is not a ``str`` (e.g. NaN, None)
            is treated as missing.

    Returns:
        The normalized string, or ``''`` when ``text`` is not a string.
    """
    if not isinstance(text, str):
        return ''

    without_urls = re.sub(r'https?://\S+', '', text)
    # Punctuation becomes a space (not nothing) so adjacent words stay separate
    without_punctuation = re.sub(r'[^\w\s]', ' ', without_urls)
    single_spaced = re.sub(r'\s+', ' ', without_punctuation)
    return single_spaced.lower().strip()

# Normalize the full article body (rather than the headline) for clustering
print("Cleaning article content...")
article_bodies = df['article_content']
df['cleaned_content'] = article_bodies.apply(clean_text)

# Domain-specific stopwords that don't help determine the category
custom_stopwords = [
    # attribution and wire-service words
    'said', 'says', 'according', 'reported', 'reuters', 'ap', 'news', 'report',
    # time references
    'today', 'yesterday', 'week', 'month', 'year', 'day',
    # publishing boilerplate
    'told', 'announced', 'statement', 'released', 'published', 'posted',
    'wrote', 'article', 'story', 'comment', 'update', 'copyright',
]

# Step 3: Create TF-IDF features from article content
print("Creating TF-IDF features from article content...")

# `stop_words` takes EITHER the string 'english' OR an explicit list. Inside a
# list, 'english' is just the literal token "english", so the built-in English
# list would never be applied. Fetch the built-in list explicitly and merge the
# domain-specific words into it.
english_stopwords = TfidfVectorizer(stop_words='english').get_stop_words()
# Sorted so the fitted (and pickled) vectorizer is identical from run to run
all_stopwords = sorted(set(english_stopwords) | set(custom_stopwords))

vectorizer = TfidfVectorizer(
    max_features=10000,  # Using more features since articles have more content
    min_df=2,            # Term must appear in at least 2 documents
    max_df=0.85,         # Ignore terms that appear in more than 85% of documents
    ngram_range=(1, 2),  # Include single words and bigrams
    stop_words=all_stopwords,
)

X = vectorizer.fit_transform(df['cleaned_content'])
print(f"TF-IDF matrix shape: {X.shape}")

# Step 4: Dimensionality reduction for faster processing
print("Reducing dimensions with TruncatedSVD...")
# Don't use more components than we have features
n_components = min(300, X.shape[1] - 1)
# Seed the SVD solver like the KMeans and TSNE steps are seeded: without it the
# reduced space (and therefore the cluster ids that the hand-written
# cluster_to_category mapping relies on) can change from run to run.
svd = TruncatedSVD(n_components=n_components, random_state=42)
X_reduced = svd.fit_transform(X)
print(f"Explained variance: {svd.explained_variance_ratio_.sum():.2%}")

# Step 5: Apply K-Means clustering
# Target categories: Business, Politics, Arts/Culture/Celebrities, Sports
num_clusters = 4
print(f"Applying K-Means clustering with {num_clusters} clusters...")

# Fit once per seed and keep whichever fit has the highest silhouette score
best_kmeans, best_score = None, -1

for seed in range(10):
    kmeans = KMeans(n_clusters=num_clusters, random_state=seed, n_init=10)
    clusters = kmeans.fit_predict(X_reduced)

    score = silhouette_score(X_reduced, clusters)
    print(f"Initialization {seed+1}/10: Silhouette Score = {score:.4f}")

    # Strictly-greater comparison: on a tie the earlier seed is kept
    if score > best_score:
        best_kmeans, best_score = kmeans, score

kmeans = best_kmeans
print(f"Best silhouette score: {best_score:.4f}")

# Label every article with the cluster chosen by the winning model
df['cluster'] = kmeans.predict(X_reduced)

# Step 6: Analyze the clusters in depth
def get_top_terms_per_cluster(model, vectorizer, svd, n_terms=30):
    """Return the highest-weighted vocabulary terms for every cluster.

    The cluster centers live in the SVD-reduced space, so they are first
    projected back to the original feature space with
    ``svd.inverse_transform``; each projected centroid is then ranked by
    weight, largest first.

    Args:
        model: Fitted clustering model exposing ``cluster_centers_`` and
            ``n_clusters``.
        vectorizer: Fitted vectorizer exposing ``get_feature_names_out()``.
        svd: Fitted reducer exposing ``inverse_transform()``.
        n_terms: Maximum number of terms to return per cluster.

    Returns:
        Dict mapping cluster index to its list of top terms, best first.
    """
    centroids = svd.inverse_transform(model.cluster_centers_)

    # Ascending argsort reversed along the feature axis gives descending weight
    ranked_features = centroids.argsort()[:, ::-1]
    vocabulary = vectorizer.get_feature_names_out()

    return {
        cluster_id: [
            vocabulary[feature_idx]
            for feature_idx in ranked_features[cluster_id, :n_terms]
        ]
        for cluster_id in range(model.n_clusters)
    }

print("\nExtracting top terms for each cluster...")
top_terms = get_top_terms_per_cluster(kmeans, vectorizer, svd)

print("\nTop terms in each cluster:")
for cluster_id, cluster_terms in top_terms.items():
    joined_terms = ', '.join(cluster_terms)
    print(f"Cluster {cluster_id}: {joined_terms}")

# Step 7: How many articles landed in each cluster, ordered by cluster id
cluster_counts = df['cluster'].value_counts()
cluster_distribution = cluster_counts.sort_index()
print("\nDistribution of articles across clusters:")
print(cluster_distribution)

# Step 8: Visualize clusters using t-SNE
print("\nVisualizing clusters with t-SNE (this may take a while)...")
# t-SNE requires perplexity < n_samples, so clamp it for small datasets
# (for more than 30 articles this is still 30).
tsne_perplexity = min(30, max(1, X_reduced.shape[0] - 1))
# The iteration count is left at its default (1000): the `n_iter` keyword was
# deprecated in scikit-learn 1.5 in favour of `max_iter` and later removed, so
# omitting it behaves the same on old and new versions.
tsne = TSNE(n_components=2, random_state=42, perplexity=tsne_perplexity)
X_tsne = tsne.fit_transform(X_reduced)

plt.figure(figsize=(12, 10))
# Color each point by its cluster id
scatter = plt.scatter(X_tsne[:, 0], X_tsne[:, 1], c=df['cluster'], cmap='viridis', alpha=0.7, s=50)
plt.colorbar(scatter, label='Cluster')
plt.title('News Articles Clustered by Content', fontsize=15)
plt.xlabel('t-SNE dimension 1')
plt.ylabel('t-SNE dimension 2')
plt.tight_layout()
plt.savefig('article_clusters_visualization.png')
print("Cluster visualization saved as 'article_clusters_visualization.png'")

# Step 9: Show a few articles per cluster
# Reading real examples is what lets a human decide which category each cluster is
print("\nAnalyzing sample articles from each cluster:")
for cluster in range(num_clusters):
    in_cluster = df['cluster'] == cluster
    cluster_samples = df[in_cluster].head(3)
    print(f"\nCluster {cluster} samples:")
    for _, article in cluster_samples.iterrows():
        print(f"- Headline: {article['headline']}")
        # Preview only: first 100 characters, flattened onto one line
        opening = article['article_content'][:100]
        content_preview = opening.replace('\n', ' ').strip() + '...'
        print(f"  Content preview: {content_preview}")

# Step 10: Map clusters to categories
# The cluster ids carry no meaning by themselves; edit this mapping after
# inspecting the top terms and sample articles printed above
print("\nAssigning category labels to clusters...")
cluster_to_category = {
    # Placeholder assignments - update after analyzing your results
    0: "Business",
    1: "Politics",
    2: "Arts/Culture/Celebrities",
    3: "Sports"
}

# Show each assignment next to the five strongest terms of that cluster
print("Cluster to category mapping:")
for cluster_id, label in cluster_to_category.items():
    leading_terms = ', '.join(top_terms[cluster_id][:5])
    print(f"Cluster {cluster_id} -> {label} (based on top terms: {leading_terms})")

# Translate every article's cluster id into its category label
df['category'] = df['cluster'].map(cluster_to_category)

# Step 11: Persist the fitted pipeline and the labelled articles
print("\nSaving models...")
os.makedirs('models', exist_ok=True)

# TF-IDF vectorizer (pickle)
with open('models/tfidf_vectorizer.pkl', 'wb') as vectorizer_file:
    pickle.dump(vectorizer, vectorizer_file)

# SVD reducer and KMeans model (joblib), written in that order
for fitted_model, target_path in (
    (svd, 'models/svd_model.joblib'),
    (kmeans, 'models/kmeans_model.joblib'),
):
    joblib.dump(fitted_model, target_path)

# Cluster id -> category label mapping (pickle)
with open('models/cluster_to_category.pkl', 'wb') as mapping_file:
    pickle.dump(cluster_to_category, mapping_file)

# Export only the identifying columns plus the assigned cluster and category
exported_columns = ['headline', 'article_title', 'article_url', 'cluster', 'category']
output_df = df[exported_columns]
output_df.to_csv('categorized_news.csv', index=False)
print("Categorized news saved to 'categorized_news.csv'")

# Step 12: Create a function to predict categories for new headlines or articles
def predict_category(text, vectorizer, svd, kmeans_model, cluster_mapping):
    """Classify one piece of text with the fitted clustering pipeline.

    The text goes through the same stages used for training: ``clean_text``,
    TF-IDF vectorization, SVD reduction, then nearest-cluster prediction.

    Args:
        text: Headline or article body to classify.
        vectorizer: Fitted vectorizer exposing ``transform()``.
        svd: Fitted reducer exposing ``transform()``.
        kmeans_model: Fitted clustering model exposing ``predict()``.
        cluster_mapping: Dict from cluster id to category label.

    Returns:
        ``(category, cluster)`` where ``category`` is ``"Unknown"`` when the
        predicted cluster has no entry in ``cluster_mapping``.
    """
    # The vectorizer expects an iterable of documents, hence the one-item list
    tfidf_row = vectorizer.transform([clean_text(text)])
    reduced_row = svd.transform(tfidf_row)

    # predict() returns one label per row; there is exactly one row here
    cluster = kmeans_model.predict(reduced_row)[0]

    return cluster_mapping.get(cluster, "Unknown"), cluster

# Step 13: Hand-written example articles used as a smoke test of the pipeline.
# Keys are the expected category labels (the same strings used as values in
# cluster_to_category); each key maps to two sample article texts.
test_content = {
    "Business": [
        "The stock market plunged today as investors reacted to rising inflation numbers. The Dow Jones Industrial Average fell by over 500 points, while the NASDAQ saw a 3% decline. Tech stocks were particularly hard hit, with major companies like Apple and Microsoft seeing significant drops in share value. Economic analysts suggest that this market correction reflects growing concerns about the Federal Reserve's potential interest rate hikes.",
        "The merger between two major retail chains was approved by regulators today. The $4.2 billion deal will create the largest department store company in the country, with over 500 locations nationwide. Shareholders from both companies overwhelmingly supported the merger, which is expected to generate cost savings of approximately $500 million annually through combined operations and supply chain efficiencies."
    ],
    "Politics": [
        "The President delivered a major policy speech today outlining his administration's legislative agenda for the coming year. Key priorities include infrastructure investment, climate change initiatives, and healthcare reform. Opposition leaders were quick to criticize the proposals, calling them too expensive and overreaching. Political analysts note that the success of this agenda will largely depend on gaining support from moderate senators in the upcoming vote.",
        "Election officials have certified the results of last month's gubernatorial race after completing a mandatory recount. The final tally showed the challenger winning by just 1,200 votes out of more than 2.3 million ballots cast. This marks the closest gubernatorial election in the state's history and ends weeks of legal challenges and vote verification procedures."
    ],
    "Arts/Culture/Celebrities": [
        "The acclaimed director's latest film received a standing ovation at its festival premiere last night. Critics are already praising the cinematography and powerful performances from the ensemble cast. The three-hour drama, which explores themes of family and identity, is expected to be a major contender during awards season. The film's lead actress, who underwent a physical transformation for the role, is being singled out for particular acclaim.",
        "The pop star surprised fans yesterday by releasing an unannounced album at midnight. The 14-track collection features collaborations with several prominent artists and represents a significant departure from her previous musical style. Social media has been flooded with reactions from fans and critics alike, with many praising the artist's willingness to experiment with new sounds and personal lyrical themes."
    ],
    "Sports": [
        "The underdog team completed their Cinderella run last night, winning the championship in a thrilling overtime victory. The team's star player scored the winning points with just 3 seconds remaining on the clock, capping off a remarkable comeback from a 15-point deficit. This marks the franchise's first title in their 50-year history and sets up a celebration parade scheduled for this weekend in the downtown area.",
        "The veteran quarterback announced his retirement today after 18 seasons in the league. During his career, he led his teams to three championships and was selected for the all-star game seven times. Team officials and former teammates attended the emotional press conference where he thanked fans and reflected on his achievements. The team is expected to retire his jersey number in a ceremony planned for next season's home opener."
    ]
}

print("\n--- MODEL EVALUATION ---")
# Running tallies across every example in every category
correct_predictions = 0
total_predictions = 0

for expected_category, contents in test_content.items():
    print(f"\nTesting {expected_category} content:")
    for example_number, content in enumerate(contents, start=1):
        predicted_category, cluster = predict_category(
            content, vectorizer, svd, kmeans, cluster_to_category
        )
        is_correct = predicted_category == expected_category
        correct_predictions += int(is_correct)
        total_predictions += 1

        # Preview only: first 100 characters, flattened onto one line
        content_preview = content[:100].replace('\n', ' ').strip() + "..."
        verdict = '✓' if is_correct else '✗'

        print(f"Example {example_number}:")
        print(f"Content preview: {content_preview}")
        print(f"Predicted: {predicted_category} (Cluster {cluster})")
        print(f"Expected: {expected_category}")
        print(f"Correct: {verdict}")
        print("-" * 50)

# Report 0 rather than dividing by zero when there were no examples
if total_predictions:
    overall_accuracy = correct_predictions / total_predictions
else:
    overall_accuracy = 0
print(f"\nOverall accuracy: {overall_accuracy:.2f} ({correct_predictions}/{total_predictions})")

# Step 14: Create a simple function for headline classification
def headline_classifier():
    """Interactively classify headlines or article text typed at a prompt.

    Keeps prompting until the user enters ``quit`` (case-insensitive,
    surrounding whitespace ignored) or standard input is exhausted. Blank
    entries are skipped instead of being classified, because empty text
    carries no terms and would be assigned to an arbitrary cluster.

    Uses the module-level ``vectorizer``, ``svd``, ``kmeans``,
    ``cluster_to_category`` and ``top_terms`` built earlier in this file.

    Returns:
        None. Results are printed.
    """
    print("\n--- HEADLINE & CONTENT CLASSIFIER ---")
    print("Enter text to classify (type 'quit' to exit):")

    while True:
        try:
            text = input("\nEnter headline or article content: ").strip()
        except EOFError:
            # stdin closed (piped input, non-interactive run): end the session
            # quietly instead of crashing with a traceback
            print()
            break

        if text.lower() == 'quit':
            break
        if not text:
            # Nothing to classify; ask again
            continue

        category, cluster = predict_category(text, vectorizer, svd, kmeans, cluster_to_category)
        print(f"Predicted Category: {category} (Cluster {cluster})")
        print(f"Top terms in this cluster: {', '.join(top_terms[cluster][:10])}")

# Run the interactive classifier only when this file is executed directly
# (as a script or in a notebook, where __name__ is "__main__"). When the module
# is imported from another script, as suggested below, it must not block
# waiting on input().
if __name__ == "__main__":
    headline_classifier()

print("\nModel training and evaluation complete!")
print("To classify new headlines or articles, you can:")
print("1. Run this script again and use the interactive classifier at the end")
print("2. Import the classifier function from another script")
print("3. Use the models saved in the 'models/' directory with your own code")