# Data Preprocessing for MIND Dataset

In [None]:
import pickle
import re
from pathlib import Path

import numpy as np
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the news dataset
# header=None makes pandas treat the first line as data, so the column labels
# are supplied explicitly through `names`.
news_columns = [
    'nid',
    'category',
    'subcategory',
    'title',
    'abstract',
    'url',
    'title_entities',
    'abstract_entities',
]
news_df = pd.read_csv(
    r'C:\Users\PC\Desktop\CONTENT FILTER\data\news.tsv',
    sep='\t',
    header=None,
    names=news_columns,
)

# Quick confirmation that the read succeeded and how many rows came back.
print("Data loaded successfully!")
print(f"Found {news_df.shape[0]} articles")

Data loaded successfully!
Found 51282 articles


In [6]:
# Basic preprocessing
# Build one free-text field per article from its title and abstract.
# Missing values are filled BEFORE concatenating: in pandas `str + NaN` is NaN,
# so filling only afterwards would blank the whole field for any row that has
# a title but no abstract, silently discarding the title.
news_df['text'] = (
    news_df['title'].fillna('') + ' ' + news_df['abstract'].fillna('')
).str.strip()  # trim the dangling separator left when one side is empty

# Clean text
def clean_text(text):
    """Normalise a piece of text for vectorisation.

    The text is lower-cased and every character that is not an ASCII letter,
    a digit, or whitespace is removed (not replaced by a space, so
    "well-known" becomes "wellknown").

    Parameters
    ----------
    text : str
        Raw text. Any non-string value (e.g. None or a float NaN coming from
        a missing cell) is treated as missing.

    Returns
    -------
    str
        The cleaned text, or '' when `text` is not a string.
    """
    # Guard against missing cells: NaN is a float and has no .lower().
    if not isinstance(text, str):
        return ''
    text = text.lower()
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return text

news_df['clean_text'] = news_df['text'].apply(clean_text)

# Create TF-IDF features
# The vocabulary is capped at 5000 terms and scikit-learn's built-in English
# stop-word list is removed before weighting.
vectorizer = TfidfVectorizer(max_features=5000, stop_words='english')
tfidf_matrix = vectorizer.fit_transform(news_df['clean_text'])

# Save processed data
# Every artifact goes to the same directory; name it once so the three output
# paths cannot drift apart, and create it up front so the first write does not
# fail when the folder does not exist yet.
RESULTS_DIR = Path(r'C:\Users\PC\Desktop\CONTENT FILTER\results')
RESULTS_DIR.mkdir(parents=True, exist_ok=True)

news_df.to_pickle(RESULTS_DIR / 'processed_news.pkl')

# NOTE(review): .toarray() materialises the full dense matrix
# (n_articles x 5000 float64 values) in memory before pickling. The dense
# DataFrame format is kept so existing readers of tfidf_features.pkl keep
# working; consider a sparse format if memory becomes a problem.
pd.DataFrame(tfidf_matrix.toarray(), index=news_df.index).to_pickle(RESULTS_DIR / 'tfidf_features.pkl')

# Export the fitted vectorizer (vocabulary and IDF weights) for later use,
# so new text can be transformed into the same feature space.
with open(RESULTS_DIR / 'tfidf_vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

print(f"Processed {len(news_df)} news articles")
print(f"TF-IDF matrix shape: {tfidf_matrix.shape}")

Processed 51282 news articles
TF-IDF matrix shape: (51282, 5000)
