In [None]:

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import string
import re
from collections import Counter
from textblob import Word

# Load the raw news dataset; the path is resolved against the current working directory.
df = pd.read_excel('318NewsDataSet.xlsx')

# Normalise the headers: trim surrounding whitespace and turn inner spaces into
# underscores. Each label is coerced with str() first because Excel headers are
# not guaranteed to be strings (numeric or date headers would make .strip()
# raise AttributeError).
df.columns = [str(col).strip().replace(' ', '_') for col in df.columns]


In [None]:

# Words dropped by clean_text before any counting happens.
# Kept as a plain set so membership checks are O(1).
stopwords = {
    # Generic English function words (articles, pronouns, prepositions, auxiliaries).
    'a', 'about', 'above', 'after', 'again', 'against', 'all', 'am', 'an', 'and', 'any',
    'are', 'as', 'at',
    'be', 'because', 'been', 'before', 'being', 'below', 'between', 'both', 'but', 'by',
    'could',
    'did', 'do', 'does', 'doing', 'down', 'during',
    'each',
    'few', 'for', 'from', 'further',
    'had', 'has', 'have', 'having', 'he', 'her', 'here', 'hers', 'herself', 'him',
    'himself', 'his', 'how',
    'i', 'if', 'in', 'into', 'is', 'it', 'its', 'itself',
    'me', 'more', 'most', 'my', 'myself',
    'no', 'nor', 'not',
    'of', 'off', 'on', 'once', 'only', 'or', 'other', 'our', 'ours', 'ourselves', 'out',
    'over', 'own',
    'same', 'she', 'should', 'so', 'some', 'such',
    'than', 'that', 'the', 'their', 'theirs', 'them', 'themselves', 'then', 'there',
    'these', 'they', 'this', 'those', 'through', 'to', 'too',
    'under', 'until', 'up',
    'very',
    'was', 'we', 'were', 'what', 'when', 'where', 'which', 'while', 'who', 'whom', 'why',
    'will', 'with',
    'you', 'your', 'yours', 'yourself', 'yourselves',
} | {
    # Extra high-frequency filler words and abbreviations added on top of the
    # generic list (presumably tuned for this corpus -- TODO confirm).
    'also', 'another', 'around', 'back', 'can', 'even', 'first', 'get', 'however',
    'like', 'make', 'many', 'may', 'mr', 'much', 'must', 'new', 'one', 'r', 'said',
    'sh', 'since', 'still', 'take', 'two', 'us', 'use', 'well', 'without', 'would',
    'year', 'yet',
}

# Deletion table for ASCII digits and ASCII punctuation, built once instead of
# on every call.
_DIGITS_AND_PUNCT = str.maketrans('', '', string.digits + string.punctuation)


def clean_text(text, stop_words=None):
    """Lower-case ``text``, delete ASCII digits/punctuation and drop stopwords.

    Parameters
    ----------
    text : object
        Raw cell value. Missing values (``None``, ``NaN``, ``pd.NA``, ``NaT``)
        produce an empty string; any other non-string is converted with
        ``str()``.
    stop_words : collection of str, optional
        Words to remove after tokenising. When omitted, the module-level
        ``stopwords`` set is used.

    Returns
    -------
    str
        The surviving tokens joined by single spaces ('' when nothing is left).
    """
    # Without this guard str(NaN) / str(None) would turn into the literal
    # tokens "nan" / "none" and leak into word counts and document lengths.
    if pd.api.types.is_scalar(text) and pd.isna(text):
        return ''

    if stop_words is None:
        stop_words = stopwords

    # Characters are deleted rather than replaced by a space, so "don't"
    # becomes "dont". Only ASCII punctuation (string.punctuation) is covered;
    # typographic quotes and dashes pass through unchanged.
    stripped = str(text).lower().translate(_DIGITS_AND_PUNCT)
    return ' '.join(token for token in stripped.split() if token not in stop_words)

# Run every article body through clean_text and store the result next to the raw text.
df['Processed_Content'] = df['Content'].map(clean_text)


In [None]:

def lemmatize_tokens(text):
    """Split ``text`` on whitespace and return the lemma of every token.

    Each token is wrapped in a TextBlob ``Word`` and reduced by calling its
    ``lemmatize()`` method with no arguments. The result is a list with one
    entry per input token, in order; empty or whitespace-only input gives ``[]``.
    """
    lemmas = []
    for token in text.split():
        lemmas.append(Word(token).lemmatize())
    return lemmas

# Lemmatize each cleaned document, then filter stopwords a second time.
# clean_text removes stopwords *before* lemmatization, so an inflected form
# (e.g. a plural) survives that pass and may only afterwards be reduced to a
# word that is in the stopword set; without this filter such lemmas would show
# up in the token lists and frequency counts.
df['Tokens'] = df['Processed_Content'].apply(
    lambda doc: [lemma for lemma in lemmatize_tokens(doc) if lemma not in stopwords]
)


In [None]:

# Number of tokens per document.
df['Doc_Length'] = df['Tokens'].map(len)

# Histogram (30 bins) with a KDE overlay, written to disk and then displayed.
fig, ax = plt.subplots(figsize=(10, 5))
sns.histplot(df['Doc_Length'], bins=30, kde=True, ax=ax)
ax.set_title('Document Length Distribution')
ax.set_xlabel('Word Count')
ax.set_ylabel('Frequency')
ax.grid(True)
fig.savefig("length_distribution.png")
plt.show()


In [None]:

# Flatten the per-document token lists into one corpus-wide list and take the
# 20 most frequent tokens.
all_words = [word for tokens in df['Tokens'] for word in tokens]
top_words = Counter(all_words).most_common(20)

if not top_words:
    # With no tokens, zip(*top_words) yields nothing and unpacking it into two
    # names would raise ValueError, so skip the plot instead of crashing.
    words, counts = (), ()
    print("No tokens found; skipping the word-frequency plot.")
else:
    words, counts = zip(*top_words)

    fig, ax = plt.subplots(figsize=(12, 5))
    # Passing `palette` without `hue` is deprecated in seaborn 0.13+. Mapping
    # hue to the same categories as x keeps the per-bar colours; dodge=False
    # keeps full-width bars on older seaborn releases as well.
    sns.barplot(
        x=list(words),
        y=list(counts),
        hue=list(words),
        palette='viridis',
        dodge=False,
        ax=ax,
    )
    # Some seaborn versions add a legend for the hue variable; it would only
    # repeat the x tick labels, so drop it when present.
    legend = ax.get_legend()
    if legend is not None:
        legend.remove()

    ax.set_title('Top 20 Most Frequent Words')
    plt.xticks(rotation=45)
    ax.set_xlabel('Word')
    ax.set_ylabel('Frequency')
    fig.tight_layout()
    fig.savefig("word_frequencies.png")
    plt.show()


In [None]:

# Calendar order for the monthly chart; countplot only draws the categories listed here.
MONTH_ORDER = ['January', 'February', 'March', 'April', 'May', 'June',
               'July', 'August', 'September', 'October', 'November', 'December']


def _plot_article_counts(data, column, order, title, filename, rotate_labels=False):
    """Draw, save and show a bar chart of row counts per value of ``column``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame holding ``column``.
    column : str
        Column whose values are counted; also used as the x-axis label.
    order : sequence
        Category order along the x axis.
    title : str
        Figure title.
    filename : str
        Path the figure is saved to before it is shown.
    rotate_labels : bool, default False
        When True, rotate the x tick labels by 45 degrees and apply
        ``tight_layout`` so the rotated labels are not clipped.
    """
    fig, ax = plt.subplots(figsize=(12, 5))
    sns.countplot(data=data, x=column, order=order, ax=ax)
    ax.set_title(title)
    ax.set_xlabel(column)
    ax.set_ylabel('Count')
    if rotate_labels:
        plt.xticks(rotation=45)
    ax.grid(True)
    if rotate_labels:
        fig.tight_layout()
    fig.savefig(filename)
    plt.show()


# Missing years are dropped before sorting: NaN does not compare consistently,
# so leaving it in would give sorted() an unreliable order.
year_order = sorted(df['Year'].dropna().unique())

_plot_article_counts(df, 'Year', year_order,
                     'Article Count by Year', "reporting_trend_yearly.png")
_plot_article_counts(df, 'Month', MONTH_ORDER,
                     'Article Count by Month', "reporting_trend_monthly.png",
                     rotate_labels=True)


In [None]:

# Persist the frame, including the columns added above, without the row index.
# NOTE(review): 'Tokens' holds Python lists, which CSV stores as their string
# form; a reader has to parse them back into lists -- confirm downstream use.
output_path = "preprocessed_data.csv"
df.to_csv(output_path, index=False)
