In [None]:
# Import libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from collections import Counter
import os

# Download NLTK data
# Newer NLTK releases (3.8.2+) make word_tokenize load its Punkt models from
# the 'punkt_tab' resource, while older releases look for the pickled 'punkt'
# package. Fetch both so tokenization works whichever version is installed.
for resource in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(resource)

# Load dataset
df = pd.read_csv('fnspid.csv')

# Parse the 'date' column as timezone-aware UTC timestamps, then shift them
# to a fixed UTC-04:00 offset.
utc_timestamps = pd.to_datetime(df['date'], utc=True)
df['date'] = utc_timestamps.dt.tz_convert('UTC-04:00')

# 1. Descriptive Statistics
# Headline length = number of whitespace-separated words in each headline.
# The vectorised .str accessor replaces a per-row lambda so that a missing
# headline (NaN) yields a NaN length instead of raising AttributeError;
# describe() leaves NaN values out of the statistics it reports.
df['headline_length'] = df['headline'].str.split().str.len()
print("Headline Length Statistics:")
print(df['headline_length'].describe())

# Articles per publisher: value_counts() sorts by descending article count,
# so the first five rows are the five most frequent publishers.
publisher_counts = df['publisher'].value_counts()
print("\nTop 5 Publishers by Article Count:", publisher_counts.head(5), sep="\n")

# Publication date trends: number of articles for each calendar date
df['date_only'] = df['date'].dt.date
daily_counts = df.groupby('date_only').size()
print("\nDaily Article Counts:", daily_counts.describe(), sep="\n")

# 2. Text Analysis (Keyword Extraction)
# English stopwords, kept in a set so membership tests are cheap
stop_words = {*stopwords.words('english')}
def extract_keywords(text):
    """Return the lowercase keyword tokens found in *text*.

    The text is lowercased and tokenized with NLTK's ``word_tokenize``.
    A token is kept only if it is purely alphanumeric (``str.isalnum``)
    and is not in the module-level ``stop_words`` set.

    Non-string input (for example the NaN pandas uses for a missing
    headline) has no keywords, so an empty list is returned instead of
    failing on ``.lower()``.
    """
    if not isinstance(text, str):
        return []
    tokens = word_tokenize(text.lower())
    return [token for token in tokens if token.isalnum() and token not in stop_words]

# Apply keyword extraction to every headline, then flatten the per-headline
# keyword lists into one list so that every occurrence is counted.
df['keywords'] = df['headline'].apply(extract_keywords)
all_keywords = []
for headline_keywords in df['keywords']:
    all_keywords.extend(headline_keywords)
keyword_freq = Counter(all_keywords)
print("\nTop 10 Keywords:", keyword_freq.most_common(10), sep="\n")

# 3. Time Series Analysis (Publication Frequency)
# Hourly publication trends
df['hour'] = df['date'].dt.hour
# groupby() only yields the hours that actually occur in the data. Reindex to
# all 24 hours so that hours without articles are drawn as zero-height bars
# instead of silently disappearing from the x-axis.
hourly_counts = df.groupby('hour').size().reindex(range(24), fill_value=0)

# Plotting: bar chart of article counts per hour, written to a PNG file
plt.figure(figsize=(10, 6))
hourly_counts.plot(kind='bar')
plt.title('Article Publication by Hour (UTC-4)')
plt.xlabel('Hour of Day')
plt.ylabel('Number of Articles')
plt.savefig('hourly_publication.png')
# Close the figure so later plots start from a clean canvas
plt.close()

# Daily publication trends: line plot of articles per calendar date,
# written to a PNG file
fig, ax = plt.subplots(figsize=(10, 6))
daily_counts.plot(ax=ax)
ax.set_title('Daily Article Publication Trends')
ax.set_xlabel('Date')
ax.set_ylabel('Number of Articles')
fig.savefig('daily_publication.png')
plt.close(fig)

# 4. Publisher Analysis
# Extract domains from publisher (assuming some are emails): keep the text
# after the last '@', or the whole value when it contains no '@'.
# The vectorised .str accessor gives the same result as a per-row split for
# strings, but leaves a missing publisher (NaN) as NaN instead of raising
# TypeError on the `'@' in x` test; value_counts() skips NaN by default.
df['publisher_domain'] = df['publisher'].str.split('@').str[-1]
domain_counts = df['publisher_domain'].value_counts()
print("\nTop 5 Publisher Domains:")
print(domain_counts.head())

# Save domain counts plot: bar chart of the ten most frequent publisher
# domains, written to a PNG file
fig, ax = plt.subplots(figsize=(10, 6))
top_domains = domain_counts.head(10)
top_domains.plot.bar(ax=ax)
ax.set_title('Top 10 Publisher Domains by Article Count')
ax.set_xlabel('Domain')
ax.set_ylabel('Number of Articles')
fig.savefig('publisher_domains.png')
plt.close(fig)

: 