In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud

In [None]:

# Part 1: Data Loading and Basic Exploration

# Load the metadata table; the path is relative to the notebook's working directory.
df = pd.read_csv("../data/metadata.csv")

# Inspect dataset: dimensions, per-column dtypes / non-null counts, first rows.
print("Shape:", df.shape)
# DataFrame.info() writes its report to stdout itself and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()
print(df.head())

# Missing values: null count per column, showing the first 20 entries.
print(df.isnull().sum().head(20))

# Basic stats: summary statistics using describe()'s default column selection.
print(df.describe())

In [None]:
# Part 2: Data Cleaning and Preparation
# The cleaned frame is built in a single assign() chain. Each assign() returns a
# new DataFrame, so the derived columns are never written onto the result of
# dropna() (the pattern that triggers SettingWithCopyWarning in pandas < 3).
#
# NOTE(review): errors='coerce' turns every value that cannot be parsed into NaT,
# and those rows are then dropped. If 'publish_time' mixes formats (e.g. full
# dates and year-only strings), newer pandas versions may infer one format and
# coerce the rest -- confirm the number of dropped rows is as expected.
df = (
    df
    # Handle missing publication dates: parse, then drop rows without a date.
    .assign(publish_time=pd.to_datetime(df['publish_time'], errors='coerce'))
    .dropna(subset=['publish_time'])
    .assign(
        # Extract year
        year=lambda frame: frame['publish_time'].dt.year,
        # Abstract word count: missing abstracts count as 0 words. astype(str)
        # keeps the vectorised .str methods working even if a non-string value
        # slipped into the column.
        abstract_word_count=lambda frame: (
            frame['abstract'].fillna("").astype(str).str.split().str.len()
        ),
    )
)


In [None]:
# Part 3: Analysis & Visualization

# 1. Papers by Year
# Count papers per publication year; sort_index() puts the bars in chronological order.
year_counts = df['year'].value_counts().sort_index()
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(x=year_counts.index, y=year_counts.values, color="skyblue", ax=ax)
ax.set_title("Publications by Year")
# Label the x axis explicitly: the automatic label comes from the index name,
# which differs between pandas versions.
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
# Rotate the year labels so they stay legible when many years are present.
ax.tick_params(axis="x", rotation=45)
fig.tight_layout()
plt.show()

# 2. Top Journals
# value_counts() sorts by descending frequency, so head(10) keeps the ten
# journals with the most papers (rows with a missing journal are not counted).
top_journals = df['journal'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(8, 5))
# Horizontal bars: journal names go on the y axis.
sns.barplot(y=top_journals.index, x=top_journals.values, color="coral", ax=ax)
ax.set_title("Top 10 Journals Publishing COVID-19 Research")
ax.set_xlabel("Number of Papers")
# Explicit y label so the figure stands alone regardless of the index name.
ax.set_ylabel("Journal")
fig.tight_layout()
plt.show()

# 3. Word Cloud from Titles
# Join every non-missing title into one space-separated string.
text = " ".join(df['title'].dropna().astype(str))
if text.strip():
    # random_state fixes the word placement and colours so a re-run reproduces
    # the same image.
    wc = WordCloud(
        width=800,
        height=400,
        background_color="white",
        random_state=42,
    ).generate(text)
    fig, ax = plt.subplots(figsize=(10, 6))
    ax.imshow(wc, interpolation="bilinear")
    ax.axis("off")
    ax.set_title("Most Frequent Words in Paper Titles")
    plt.show()
else:
    # WordCloud.generate() raises ValueError when it is given no words; report
    # the situation instead of aborting the rest of the notebook run.
    print("No titles available; skipping word cloud.")

# 4. Distribution by Source
# Ten most frequent values of 'source_x' (value_counts() sorts descending).
source_counts = df['source_x'].value_counts().head(10)
fig, ax = plt.subplots(figsize=(8, 5))
# Horizontal bars: source names go on the y axis.
sns.barplot(y=source_counts.index, x=source_counts.values, color="green", ax=ax)
ax.set_title("Top Data Sources")
ax.set_xlabel("Number of Papers")
# Explicit y label so the figure stands alone regardless of the index name.
ax.set_ylabel("Source")
fig.tight_layout()
plt.show()
