# 1️⃣ Data Loading & Basic Exploration

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Load the dataset (adjust the path if needed).
# low_memory=False makes pandas parse the file in one pass instead of in
# chunks, which avoids mixed-dtype inference within a column.
df = pd.read_csv('../data/metadata.csv', low_memory=False)

# Quick overview: dimensions, per-column dtypes/non-null counts, and the
# ten columns with the most missing values.
print("Shape:", df.shape)
# DataFrame.info() prints its summary itself and returns None, so it is
# called directly; wrapping it in print() would emit a stray "None" line.
df.info()
print(df.isnull().sum().sort_values(ascending=False).head(10))
# Last expression of the cell: the notebook renders the first five rows.
df.head()


# 2️⃣ Data Cleaning & Preparation

In [None]:
# Convert publish_time to datetime; errors='coerce' turns values that cannot
# be parsed into NaT instead of raising.
df['publish_time'] = pd.to_datetime(df['publish_time'], errors='coerce')
df['year'] = df['publish_time'].dt.year

# Example of filling/removing:
# drop rows without a title or a year. The explicit .copy() makes df_clean an
# independent frame, so the column assignments below do not trigger pandas'
# SettingWithCopyWarning.
df_clean = df.dropna(subset=['title', 'year']).copy()

# 'year' holds floats whenever some publish_time was coerced to NaT; with the
# missing values dropped it can safely be stored as an integer, which also
# gives clean axis labels (2020 rather than 2020.0).
df_clean['year'] = df_clean['year'].astype(int)

# Optional: abstract word count (a missing abstract counts as 0 words).
# Vectorized whitespace split + length instead of a per-row Python lambda.
df_clean['abstract_word_count'] = (
    df_clean['abstract'].fillna('').str.split().str.len()
)


# 3️⃣ Data Analysis & Visualization

## a) Papers by Year

In [None]:
# Number of papers per publication year, ordered chronologically.
year_counts = df_clean['year'].value_counts().sort_index()

# Bar chart of the yearly counts, drawn on an explicit Axes object.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    x=year_counts.index,
    y=year_counts.values,
    color='skyblue',
    ax=ax,
)
ax.set_title('Publications by Year')
ax.set_xlabel('Year')
ax.set_ylabel('Number of Papers')
# Tilt the year labels so neighbouring ticks do not overlap.
plt.setp(ax.get_xticklabels(), rotation=45)
fig.tight_layout()
plt.show()


## b) Top Journals

In [None]:
# The ten journals with the most papers (value_counts sorts descending).
journal_counts = df_clean['journal'].value_counts()
top_journals = journal_counts.head(10)

# Horizontal bar chart: journal names on the y-axis, paper counts on the x-axis.
fig, ax = plt.subplots(figsize=(8, 5))
sns.barplot(
    y=top_journals.index,
    x=top_journals.values,
    color='orange',
    ax=ax,
)
ax.set_title('Top 10 Journals Publishing COVID-19 Research')
ax.set_xlabel('Number of Papers')
fig.tight_layout()
plt.show()


## c) Word Cloud of Paper Titles

In [None]:
from wordcloud import WordCloud

# Concatenate every non-missing title into one space-separated string.
titles = ' '.join(df_clean['title'].dropna())

# Build the cloud object first, then feed it the combined text.
wc = WordCloud(width=800, height=400, background_color='white')
wc.generate(titles)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wc, interpolation='bilinear')
ax.axis('off')
ax.set_title('Most Frequent Words in Paper Titles')
plt.show()
