# 🦠 CORD-19 Research Analysis – Final Project

This notebook contains data loading, cleaning, analysis, visualization, and a sample Streamlit app for exploring the CORD-19 dataset.


## Part 1: Data Loading and Exploration

In [None]:

import pandas as pd

# Read the metadata table from a CSV file in the working directory
METADATA_CSV = "metadata.csv"
df = pd.read_csv(METADATA_CSV)

# Show the first five rows as a quick sanity check
df.head(n=5)


In [None]:

# Shape and column summary
print("Shape:", df.shape)
# DataFrame.info() writes its report to stdout and returns None, so it is
# called directly; wrapping it in print() would append a stray "None" line.
df.info()

# Missing values (top 10 columns with most missing)
df.isnull().sum().sort_values(ascending=False).head(10)


## Part 2: Data Cleaning and Preparation

In [None]:

# Clean the frame in one chain so re-running the cell gives the same result.
#
# Order matters: publish_time is parsed *before* filtering. With
# errors="coerce" an unparseable date becomes NaT, so dropping on the parsed
# column removes both missing and malformed dates. Dropping on the raw strings
# first would let malformed dates through with a missing year.
df = (
    df.assign(
        publish_time=lambda d: pd.to_datetime(d["publish_time"], errors="coerce")
    )
    # Keep only rows that have a title and a valid publication date
    .dropna(subset=["title", "publish_time"])
    .assign(
        # No NaT is left at this point, so the year can be a plain integer
        year=lambda d: d["publish_time"].dt.year.astype(int),
        # Abstract word count; a missing abstract counts as 0 words
        abstract_word_count=lambda d: d["abstract"]
        .fillna("")
        .apply(lambda text: len(text.split())),
    )
)

df.head()


## Part 3: Data Analysis and Visualization

In [None]:

import matplotlib.pyplot as plt
from collections import Counter
from wordcloud import WordCloud

# Publications per year.
# Rows without a year are dropped and the rest cast to int, so the tick labels
# read "2020" rather than "2020.0" when the column contains missing values.
# value_counts() already ignores missing values, so the counts are unaffected.
year_counts = df["year"].dropna().astype(int).value_counts().sort_index()

fig, ax = plt.subplots()
year_counts.plot(kind="bar", ax=ax, title="Publications by Year")
ax.set_xlabel("Year")
ax.set_ylabel("Number of Papers")
plt.show()


In [None]:

# Top journals (value_counts returns them most frequent first)
top_journals = df["journal"].value_counts().head(10)

fig, ax = plt.subplots()
top_journals.plot(kind="barh", ax=ax, title="Top 10 Journals")
# barh draws the first row at the bottom; flip the axis so the most frequent
# journal is shown at the top of the chart.
ax.invert_yaxis()
ax.set_xlabel("Number of Papers")
ax.set_ylabel("Journal")
plt.show()


In [None]:

# Frequent words in titles
titles = df["title"].dropna().astype(str)

# Raw whitespace-separated tokens; the word cloud cell joins these back into text
words = " ".join(titles).lower().split()

# For the frequency table, tokenize on word characters so attached punctuation
# ("covid-19:" vs "covid-19") does not split one word into several counts, and
# skip common English function words that would otherwise fill the ranking.
TITLE_STOPWORDS = {
    "a", "an", "and", "are", "as", "at", "be", "between", "by", "during",
    "for", "from", "in", "into", "is", "its", "of", "on", "or", "that",
    "the", "their", "these", "this", "to", "using", "via", "with",
}
token_lists = titles.str.lower().str.findall(r"\w+(?:-\w+)*")
word_counts = Counter(
    token
    for tokens in token_lists
    for token in tokens
    if len(token) > 1 and token not in TITLE_STOPWORDS
)
word_counts.most_common(15)


In [None]:

# Build a word cloud from the title tokens and draw it on its own axes
title_text = " ".join(words)
cloud_builder = WordCloud(width=800, height=400, background_color="white")
wc = cloud_builder.generate(title_text)

fig, ax = plt.subplots()
ax.imshow(wc, interpolation="bilinear")
ax.axis("off")  # hide ticks and frame
plt.show()


## Part 4: Streamlit Application (Code Only)

In [None]:

# Save this as app.py if you want to run it separately

import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt

# Streamlit re-executes the whole script on every widget interaction, so the
# parsed CSV is cached instead of being re-read each time. Streamlit releases
# that predate st.cache_data fall back to st.cache.
_cache = st.cache_data if hasattr(st, "cache_data") else st.cache


@_cache
def load_metadata(path="metadata.csv"):
    """Read the metadata CSV and add an integer ``year`` column.

    Rows whose ``publish_time`` is missing or unparseable are dropped: they
    have no year, so the year filter below could never select them.

    Args:
        path: Location of the CSV file to read.

    Returns:
        A DataFrame with ``publish_time`` parsed as datetimes and an integer
        ``year`` column.
    """
    data = pd.read_csv(path)
    data["publish_time"] = pd.to_datetime(data["publish_time"], errors="coerce")
    data = data.dropna(subset=["publish_time"])
    return data.assign(year=data["publish_time"].dt.year.astype(int))


df = load_metadata()

st.title("CORD-19 Data Explorer")
st.write("Interactive exploration of COVID-19 research papers")

# Without any dated rows there is no year range to offer
if df.empty:
    st.warning("No papers with a valid publication date were found.")
    st.stop()

min_year, max_year = int(df["year"].min()), int(df["year"].max())

if min_year < max_year:
    # st.slider rejects a default outside [min, max], so clamp the preferred
    # (2020, 2021) selection into the years actually present in the data.
    default_range = (
        min(max(2020, min_year), max_year),
        min(max(2021, min_year), max_year),
    )
    year_range = st.slider("Select year range", min_year, max_year, default_range)
else:
    # Only one year is present, so there is nothing to choose between
    year_range = (min_year, max_year)

filtered = df[(df["year"] >= year_range[0]) & (df["year"] <= year_range[1])]

if filtered.empty:
    st.info("No papers in the selected year range.")
    st.stop()

# DataFrame.sample raises when asked for more rows than exist
st.write(filtered.sample(min(5, len(filtered))))
st.bar_chart(filtered["year"].value_counts().sort_index())
st.bar_chart(filtered["journal"].value_counts().head(10))

# Example histogram of abstract lengths
fig, ax = plt.subplots()
filtered["abstract"].fillna("").apply(lambda x: len(x.split())).hist(bins=50, ax=ax)
ax.set_xlabel("Word Count")
ax.set_ylabel("Number of Papers")
st.pyplot(fig)
# Close the figure once rendered so pyplot does not keep one alive per rerun
plt.close(fig)


## Part 5: Reflection
- Learned data cleaning, visualization, and how to build an interactive dashboard.
- Insight: Surge of papers in 2020, many on preprint servers.
- Challenge: Handling missing values in abstracts and journals.
- Takeaway: Practiced full data science workflow with Python, pandas, matplotlib, and Streamlit.
