# CORD-19 â€” Analysis Notebook

This notebook implements the assignment: Part 1 (loading & exploration), Part 2 (cleaning), Part 3 (analysis & exports), Part 4 (Streamlit hooks), and Part 5 (short reflection). It prefers a small `metadata_sample.csv` for fast iteration but can create one from the full `metadata.csv` if present. Outputs are written to an `outputs/` folder at the repo root.

In [1]:
# Part 1: Robust loading of the sample or original metadata file
import pandas as pd
from pathlib import Path
import sys

def find_or_create_sample(nrows=50000):
    candidates = [
        Path('metadata_sample.csv'),
        Path.cwd() / 'metadata_sample.csv',
        Path.cwd().parent / 'metadata_sample.csv',
        Path('metadata.csv'),
        Path.cwd() / 'data' / 'metadata.csv',
        Path.cwd().parent / 'metadata.csv',
        Path.cwd().parent / 'data' / 'metadata.csv',
    ]
    # try direct sample locations first
    for p in candidates[:3]:
        if p.exists():
            return p
    # if no sample, try to make one from metadata.csv
    for o in candidates[3:]:
        if o.exists():
            try:
                print('Creating sample from', o)
                df = pd.read_csv(o, low_memory=False, nrows=nrows)
                out = Path.cwd() / 'metadata_sample.csv'
                df.to_csv(out, index=False)
                return out
            except Exception as e:
                print('Failed to create sample:', e)
                continue
    return None

sample_path = find_or_create_sample()
if sample_path is None:
    raise FileNotFoundError('No metadata_sample.csv or metadata.csv found in expected locations. Place dataset in project root or data/ folder.')

print('Loading', sample_path)
df = pd.read_csv(sample_path, low_memory=False)
print('Loaded shape:', df.shape)

Loading metadata_sample.csv
Loaded shape: (50000, 19)


In [2]:
# Quick exploration checks (Part 1)
print('Columns:', df.columns.tolist())
print('\nMissing values (top 10 cols):')
print(df.isnull().sum().sort_values(ascending=False).head(10))
print('\nData types:')
print(df.dtypes.value_counts())
print('\nFirst 5 rows:')
print(df.head())

Columns: ['cord_uid', 'sha', 'source_x', 'title', 'doi', 'pmcid', 'pubmed_id', 'license', 'abstract', 'publish_time', 'authors', 'journal', 'mag_id', 'who_covidence_id', 'arxiv_id', 'pdf_json_files', 'pmc_json_files', 'url', 's2_id']

Missing values (top 10 cols):
who_covidence_id    50000
arxiv_id            50000
s2_id               50000
mag_id              50000
pubmed_id           16876
pmc_json_files      14542
abstract            13997
sha                 10257
pdf_json_files      10257
authors              4630
dtype: int64

Data types:
object     15
float64     4
Name: count, dtype: int64

First 5 rows:
   cord_uid                                       sha source_x  \
0  ug7v899j  d1aafb70c066a2068b02786f8929fd9c900897fb      PMC   
1  02tnwd4m  6b0567729c2143a66d737eb0a2f63f2dce2e5a7d      PMC   
2  ejv2xln0  06ced00a5fc04215949aa72528f2eeaae1d58927      PMC   
3  2b73a28n  348055649b6b8cf2b9a376498df9bf41f7123605      PMC   
4  9785vg6d  5f48792a5fa08bed9f56016f4981ae2ca6031

In [3]:
# Part 2: Cleaning
def clean_metadata(df):
    d = df.copy()
    # publish_time -> datetime and year
    if 'publish_time' in d.columns:
        d['publish_time'] = pd.to_datetime(d['publish_time'], errors='coerce')
        d['year'] = d['publish_time'].dt.year
    else:
        d['year'] = None
    # title required for many analyses
    if 'title' in d.columns:
        d = d.dropna(subset=['title'])
        d['title'] = d['title'].astype(str)
        d['title_word_count'] = d['title'].str.split().str.len()
    else:
        d['title_word_count'] = 0
    # abstract word count
    if 'abstract' in d.columns:
        d['abstract_word_count'] = d['abstract'].fillna('').astype(str).str.split().str.len()
    else:
        d['abstract_word_count'] = 0
    # common fills
    if 'journal' in d.columns:
        d['journal'] = d['journal'].fillna('Unknown')
    if 'source_x' in d.columns:
        d['source_x'] = d['source_x'].fillna('Unknown')
    # dedupe heuristics
    if 's2_id' in d.columns:
        d = d.drop_duplicates(subset=['s2_id'])
    elif 'doi' in d.columns:
        d = d.drop_duplicates(subset=['doi'])
    else:
        d = d.drop_duplicates(subset=['title'])
    return d

df_clean = clean_metadata(df)
print('After cleaning shape:', df_clean.shape)

After cleaning shape: (1, 22)


In [4]:
# Save cleaned file to project root
out = Path.cwd() / 'metadata_cleaned.csv'
df_clean.to_csv(out, index=False)
print('Wrote cleaned csv to', out)

Wrote cleaned csv to d:\dataC transfer\xampp\htdocs\python_frameworks_assignment\metadata_cleaned.csv


In [5]:
# Part 3: Basic analysis and headless export (time series, top journals, n-grams, wordcloud)
import os, re
from collections import Counter
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
sns.set_style('whitegrid')
outputs = Path.cwd() / 'outputs'
outputs.mkdir(exist_ok=True)
# publications by year
if 'year' in df_clean.columns and df_clean['year'].notna().any():
    yearly = df_clean.groupby('year').size().sort_index()
    fig, ax = plt.subplots(figsize=(8,4))
    ax.bar(yearly.index.astype(int), yearly.values, color='tab:blue')
    ax.set_title('Publications by Year')
    fig.savefig(outputs / 'publications_by_year.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print('Wrote publications_by_year.png')
# top journals
if 'journal' in df_clean.columns:
    topj = df_clean['journal'].value_counts().head(20)
    fig, ax = plt.subplots(figsize=(8,6))
    sns.barplot(x=topj.values, y=topj.index, palette='mako', ax=ax)
    fig.savefig(outputs / 'top_journals.png', dpi=150, bbox_inches='tight')
    plt.close(fig)
    print('Wrote top_journals.png')
# top unigrams/bigrams from titles
stop = set(WordCloud().stopwords)
def tokenize(s):
    s = re.sub(r'[^a-z0-9\s]', ' ', str(s).lower())
    toks = [t for t in s.split() if t not in stop and len(t)>1]
    return toks
unis = Counter()
bis = Counter()
for t in df_clean['title'].dropna().astype(str):
    toks = tokenize(t)
    unis.update(toks)
    bis.update([' '.join(x) for x in zip(toks, toks[1:])])
top_unis = unis.most_common(30)
top_bis = bis.most_common(30)
import csv
with open(outputs / 'top_unigrams.csv', 'w', newline='', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerow(['unigram','count'])
    writer.writerows(top_unis)
with open(outputs / 'top_bigrams.csv', 'w', newline='', encoding='utf-8') as fh:
    writer = csv.writer(fh)
    writer.writerow(['bigram','count'])
    writer.writerows(top_bis)
print('Wrote n-gram CSVs')
# wordcloud image
text = ' '.join(df_clean['title'].dropna().astype(str).tolist())
wc = WordCloud(width=1200, height=600, background_color='white', stopwords=stop, max_words=200).generate(text)
fig = plt.figure(figsize=(16,8))
plt.imshow(wc, interpolation='bilinear')
plt.axis('off')
fig.savefig(outputs / 'wordcloud_improved.png', dpi=150, bbox_inches='tight')
plt.close(fig)
print('Wrote wordcloud_improved.png')
print('Outputs written to', outputs)

Wrote publications_by_year.png



Passing `palette` without assigning `hue` is deprecated and will be removed in v0.14.0. Assign the `y` variable to `hue` and set `legend=False` for the same effect.

  sns.barplot(x=topj.values, y=topj.index, palette='mako', ax=ax)


Wrote top_journals.png
Wrote n-gram CSVs
Wrote wordcloud_improved.png
Outputs written to d:\dataC transfer\xampp\htdocs\python_frameworks_assignment\outputs


## Part 4: Streamlit integration notes
The Streamlit app `app.py` is provided at the repo root and reads the same `metadata_sample.csv` (or `metadata.csv`) used by this notebook. Run `streamlit run app.py` to view the interactive app.

## Part 5: Reflection
Write a short reflection here about key findings and challenges (placeholder).