# CORD-19 Dataset Analysis
## COVID-19 Research Papers Exploration

This notebook contains the complete analysis of the CORD-19 dataset metadata.

## Part 1: Data Loading and Basic Exploration

In [None]:
# Import required libraries
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from collections import Counter
import re
from wordcloud import WordCloud
import warnings
# NOTE(review): this filter has no category or module restriction, so it
# silences every warning for the rest of the session -- including any pandas
# emits while parsing the CSV. Confirm that is intended.
warnings.filterwarnings('ignore')

# Set plotting style
# Start from matplotlib's default style, then select seaborn's "husl" palette.
# NOTE(review): the palette is presumably set second so that the style reset
# does not undo it -- keep this order unless verified otherwise.
plt.style.use('default')
sns.set_palette("husl")

In [None]:
# Read the metadata table from its CSV file into the working DataFrame `df`
metadata_path = 'asset/metadata.csv'
df = pd.read_csv(metadata_path)
print(f"Data loaded successfully! Shape: {df.shape}")

In [None]:
# Structural overview: (rows, columns) first, then every column label
table_shape = df.shape
print("Dataset dimensions:", table_shape)
# sep="\n" puts the label and the list on separate lines, one print call
print("\nColumn names:", df.columns.tolist(), sep="\n")

In [None]:
# Show the dtype pandas assigned to each column (header line, then the listing)
print("Data types:", df.dtypes, sep="\n")

In [None]:
# Preview the first five records; as the cell's last expression it is
# rendered as the cell output
df.head(n=5)

In [None]:
# Missing values analysis: per-column null count and its share of all rows
missing_values = df.isna().sum()
missing_percentage = missing_values.div(len(df)).mul(100)

# Keep only columns that actually have gaps, worst offenders first
missing_df = (
    pd.DataFrame(
        {
            'Missing Count': missing_values,
            'Missing Percentage': missing_percentage,
        }
    )
    .loc[lambda table: table['Missing Count'] > 0]
    .sort_values(by='Missing Count', ascending=False)
)
print("Missing values summary:", missing_df, sep="\n")

## Part 2: Data Cleaning and Preparation

In [None]:
# Create a cleaned version of the dataset; the raw `df` is left untouched
cleaned_df = df.copy()

# Convert publish_time to datetime and derive the publication year
if 'publish_time' in cleaned_df.columns:
    # Keep a handle on the raw values: anything the datetime parser rejects is
    # coerced to NaT below, and entries in a different layout from the rest
    # (e.g. a bare "2020") would otherwise lose their year entirely.
    raw_publish_time = cleaned_df['publish_time']
    cleaned_df['publish_time'] = pd.to_datetime(raw_publish_time, errors='coerce')
    parsed_year = cleaned_df['publish_time'].dt.year

    # Fallback: a leading four-digit year taken directly from the raw text.
    # Missing values stringify to 'nan', match nothing, and stay NaN.
    leading_year = pd.to_numeric(
        raw_publish_time.astype(str).str.extract(r'^\s*(\d{4})(?!\d)', expand=False),
        errors='coerce',
    )

    # Prefer the fully parsed date; use the fallback only where parsing failed
    cleaned_df['year'] = parsed_year.fillna(leading_year)
    print("Added year column from publish_time")

# Create abstract word count (whitespace-separated tokens).
# Missing abstracts are treated as empty text, so they get a count of 0.
if 'abstract' in cleaned_df.columns:
    cleaned_df['abstract_word_count'] = cleaned_df['abstract'].fillna('').str.split().str.len()
    print("Added abstract_word_count column")

# Remove rows with missing titles and report how many were dropped
if 'title' in cleaned_df.columns:
    initial_count = len(cleaned_df)
    cleaned_df = cleaned_df.dropna(subset=['title'])
    print(f"Removed {initial_count - len(cleaned_df)} rows with missing titles")

print(f"\nCleaned data shape: {cleaned_df.shape}")

## Part 3: Data Analysis and Visualization

In [None]:
# Publications by year: bar chart of paper counts plus the peak year
if 'year' in cleaned_df.columns:
    # The year column holds NaN wherever publish_time could not be parsed,
    # which makes it float. Drop those rows and cast to int so the axis and
    # the peak-year message show 2020 rather than 2020.0.
    year_counts = cleaned_df['year'].dropna().astype(int).value_counts().sort_index()

    if year_counts.empty:
        # idxmax() raises on an empty series, so skip the plot and the report
        print("No valid publication years available to plot")
    else:
        plt.figure(figsize=(12, 6))
        plt.bar(year_counts.index, year_counts.values, color='skyblue', edgecolor='navy')
        plt.title('COVID-19 Research Publications by Year', fontsize=16, fontweight='bold')
        plt.xlabel('Year', fontsize=12)
        plt.ylabel('Number of Publications', fontsize=12)
        plt.xticks(rotation=45)
        plt.grid(axis='y', alpha=0.3)
        plt.tight_layout()
        plt.show()

        print(f"Peak publication year: {year_counts.idxmax()} with {year_counts.max()} papers")

In [None]:
# Top journals: horizontal bar chart of the ten most frequent journal names
if 'journal' in cleaned_df.columns:
    top_journals = cleaned_df['journal'].value_counts().iloc[:10]
    bar_positions = range(len(top_journals))

    # Same chart as the pyplot state-machine calls, built on explicit
    # figure/axes objects
    fig, ax = plt.subplots(figsize=(12, 8))
    ax.barh(bar_positions, top_journals.values, color='lightcoral')
    ax.set_yticks(bar_positions)
    ax.set_yticklabels(top_journals.index)
    ax.set_title('Top 10 Journals Publishing COVID-19 Research', fontsize=16, fontweight='bold')
    ax.set_xlabel('Number of Publications', fontsize=12)
    ax.grid(axis='x', alpha=0.3)
    fig.tight_layout()
    plt.show()

    # value_counts sorts descending, so the first entry is the leader
    print(f"Top journal: {top_journals.index[0]} with {top_journals.iloc[0]} papers")

In [None]:
# Most frequent words in titles: bar chart of the 15 most common title words
if 'title' in cleaned_df.columns:
    # Concatenate every title, then pull out lowercase alphabetic tokens of
    # at least three letters
    all_titles = ' '.join(cleaned_df['title'].fillna('').astype(str))
    words = re.findall(r'\b[a-zA-Z]{3,}\b', all_titles.lower())

    # Remove common stop words (one- and two-letter words never reach this
    # point because the regex already requires three letters)
    stop_words = {
        'the', 'and', 'for', 'are', 'with', 'this', 'that', 'from', 'they',
        'been', 'have', 'were', 'said', 'each', 'which', 'their', 'time',
        'will', 'about', 'can', 'when', 'make', 'like', 'into', 'him', 'has',
        'two', 'more', 'her', 'would', 'there', 'could', 'way', 'who', 'its',
        'now', 'find', 'long', 'down', 'day', 'did', 'get', 'come', 'made',
        'may', 'part',
    }
    filtered_words = [word for word in words if word not in stop_words]

    word_freq = Counter(filtered_words).most_common(15)

    if not word_freq:
        # zip(*[]) yields nothing, so the two-name unpacking below would raise
        # ValueError; report and skip the chart instead
        print("No title words left to plot after stop-word filtering")
    else:
        words_list, counts_list = zip(*word_freq)

        plt.figure(figsize=(12, 8))
        plt.barh(range(len(words_list)), counts_list, color='lightgreen')
        plt.yticks(range(len(words_list)), words_list)
        plt.title('Top 15 Most Frequent Words in Paper Titles', fontsize=16, fontweight='bold')
        plt.xlabel('Frequency', fontsize=12)
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()
        plt.show()

In [None]:
# Word cloud rendered from the concatenated paper titles
if 'title' in cleaned_df.columns:
    title_texts = cleaned_df['title'].fillna('').astype(str)
    all_titles = ' '.join(title_texts)

    # Configure the generator first, then feed it the combined text
    cloud_settings = {
        'width': 1200,
        'height': 600,
        'background_color': 'white',
        'colormap': 'viridis',
    }
    wordcloud = WordCloud(**cloud_settings).generate(all_titles)

    plt.figure(figsize=(15, 8))
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis('off')  # pixel axes carry no meaning for a word cloud
    plt.title('Word Cloud of Paper Titles', fontsize=20, fontweight='bold', pad=20)
    plt.tight_layout()
    plt.show()

In [None]:
# Distribution by source: pie chart of the eight most frequent source_x values
if 'source_x' in cleaned_df.columns:
    source_counts = cleaned_df['source_x'].value_counts().iloc[:8]

    # One Set3 colour per slice, picked by integer position in the colormap
    slice_count = len(source_counts)
    colors = plt.cm.Set3(range(slice_count))

    plt.figure(figsize=(10, 8))
    plt.pie(
        source_counts.values,
        labels=source_counts.index,
        colors=colors,
        autopct='%1.1f%%',
        startangle=90,
    )
    plt.title('Distribution of Papers by Source (Top 8)', fontsize=16, fontweight='bold')
    plt.axis('equal')  # equal aspect ratio keeps the pie circular
    plt.tight_layout()
    plt.show()

## Part 4: Summary Statistics and Insights

In [None]:
# Generate summary report: headline statistics for the cleaned dataset,
# followed by a fixed list of qualitative takeaways
print("=== CORD-19 DATASET ANALYSIS SUMMARY ===")
total_papers = len(cleaned_df)
print(f"Total papers analyzed: {total_papers:,}")

if 'year' in cleaned_df.columns:
    # Ignore rows without a usable year so the range never prints "nan - nan"
    known_years = cleaned_df['year'].dropna()
    if known_years.empty:
        year_range = "unknown"
    else:
        year_range = f"{known_years.min():.0f} - {known_years.max():.0f}"
    print(f"Publication years: {year_range}")

if 'journal' in cleaned_df.columns:
    unique_journals = cleaned_df['journal'].nunique()
    print(f"Unique journals: {unique_journals:,}")

if 'abstract_word_count' in cleaned_df.columns:
    abstract_lengths = cleaned_df['abstract_word_count']
    # Missing abstracts were given a word count of 0 during cleaning; average
    # only over papers that have an abstract so those zeros do not pull the
    # mean down.
    if 'abstract' in cleaned_df.columns:
        abstract_lengths = abstract_lengths[cleaned_df['abstract'].notna()]
    if abstract_lengths.empty:
        print("Average abstract length: n/a (no abstracts available)")
    else:
        avg_abstract_length = abstract_lengths.mean()
        print(f"Average abstract length: {avg_abstract_length:.1f} words")

if 'authors' in cleaned_df.columns:
    papers_with_authors = cleaned_df['authors'].notna().sum()
    # Avoid a 0/0 share (printed as "nan%") when the cleaned dataset is empty
    author_share = papers_with_authors / total_papers * 100 if total_papers else 0.0
    print(f"Papers with author information: {papers_with_authors:,} ({author_share:.1f}%)")

# NOTE(review): the insights below are fixed text, not computed from the data
print("\n=== KEY INSIGHTS ===")
print("1. The dataset provides comprehensive metadata for COVID-19 research papers")
print("2. Publication activity shows clear temporal patterns related to the pandemic")
print("3. Research is distributed across multiple journals and sources")
print("4. Title analysis reveals key research themes and focus areas")