# CORD-19 COVID-19 Research Paper Analysis

This notebook provides a comprehensive analysis of the CORD-19 dataset, focusing on metadata exploration, data cleaning, visualization, and insights about COVID-19 research publications.

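## Assignment Overview

The assignment has five parts, listed below. The notebook sections that follow are numbered separately (Part 1 through Part 10), so a section number does not correspond to the assignment part with the same number.

- **Part 1**: Data Loading and Basic Exploration
- **Part 2**: Data Cleaning and Preparation
- **Part 3**: Data Analysis and Visualization
- **Part 4**: Streamlit Application Development
- **Part 5**: Documentation and Reflection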

---

## Part 1: Environment Setup and Data Loading

In [None]:
# Import required libraries
import pandas as pd   # tabular data loading and manipulation
import numpy as np # numerical operations
import matplotlib.pyplot as plt # plotting
import seaborn as sns # statistical plotting; used below to set the default color palette
from collections import Counter # word-frequency counting
import re # regular expressions
import warnings # warning filter control
# NOTE(review): this silences every warning for the rest of the session,
# including any pandas parsing or deprecation warnings — consider narrowing it.
warnings.filterwarnings('ignore') # ignore warnings for cleaner output

# Set display options
# Show every column when a DataFrame is displayed, and allow wide text output.
pd.set_option('display.max_columns', None)
pd.set_option('display.width', 1000)
# NOTE(review): the 'seaborn-v0_8' style name presumably needs a recent
# matplotlib release — confirm against the installed version.
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

print("All libraries imported successfully!")
print("Ready for CORD-19 data analysis")

✅ All libraries imported successfully!
📊 Ready for CORD-19 data analysis


In [None]:
# Load the CORD-19 metadata
print("Loading CORD-19 metadata...")
print("Note: This may take a few minutes due to large file size")

# Start from "not loaded": every later cell checks `df is not None` before
# touching the data, so a failed read leaves the notebook in a safe state.
df = None
try:
    # low_memory=False is used here to cope with mixed-type columns.
    df = pd.read_csv('metadata.csv', low_memory=False)
except FileNotFoundError:
    print("Error: metadata.csv file not found!")
    print("Please ensure the CORD-19 metadata.csv file is in the current directory")
except Exception as e:
    print(f"Error loading file: {e}")
else:
    # Only reached when the read succeeded.
    print(f"Successfully loaded {len(df):,} records")
    print(f"Dataset shape: {df.shape}")

📁 Loading CORD-19 metadata...
⚠️  Note: This may take a few minutes due to large file size


## Part 2: Basic Data Exploration

In [None]:
# Basic information about the dataset
# Prints the row/column counts, in-memory size, column names and a dtype
# summary of the raw frame `df`; prints a notice instead when `df` is None.
if df is not None:
    print("=" * 60)
    print("CORD-19 DATASET OVERVIEW")
    print("=" * 60)

    # Dataset dimensions
    print(f"Rows: {df.shape[0]:,}")
    print(f"Columns: {df.shape[1]:,}")
    # deep=True makes pandas inspect object-dtype columns and count the memory
    # held by the Python objects stored in them. Without it that memory is
    # left out, which under-reports a table made mostly of text columns.
    print(f"Memory usage: {df.memory_usage(deep=True).sum() / 1024**2:.1f} MB")

    print("\n" + "=" * 60)
    print("COLUMN INFORMATION")
    print("=" * 60)

    # Column names and types
    print(f"Column names ({len(df.columns)} total):")
    for i, col in enumerate(df.columns, 1):
        print(f"{i:2d}. {col}")

    print(f"\nData types summary:")
    print(df.dtypes.value_counts())
else:
    print("Cannot explore data - dataset not loaded")

In [None]:
# Display first and last few rows
# Shows the first three rows, the last three rows, and summary statistics for
# whatever numeric columns the raw frame has.
if df is None:
    print("Cannot display data - dataset not loaded")
else:
    rule = "=" * 60

    print(rule)
    print("FIRST 3 ROWS")
    print(rule)
    display(df.head(3))

    print("\n" + rule)
    print("LAST 3 ROWS")
    print(rule)
    display(df.tail(3))

    print("\n" + rule)
    print("BASIC STATISTICS FOR NUMERICAL COLUMNS")
    print(rule)
    # Select the numeric columns once; describe() only makes sense if any exist.
    numeric_frame = df.select_dtypes(include=[np.number])
    if numeric_frame.shape[1] > 0:
        display(numeric_frame.describe())
    else:
        print("No numerical columns found")

## Part 3: Data Cleaning and Missing Values

In [None]:
# Analyze missing values
# Builds a per-column table of null counts and percentages, lists the columns
# that are more than half empty, and reports the key analysis columns.
if df is None:
    print("Cannot analyze missing values - dataset not loaded")
else:
    rule = "=" * 60
    print(rule)
    print("MISSING VALUES ANALYSIS")
    print(rule)

    total_rows = len(df)
    # One pass over the frame; both the table and the key-column report reuse it.
    null_counts = df.isnull().sum()

    missing_data = pd.DataFrame({
        'Column': df.columns,
        'Missing_Count': null_counts,
        'Missing_Percentage': (null_counts / total_rows) * 100
    }).sort_values('Missing_Percentage', ascending=False)

    print(f"Missing values summary:")
    display(missing_data[missing_data['Missing_Count'] > 0])

    # Identify columns with high missing percentages
    high_missing = missing_data[missing_data['Missing_Percentage'] > 50]
    if len(high_missing) > 0:
        print(f"\nColumns with >50% missing values ({len(high_missing)} columns):")
        for column_name, pct in zip(high_missing['Column'], high_missing['Missing_Percentage']):
            print(f"   • {column_name}: {pct:.1f}% missing")

    # Key columns for analysis
    key_columns = ['title', 'abstract', 'authors', 'journal', 'publish_time']
    print(f"\nMissing values in key columns:")
    for col in key_columns:
        if col not in df.columns:
            print(f"   • {col}: Column not found")
            continue
        missing_pct = (null_counts[col] / total_rows) * 100
        print(f"   • {col}: {missing_pct:.1f}% missing")

In [None]:
# Create cleaned dataset
# Produces `df_clean` from the raw frame: rows without a title are dropped and
# missing abstract/journal/authors values are replaced by placeholders.
# `df_clean` is set to None when the raw frame was never loaded.
if df is None:
    print("Cannot create cleaned dataset - original dataset not loaded")
    df_clean = None
else:
    print("=" * 60)
    print("CREATING CLEANED DATASET")
    print("=" * 60)

    # Work on a copy so the raw frame stays available for before/after counts.
    df_clean = df.copy()
    print(f"Original dataset: {len(df_clean):,} rows")

    # Remove rows where title is missing (essential for analysis)
    if 'title' in df_clean.columns:
        rows_before = len(df_clean)
        df_clean = df_clean.dropna(subset=['title'])
        print(f"Removed {rows_before - len(df_clean):,} rows with missing titles")
        print(f"After title cleanup: {len(df_clean):,} rows")

    # Placeholder per text column: empty string for abstracts (text analysis),
    # "Unknown" for journal and author names.
    placeholders = {'abstract': '', 'journal': 'Unknown', 'authors': 'Unknown'}
    for column, placeholder in placeholders.items():
        if column in df_clean.columns:
            df_clean[column] = df_clean[column].fillna(placeholder)

    rows_dropped = len(df) - len(df_clean)
    print(f"Cleaned dataset ready: {len(df_clean):,} rows")
    print(f"Data reduction: {(rows_dropped / len(df) * 100):.1f}%")

## Part 4: Data Preparation and Feature Engineering

In [None]:
# Data preparation and feature engineering
# Adds derived columns to `df_clean`: a parsed `publish_time`, the
# `publication_year`, word counts for abstract and title, and a normalized
# `journal_clean`. Each step is skipped when its source column is absent.
if df_clean is not None:
    print("=" * 60)
    print("FEATURE ENGINEERING")
    print("=" * 60)

    # Convert publish_time to datetime
    if 'publish_time' in df_clean.columns:
        print("Converting publication dates...")
        raw_dates = df_clean['publish_time']
        parsed_dates = pd.to_datetime(raw_dates, errors='coerce')

        # pandas >= 2.0 infers a single format from the first value and, with
        # errors='coerce', turns values in any other format into NaT. Retry the
        # rejected values as bare four-digit years so they are not lost.
        # NOTE(review): assumes year-only strings such as "2020" occur in this
        # column — confirm against the data.
        unparsed = parsed_dates.isna() & raw_dates.notna()
        if unparsed.any():
            year_only = pd.to_datetime(
                raw_dates[unparsed].astype(str).str.strip(),
                format='%Y',
                errors='coerce',
            )
            parsed_dates = parsed_dates.fillna(year_only)
        df_clean['publish_time'] = parsed_dates

        # Extract year from publication date (float column: NaT becomes NaN)
        df_clean['publication_year'] = df_clean['publish_time'].dt.year

        # Count (not filter) the papers whose year falls in 2000-2024;
        # no rows are removed here.
        valid_years = df_clean['publication_year'].between(2000, 2024, na=False)
        print(f"Papers with valid publication years: {valid_years.sum():,}")

        # Show year distribution
        year_counts = df_clean['publication_year'].value_counts().sort_index()
        if year_counts.empty:
            print("Publication years range: no parseable publication dates")
        else:
            print(f"Publication years range: {year_counts.index.min():.0f} - {year_counts.index.max():.0f}")

    # Create abstract word count (whitespace-separated tokens; empty -> 0)
    if 'abstract' in df_clean.columns:
        print("Calculating abstract word counts...")
        df_clean['abstract_word_count'] = (
            df_clean['abstract'].fillna('').astype(str).str.split().str.len()
        )
        print(f"Average abstract length: {df_clean['abstract_word_count'].mean():.1f} words")

    # Create title word count
    if 'title' in df_clean.columns:
        print("Calculating title word counts...")
        df_clean['title_word_count'] = (
            df_clean['title'].fillna('').astype(str).str.split().str.len()
        )
        print(f"Average title length: {df_clean['title_word_count'].mean():.1f} words")

    # Clean journal names
    if 'journal' in df_clean.columns:
        print("Processing journal names...")
        # Strip surrounding whitespace and title-case so spelling variants merge
        df_clean['journal_clean'] = df_clean['journal'].str.strip().str.title()
        unique_journals = df_clean['journal_clean'].nunique()
        print(f"Number of unique journals: {unique_journals:,}")

    print("Feature engineering completed!")

else:
    print("Cannot perform feature engineering - cleaned dataset not available")

## Part 5: Publication Trends Analysis

In [None]:
# Analyze publication trends over time
# Reports paper counts per year for 2019-2022, the change between successive
# years in that window, the overall peak year, and the 2020-2022 total.
if df_clean is None or 'publication_year' not in df_clean.columns:
    print("Cannot analyze publication trends - data not available")
else:
    rule = "=" * 60
    print(rule)
    print("PUBLICATION TRENDS ANALYSIS")
    print(rule)

    # Papers per year, ordered chronologically
    yearly_counts = df_clean['publication_year'].value_counts().sort_index()
    all_years = yearly_counts.index

    # Filter for COVID-era (2019-2022) to focus on pandemic research
    covid_era = yearly_counts[(all_years >= 2019) & (all_years <= 2022)]

    print("Publications by year (COVID era - 2019-2022):")
    for year, count in covid_era.items():
        if pd.isna(year):
            continue
        print(f"   {int(year)}: {count:,} papers")

    # Growth between each pair of neighbouring years present in the window
    if len(covid_era) > 1:
        print(f"\nGrowth analysis:")
        era_points = list(covid_era.items())
        for (prev_year, prev_count), (curr_year, curr_count) in zip(era_points[:-1], era_points[1:]):
            growth_rate = ((curr_count - prev_count) / prev_count) * 100
            print(f"   {int(prev_year)} → {int(curr_year)}: {growth_rate:+.1f}% change")

    # Find peak research year (over all years, not just the window)
    if len(yearly_counts) > 0:
        peak_year = yearly_counts.idxmax()
        peak_count = yearly_counts.max()
        print(f"\nPeak research year: {int(peak_year)} with {peak_count:,} papers")

    # Show recent years trend (2020-2022)
    recent_years = yearly_counts[(all_years >= 2020) & (all_years <= 2022)]
    if len(recent_years) > 0:
        print(f"\nCOVID-19 research output (2020-2022): {recent_years.sum():,} papers")

## Part 6: Journal and Source Analysis

In [None]:
# Analyze journals and publication sources
# Ranks journals by paper count, summarizes how concentrated publication is,
# and breaks papers down by the first available source column.
if df_clean is None:
    print("Cannot analyze journals - data not available")
else:
    print("=" * 60)
    print("JOURNAL AND SOURCE ANALYSIS")
    print("=" * 60)

    paper_total = len(df_clean)

    if 'journal_clean' in df_clean.columns:
        # Count once; every statistic below is derived from this Series.
        journal_counts = df_clean['journal_clean'].value_counts()
        top_journals = journal_counts.head(15)

        print("Top 15 journals publishing COVID-19 research:")
        for i, (journal, count) in enumerate(top_journals.items(), 1):
            percentage = (count / paper_total) * 100
            print(f"{i:2d}. {journal}: {count:,} papers ({percentage:.1f}%)")

        # Journal diversity analysis
        total_journals = df_clean['journal_clean'].nunique()
        single_paper_journals = (journal_counts == 1).sum()
        large_journals = (journal_counts > 100).sum()
        print(f"\nJournal diversity:")
        print(f"   • Total unique journals: {total_journals:,}")
        print(f"   • Journals with only 1 paper: {single_paper_journals:,}")
        print(f"   • Journals with >100 papers: {large_journals:,}")

        # Concentration analysis
        top_10_share = (top_journals.iloc[:10].sum() / paper_total) * 100
        print(f"   • Top 10 journals represent: {top_10_share:.1f}% of all papers")

    # Analyze by source (if available): first candidate column that exists wins
    source_columns = ['source_x', 'source', 'database_name']
    source_col = next((col for col in source_columns if col in df_clean.columns), None)

    if source_col is None:
        print(f"\nNo source column found")
    else:
        print(f"\nAnalysis by source ({source_col}):")
        source_counts = df_clean[source_col].value_counts().head(10)
        for source, count in source_counts.items():
            percentage = (count / paper_total) * 100
            print(f"   • {source}: {count:,} papers ({percentage:.1f}%)")

## Part 7: Title Text Analysis

In [None]:
# Analyze words in paper titles
# Tokenizes every title, drops the stop words, and reports the most frequent
# words plus a fixed list of pandemic-related terms. Leaves `stop_words`,
# `word_counts` and `top_words_for_viz` behind for the plotting cells.
if df_clean is None or 'title' not in df_clean.columns:
    print("Cannot analyze titles - data not available")
    top_words_for_viz = {}
else:
    print("=" * 60)
    print("TITLE TEXT ANALYSIS")
    print("=" * 60)

    def extract_words(titles):
        """Return every lowercase alphabetic word of 3+ letters found in the non-null titles."""
        word_pattern = re.compile(r'\b[a-zA-Z]{3,}\b')
        return [
            word
            for title in titles.dropna()
            for word in word_pattern.findall(str(title).lower())
        ]

    # Extract all words from titles
    print("Extracting words from titles...")
    title_words = extract_words(df_clean['title'])

    # Define common stop words to exclude
    stop_words = {
        'the', 'and', 'for', 'are', 'with', 'from', 'this', 'that', 'was', 'were',
        'been', 'have', 'has', 'had', 'will', 'would', 'could', 'should', 'may',
        'can', 'study', 'analysis', 'research', 'using', 'based', 'new', 'case',
        'cases', 'patients', 'patient', 'clinical', 'systematic', 'meta', 'review'
    }

    # Filter out stop words, then tally what is left
    filtered_words = list(filter(lambda word: word not in stop_words, title_words))
    word_counts = Counter(filtered_words)
    kept_total = len(filtered_words)

    print(f"Total words found: {len(title_words):,}")
    print(f"Unique words (after filtering): {len(word_counts):,}")

    # Show most frequent words; percentages are relative to the filtered total
    most_common = word_counts.most_common(20)
    print(f"\nTop 20 most frequent words in titles:")
    for i, (word, count) in enumerate(most_common, 1):
        print(f"{i:2d}. {word}: {count:,} ({(count / kept_total) * 100:.1f}%)")

    # COVID-related terms analysis (terms that never occur are not printed)
    covid_terms = ['covid', 'coronavirus', 'sars', 'pandemic', 'outbreak', 'epidemic']
    print(f"\nCOVID-19 related terms frequency:")
    for term in covid_terms:
        count = word_counts.get(term, 0)
        if count <= 0:
            continue
        print(f"   • {term}: {count:,} ({(count / kept_total) * 100:.1f}%)")

    # Store word counts for visualization
    top_words_for_viz = dict(most_common[:15])

## Part 8: Data Visualizations

In [None]:
# Create visualizations
# Draws a 2x3 dashboard from `df_clean`: papers per year with a trend line,
# top journals, top title words, abstract-length histogram, papers per month
# for 2020-2021, and a pie of the 2019-2022 split. A panel is only created
# when the column it needs exists.
if df_clean is not None:
    print("=" * 60)
    print("CREATING VISUALIZATIONS")
    print("=" * 60)

    # Set up the plotting style
    plt.style.use('seaborn-v0_8')
    fig = plt.figure(figsize=(20, 15))

    # 1. Publications over time
    if 'publication_year' in df_clean.columns:
        ax1 = fig.add_subplot(2, 3, 1)
        yearly_counts = df_clean['publication_year'].value_counts().sort_index()
        # 2015-2022 window; kept in its own name so the 2019-2022 `covid_era`
        # Series from the trends cell is not overwritten.
        trend_window = yearly_counts[(yearly_counts.index >= 2015) & (yearly_counts.index <= 2022)]

        ax1.bar(trend_window.index, trend_window.values, color='steelblue', alpha=0.7)
        ax1.set_title('COVID-19 Research Publications Over Time', fontsize=14, fontweight='bold')
        ax1.set_xlabel('Publication Year')
        ax1.set_ylabel('Number of Papers')
        ax1.tick_params(axis='x', labelrotation=45)

        # Add trend line. A quadratic needs at least three points to be
        # determined, and the years are centered before fitting because raw
        # values around 2000 make the least-squares problem poorly conditioned.
        if len(trend_window) > 2:
            years = trend_window.index.to_numpy(dtype=float)
            centered_years = years - years.mean()
            coefficients = np.polyfit(centered_years, trend_window.values, 2)
            ax1.plot(years, np.polyval(coefficients, centered_years), "r--", alpha=0.8, linewidth=2)

        ax1.grid(True, alpha=0.3)

    # 2. Top journals
    if 'journal_clean' in df_clean.columns:
        ax2 = fig.add_subplot(2, 3, 2)
        top_journals = df_clean['journal_clean'].value_counts().head(10)
        bar_positions = range(len(top_journals))

        ax2.barh(bar_positions, top_journals.values, color='forestgreen', alpha=0.7)
        ax2.set_yticks(bar_positions)
        # Truncate long journal names so the labels do not squeeze the bars
        ax2.set_yticklabels([j[:30] + '...' if len(j) > 30 else j for j in top_journals.index])
        ax2.set_title('Top 10 Journals Publishing COVID-19 Research', fontsize=14, fontweight='bold')
        ax2.set_xlabel('Number of Papers')
        ax2.grid(True, alpha=0.3)

    # 3. Word frequency in titles (only if the title-analysis cell has run)
    title_word_freq = globals().get('top_words_for_viz')
    if title_word_freq:
        ax3 = fig.add_subplot(2, 3, 3)
        words = list(title_word_freq.keys())[:10]
        counts = list(title_word_freq.values())[:10]

        ax3.barh(range(len(words)), counts, color='coral', alpha=0.7)
        ax3.set_yticks(range(len(words)))
        ax3.set_yticklabels(words)
        ax3.set_title('Most Frequent Words in Titles', fontsize=14, fontweight='bold')
        ax3.set_xlabel('Frequency')
        ax3.grid(True, alpha=0.3)

    # 4. Abstract length distribution
    if 'abstract_word_count' in df_clean.columns:
        ax4 = fig.add_subplot(2, 3, 4)
        # Filter out outliers for better visualization. Named `abstract_lengths`
        # so the title-word Counter `word_counts` is left intact.
        abstract_lengths = df_clean.loc[df_clean['abstract_word_count'] <= 500, 'abstract_word_count']

        ax4.hist(abstract_lengths, bins=50, color='purple', alpha=0.7, edgecolor='black')
        ax4.set_title('Distribution of Abstract Lengths', fontsize=14, fontweight='bold')
        ax4.set_xlabel('Number of Words')
        ax4.set_ylabel('Number of Papers')
        ax4.axvline(abstract_lengths.mean(), color='red', linestyle='--', linewidth=2,
                    label=f'Mean: {abstract_lengths.mean():.0f} words')
        ax4.legend()
        ax4.grid(True, alpha=0.3)

    # 5. Publication by month (needs both the parsed date and the derived year)
    if 'publish_time' in df_clean.columns and 'publication_year' in df_clean.columns:
        ax5 = fig.add_subplot(2, 3, 5)
        # Focus on 2020-2021 for COVID-19 research patterns
        covid_papers = df_clean[df_clean['publication_year'].isin([2020, 2021])].copy()
        if not covid_papers.empty:
            covid_papers['month'] = covid_papers['publish_time'].dt.month
            monthly_counts = covid_papers['month'].value_counts().sort_index()

            month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                           'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']

            ax5.bar(monthly_counts.index, monthly_counts.values, color='teal', alpha=0.7)
            ax5.set_title('COVID-19 Research by Month (2020-2021)', fontsize=14, fontweight='bold')
            ax5.set_xlabel('Month')
            ax5.set_ylabel('Number of Papers')
            ax5.set_xticks(range(1, 13))
            ax5.set_xticklabels(month_names)
            ax5.grid(True, alpha=0.3)

    # 6. Year comparison pie chart
    if 'publication_year' in df_clean.columns:
        ax6 = fig.add_subplot(2, 3, 6)
        key_years = [2019, 2020, 2021, 2022]
        year_data = []
        year_labels = []

        # Years without any paper are left out of the pie entirely
        for year in key_years:
            count = (df_clean['publication_year'] == year).sum()
            if count > 0:
                year_data.append(count)
                year_labels.append(f'{year}\n({count:,})')

        colors = ['lightblue', 'orange', 'lightgreen', 'pink']
        if year_data:
            ax6.pie(year_data, labels=year_labels, autopct='%1.1f%%', colors=colors[:len(year_data)])
        else:
            # Nothing to slice: say so instead of drawing an empty pie
            ax6.text(0.5, 0.5, 'No papers in 2019-2022', ha='center', va='center')
            ax6.axis('off')
        ax6.set_title('Research Distribution by Key Years', fontsize=14, fontweight='bold')

    fig.tight_layout()
    plt.show()

    print("All visualizations created successfully!")

else:
    print("Cannot create visualizations - data not available")

In [None]:
# Create Word Cloud (if wordcloud package is available)
# Renders a word cloud of all paper titles. When the optional `wordcloud`
# package is missing, falls back to a bar chart of the top title words.
try:
    # Only the import is guarded, so an ImportError raised by anything else
    # further down is not mistaken for "package not installed".
    from wordcloud import STOPWORDS, WordCloud
except ImportError:
    WordCloud = None
    print("WordCloud package not available - install with: pip install wordcloud")

if WordCloud is not None:
    if df_clean is not None and 'title' in df_clean.columns:
        print("Creating word cloud from paper titles...")

        # Combine all titles into one text
        titles_text = ' '.join(df_clean['title'].dropna().astype(str))

        # Passing `stopwords` replaces wordcloud's built-in list. The notebook's
        # own set has no one- or two-letter words (the title analysis never
        # produces them), so merge both lists; otherwise words like "of" and
        # "in" would dominate the cloud.
        cloud_stopwords = set(STOPWORDS) | set(globals().get('stop_words', ()))

        # Create word cloud
        plt.figure(figsize=(12, 8))
        title_cloud = WordCloud(
            width=1200,
            height=600,
            background_color='white',
            max_words=100,
            colormap='viridis',
            stopwords=cloud_stopwords
        ).generate(titles_text)

        plt.imshow(title_cloud, interpolation='bilinear')
        plt.axis('off')
        plt.title('Word Cloud of Research Paper Titles', fontsize=16, fontweight='bold', pad=20)
        plt.tight_layout()
        plt.show()

        print("Word cloud created successfully!")

else:
    # Alternative simple word frequency visualization
    fallback_words = globals().get('top_words_for_viz')
    if fallback_words:
        plt.figure(figsize=(12, 6))
        words = list(fallback_words.keys())[:15]
        counts = list(fallback_words.values())[:15]

        plt.barh(range(len(words)), counts, color='skyblue', alpha=0.8)
        plt.yticks(range(len(words)), words)
        plt.title('Top 15 Words in Research Titles (Alternative to Word Cloud)',
                  fontsize=14, fontweight='bold')
        plt.xlabel('Frequency')
        plt.grid(True, alpha=0.3)
        plt.tight_layout()
        plt.show()

        print("Alternative word frequency chart created!")

## Part 9: Streamlit Application Development

In [None]:
# Create Streamlit application code
# `streamlit_app_code` holds the complete source of a standalone Streamlit app.
# The outer string is not a raw string, so the regex word boundaries inside it
# are written with doubled backslashes and come out single in the saved file.
streamlit_app_code = '''
import streamlit as st
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
from collections import Counter
import re

# Configure Streamlit page
st.set_page_config(
    page_title="CORD-19 Data Explorer",
    page_icon="🔬",
    layout="wide",
    initial_sidebar_state="expanded"
)

# Custom CSS for better styling
st.markdown("""
<style>
    .main-header {
        font-size: 3rem;
        color: #1f77b4;
        text-align: center;
        margin-bottom: 2rem;
    }
    .metric-card {
        background-color: #f0f2f6;
        padding: 1rem;
        border-radius: 10px;
        border-left: 4px solid #1f77b4;
    }
</style>
""", unsafe_allow_html=True)

@st.cache_data
def load_data():
    """Load, clean and cache the CORD-19 metadata; returns None on failure."""
    try:
        df = pd.read_csv('metadata.csv', low_memory=False)
        # Basic cleaning
        df = df.dropna(subset=['title'])

        raw_dates = df['publish_time']
        parsed_dates = pd.to_datetime(raw_dates, errors='coerce')
        # Retry values the first pass rejected as bare four-digit years, so
        # that a single inferred date format does not discard them.
        leftover = parsed_dates.isna() & raw_dates.notna()
        if leftover.any():
            year_only = pd.to_datetime(
                raw_dates[leftover].astype(str).str.strip(),
                format='%Y',
                errors='coerce'
            )
            parsed_dates = parsed_dates.fillna(year_only)
        df['publish_time'] = parsed_dates

        df['publication_year'] = df['publish_time'].dt.year
        df['journal'] = df['journal'].fillna('Unknown')
        df['abstract'] = df['abstract'].fillna('')
        df['abstract_word_count'] = df['abstract'].apply(lambda x: len(str(x).split()) if x else 0)
        return df
    except Exception as e:
        st.error(f"Error loading data: {e}")
        return None

def main():
    """Render the explorer: sidebar year filter, headline metrics and four tabs."""
    # Header
    st.markdown('<h1 class="main-header">CORD-19 Data Explorer</h1>', unsafe_allow_html=True)
    st.markdown("### Interactive exploration of COVID-19 research papers")
    
    # Load data
    with st.spinner("Loading CORD-19 dataset... This may take a moment."):
        df = load_data()
    
    if df is None:
        st.error("Failed to load data. Please ensure metadata.csv is available.")
        return
    
    # Sidebar filters
    st.sidebar.header("Filters & Options")
    
    # Year range filter (fall back to 2000-2024 when no date could be parsed)
    has_years = not df['publication_year'].isna().all()
    min_year = int(df['publication_year'].min()) if has_years else 2000
    max_year = int(df['publication_year'].max()) if has_years else 2024
    
    if min_year < max_year:
        # Clamp the preferred 2019-2022 default into the data's own range;
        # the slider rejects a default that lies outside its bounds.
        default_start = min(max(2019, min_year), max_year)
        default_end = max(min(2022, max_year), default_start)
        year_range = st.sidebar.slider(
            "Select Year Range",
            min_value=min_year,
            max_value=max_year,
            value=(default_start, default_end),
            step=1
        )
    else:
        # A single year leaves nothing to select, so skip the slider.
        year_range = (min_year, max_year)
        st.sidebar.info(f"All dated papers are from {min_year}")
    
    # Filter data by year
    filtered_df = df[
        (df['publication_year'] >= year_range[0]) & 
        (df['publication_year'] <= year_range[1])
    ].copy()
    
    # Display metrics
    col1, col2, col3, col4 = st.columns(4)
    
    with col1:
        st.metric("Total Papers", f"{len(filtered_df):,}")
    
    with col2:
        unique_journals = filtered_df['journal'].nunique()
        st.metric("Unique Journals", f"{unique_journals:,}")
    
    with col3:
        avg_abstract_length = filtered_df['abstract_word_count'].mean()
        st.metric("Avg Abstract Length", f"{avg_abstract_length:.0f} words")
    
    with col4:
        year_span = year_range[1] - year_range[0] + 1
        st.metric("Years Covered", f"{year_span}")
    
    # Main content tabs
    tab1, tab2, tab3, tab4 = st.tabs(["Trends", "Journals", "Text Analysis", "Data Sample"])
    
    with tab1:
        st.header("Publication Trends Over Time")
        
        # Publications by year
        yearly_counts = filtered_df['publication_year'].value_counts().sort_index()
        
        col1, col2 = st.columns(2)
        
        with col1:
            fig, ax = plt.subplots(figsize=(10, 6))
            ax.bar(yearly_counts.index, yearly_counts.values, color='steelblue', alpha=0.7)
            ax.set_title('Publications by Year')
            ax.set_xlabel('Year')
            ax.set_ylabel('Number of Papers')
            ax.tick_params(axis='x', labelrotation=45)
            ax.grid(True, alpha=0.3)
            st.pyplot(fig)
            # Release the figure; the script reruns on every interaction and
            # unclosed figures would otherwise pile up in memory.
            plt.close(fig)
        
        with col2:
            # Monthly distribution for selected years
            if len(filtered_df) > 0:
                monthly_data = filtered_df.copy()
                monthly_data['month'] = monthly_data['publish_time'].dt.month
                monthly_counts = monthly_data['month'].value_counts().sort_index()
                
                fig, ax = plt.subplots(figsize=(10, 6))
                month_names = ['Jan', 'Feb', 'Mar', 'Apr', 'May', 'Jun',
                              'Jul', 'Aug', 'Sep', 'Oct', 'Nov', 'Dec']
                ax.bar(monthly_counts.index, monthly_counts.values, color='orange', alpha=0.7)
                ax.set_title('Publications by Month')
                ax.set_xlabel('Month')
                ax.set_ylabel('Number of Papers')
                ax.set_xticks(range(1, 13))
                ax.set_xticklabels(month_names)
                ax.grid(True, alpha=0.3)
                st.pyplot(fig)
                plt.close(fig)
    
    with tab2:
        st.header("Journal Analysis")
        
        # Count once and reuse for the chart, the statistics and the table
        journal_counts = filtered_df['journal'].value_counts()
        top_journals = journal_counts.head(15)
        
        col1, col2 = st.columns(2)
        
        with col1:
            st.subheader("Top Publishing Journals")
            fig, ax = plt.subplots(figsize=(10, 8))
            ax.barh(range(len(top_journals)), top_journals.values, color='forestgreen', alpha=0.7)
            ax.set_yticks(range(len(top_journals)))
            ax.set_yticklabels([j[:30] + '...' if len(j) > 30 else j for j in top_journals.index])
            ax.set_xlabel('Number of Papers')
            ax.set_title('Top 15 Journals')
            ax.grid(True, alpha=0.3)
            st.pyplot(fig)
            plt.close(fig)
        
        with col2:
            st.subheader("Journal Statistics")
            st.write(f"**Total unique journals:** {filtered_df['journal'].nunique():,}")
            st.write(f"**Journals with only 1 paper:** {(journal_counts == 1).sum():,}")
            st.write(f"**Journals with >10 papers:** {(journal_counts > 10).sum():,}")
            
            # Show top journals table
            st.subheader("Top 10 Journals Table")
            top_10 = journal_counts.head(10).reset_index()
            top_10.columns = ['Journal', 'Papers']
            top_10['Percentage'] = (top_10['Papers'] / len(filtered_df) * 100).round(1)
            st.dataframe(top_10, use_container_width=True)
    
    with tab3:
        st.header("Text Analysis")
        
        # Word frequency analysis
        if len(filtered_df) > 0:
            # Extract words from titles
            all_titles = ' '.join(filtered_df['title'].dropna().astype(str))
            words = re.findall(r'\\b[a-zA-Z]{3,}\\b', all_titles.lower())
            
            # Filter stop words
            stop_words = {'the', 'and', 'for', 'are', 'with', 'from', 'this', 'that', 
                         'study', 'analysis', 'research', 'using', 'based', 'new'}
            filtered_words = [word for word in words if word not in stop_words]
            
            word_counts = Counter(filtered_words)
            top_words = dict(word_counts.most_common(20))
            
            col1, col2 = st.columns(2)
            
            with col1:
                st.subheader("Most Frequent Words in Titles")
                fig, ax = plt.subplots(figsize=(10, 8))
                words_list = list(top_words.keys())[:15]
                counts_list = list(top_words.values())[:15]
                
                ax.barh(range(len(words_list)), counts_list, color='coral', alpha=0.7)
                ax.set_yticks(range(len(words_list)))
                ax.set_yticklabels(words_list)
                ax.set_xlabel('Frequency')
                ax.set_title('Top 15 Words in Titles')
                ax.grid(True, alpha=0.3)
                st.pyplot(fig)
                plt.close(fig)
            
            with col2:
                st.subheader("Abstract Length Distribution")
                fig, ax = plt.subplots(figsize=(10, 6))
                # Separate name: word_counts above is the title-word Counter
                abstract_lengths = filtered_df[filtered_df['abstract_word_count'] <= 500]['abstract_word_count']
                ax.hist(abstract_lengths, bins=30, color='purple', alpha=0.7, edgecolor='black')
                ax.axvline(abstract_lengths.mean(), color='red', linestyle='--', linewidth=2,
                          label=f'Mean: {abstract_lengths.mean():.0f} words')
                ax.set_xlabel('Number of Words')
                ax.set_ylabel('Number of Papers')
                ax.set_title('Abstract Length Distribution')
                ax.legend()
                ax.grid(True, alpha=0.3)
                st.pyplot(fig)
                plt.close(fig)
    
    with tab4:
        st.header("Data Sample")
        
        # Display options
        sample_size = st.slider("Sample size to display", 5, 50, 10)
        
        # Show random sample
        if len(filtered_df) > 0:
            sample_df = filtered_df.sample(n=min(sample_size, len(filtered_df)))
            display_columns = ['title', 'journal', 'publication_year', 'abstract_word_count']
            available_columns = [col for col in display_columns if col in sample_df.columns]
            
            st.subheader(f"Random Sample of {len(sample_df)} Papers")
            st.dataframe(sample_df[available_columns], use_container_width=True)
            
            # Download option
            csv = sample_df.to_csv(index=False)
            st.download_button(
                label="Download Sample as CSV",
                data=csv,
                file_name=f"cord19_sample_{sample_size}.csv",
                mime="text/csv"
            )

if __name__ == "__main__":
    main()
'''

# Save the Streamlit app to a file
# The app source contains a non-ASCII character (the page-icon emoji), so the
# encoding is pinned to UTF-8. Relying on the platform default makes the write
# fail with UnicodeEncodeError on systems whose default is not UTF-8, and
# Python reads source files as UTF-8 anyway.
with open('streamlit_app.py', 'w', encoding='utf-8') as f:
    f.write(streamlit_app_code)

print("Streamlit application code created!")
print("File saved as: streamlit_app.py")
print("To run the app, use: streamlit run streamlit_app.py")

## Part 10: Summary and Key Findings

In [None]:
# Summary of findings and project completion
# Prints a recap of the dataset sizes and headline findings. The closing
# banner reports success only when the cleaned dataset actually exists.
print("=" * 70)
print("PROJECT SUMMARY - CORD-19 DATA ANALYSIS")
print("=" * 70)

analysis_succeeded = df_clean is not None

if analysis_succeeded:
    print("\nCOMPLETED TASKS:")
    print("   ✓ Part 1: Data Loading and Basic Exploration")
    print("   ✓ Part 2: Data Cleaning and Preparation")
    print("   ✓ Part 3: Data Analysis and Visualization")
    print("   ✓ Part 4: Streamlit Application Development")
    print("   ✓ Part 5: Documentation and Insights")

    print(f"\nDATASET OVERVIEW:")
    print(f"   • Original records: {len(df):,}")
    print(f"   • After cleaning: {len(df_clean):,}")
    print(f"   • Data reduction: {((len(df) - len(df_clean)) / len(df) * 100):.1f}%")

    if 'publication_year' in df_clean.columns:
        valid_years = df_clean['publication_year'].between(2000, 2024, na=False)
        covid_era = df_clean[df_clean['publication_year'].between(2020, 2022, na=False)]

        print(f"\nKEY FINDINGS:")
        print(f"   • Papers with valid publication years: {valid_years.sum():,}")
        print(f"   • COVID-19 era papers (2020-2022): {len(covid_era):,}")

        if len(covid_era) > 0:
            # Count once; the peak year and its size come from the same Series
            era_year_counts = covid_era['publication_year'].value_counts()
            peak_year = era_year_counts.idxmax()
            peak_count = era_year_counts.max()
            print(f"   • Peak COVID research year: {int(peak_year)} ({peak_count:,} papers)")

    if 'journal_clean' in df_clean.columns:
        journal_counts = df_clean['journal_clean'].value_counts()
        total_journals = df_clean['journal_clean'].nunique()

        print(f"   • Total unique journals: {total_journals:,}")
        # value_counts() is sorted descending, so the first entry is the top
        # journal; skip the line when the cleaned table has no rows.
        if not journal_counts.empty:
            top_journal = journal_counts.index[0]
            top_count = journal_counts.iloc[0]
            print(f"   • Top journal: {top_journal} ({top_count:,} papers)")

    if 'abstract_word_count' in df_clean.columns:
        avg_abstract = df_clean['abstract_word_count'].mean()
        print(f"   • Average abstract length: {avg_abstract:.0f} words")

    print(f"\nDELIVERABLES CREATED:")
    print(f"   • Jupyter notebook: cord19_analysis.ipynb")
    print(f"   • Streamlit app: streamlit_app.py")
    print(f"   • Multiple visualizations and insights")

    print(f"\nNEXT STEPS:")
    print(f"   1. Run this notebook to see all visualizations")
    print(f"   2. Launch Streamlit app: streamlit run streamlit_app.py")
    print(f"   3. Explore interactive features in the web application")

else:
    print("Analysis incomplete - data loading failed")

print("\n" + "=" * 70)
# Do not claim success when the data never loaded.
if analysis_succeeded:
    print("ASSIGNMENT COMPLETED SUCCESSFULLY!")
else:
    print("ASSIGNMENT INCOMPLETE - fix the data loading error and re-run")
print("=" * 70)