In [3]:
import pandas as pd
import numpy as np

# Source file for the analysis; the relative path is resolved against the
# kernel's current working directory.
DATA_PATH = 'data/publications.csv'
df = pd.read_csv(DATA_PATH)

In [4]:
# Overview of the loaded frame: dimensions, year coverage and the country list.
n_rows, n_cols = df.shape
first_year, last_year = df['year'].min(), df['year'].max()

# Drop missing names before sorting: a NaN (float) mixed with str values makes
# sorted() / str.join() raise TypeError. nunique() already ignores NaN, so the
# printed count and the printed list stay consistent.
country_names = sorted(df['Name'].dropna().unique())

print("\nDATASET STRUCTURE:")
print(f"Shape: {n_rows} rows × {n_cols} columns")
# The year count is the inclusive span between the first and last year; it does
# not verify that every year in between is actually present.
print(f"Time Period: {first_year} - {last_year} ({last_year - first_year + 1} years)")
print(f"Unique Countries: {df['Name'].nunique()}")
# NOTE(review): the recorded output lists both ENGLAND and UNITED KINGDOM —
# confirm whether these entities overlap before aggregating across countries.
print(f"\nCountries: {', '.join(country_names)}")



DATASET STRUCTURE:
Shape: 1000 rows × 12 columns
Time Period: 2003 - 2025 (23 years)
Unique Countries: 17

Countries: AUSTRALIA, BRAZIL, CANADA, CHINA, ENGLAND, FRANCE, GERMANY, INDIA, ITALY, JAPAN, NETHERLANDS, SOUTH KOREA, SPAIN, SWEDEN, SWITZERLAND, UNITED KINGDOM, USA


In [7]:

# Headline indicators computed over every row of `df`, followed by a basic
# data-quality check (missing values and fully duplicated rows).
print("KEY PERFORMANCE METRICS:")

total_documents = df['Web of Science Documents'].sum()
total_citations = df['Times Cited'].sum()
cnci = df['Category Normalized Citation Impact']
above_baseline = cnci > 1.0  # boolean mask, evaluated once and reused below

metrics = {
    'Total Research Output': total_documents,
    'Total Citations': total_citations,
    # Pooled ratio (sum of citations / sum of documents), not the mean of
    # per-row ratios.
    'Avg Citations per Paper': total_citations / total_documents,
    # Unweighted mean across rows.
    'Avg CNCI (baseline=1.0)': cnci.mean(),
    'Records with CNCI > 1.0': f"{above_baseline.sum()} ({above_baseline.mean() * 100:.1f}%)",
    'Avg % Papers in Top 10%': df['% Documents in Top 10%'].mean(),
    'Avg % Papers in Top 1%': df['% Documents in Top 1%'].mean(),
}

for key, value in metrics.items():
    # pandas reductions return NumPy scalars. np.int64 is NOT a subclass of the
    # built-in int, so integers are matched explicitly and printed without
    # decimals instead of relying on the string fallback.
    if isinstance(value, (int, np.integer)):
        print(f"{key:.<40} {int(value):>15d}")
    elif isinstance(value, (float, np.floating)):
        print(f"{key:.<40} {value:>15.2f}")
    else:
        print(f"{key:.<40} {str(value):>15}")
print("DATA QUALITY:")

# Count nulls per column once; the total and the per-column report share it.
missing_by_column = df.isnull().sum()
missing_total = missing_by_column.sum()
if missing_total == 0:
    print("Missing Values: 0 (No missing values detected)")
else:
    print("Missing Values by Column:")
    print(missing_by_column)

# duplicated() flags rows that repeat an earlier row across all columns.
print(f"Duplicates: {df.duplicated().sum()}")


KEY PERFORMANCE METRICS:
Total Research Output...................        14861699
Total Citations.........................      1296496955
Avg Citations per Paper.................           87.24
Avg CNCI (baseline=1.0).................            1.29
Records with CNCI > 1.0.................     855 (85.5%)
Avg % Papers in Top 10%.................           17.59
Avg % Papers in Top 1%..................            1.77
DATA QUALITY:
Missing Values: 0 (No missing values detected)
Duplicates: 0
