In [1]:
# Load the data and understand its basic structure
import pandas as pd

# Load the dataset
df = pd.read_csv('../data/raw/Extracted_data.csv')

# Basic inspection
print("Dataset shape (rows, columns):", df.shape)
print("\nColumn names:")
print(df.columns.tolist())

Dataset shape (rows, columns): (228, 25)

Column names:
['Covidence #', 'Study ID', 'Title', 'Reviewer Name', 'Year of publication', 'Journal name', 'First author affiliation', 'Study design', 'Study setting', 'Urban–Rural', 'Region', 'Population focus', 'Sample size', 'Funding sources', 'Ethics approval reported', 'Verbatim text of limitations', '-- SAMPLING & DESIGN --', '-- MEASUREMENT & DATA --', '-- CONTEXT & LOGISTICS --', '-- ANALYSIS & GENERALIZABILITY --', '-- RESEARCH CAPACITY --', 'Number of limitations reported', 'Journal type ', 'Multi-site study ', 'Topic area']


In [2]:
# Check for missing data
print("=== MISSING DATA SUMMARY ===")
missing_data = df.isnull().sum()
missing_percent = (missing_data / len(df)) * 100

# Show only columns with missing data
missing_summary = pd.DataFrame({
    'Missing Count': missing_data,
    'Missing Percentage': missing_percent
})

# Display columns with any missing values
missing_summary[missing_summary['Missing Count'] > 0]

=== MISSING DATA SUMMARY ===


Unnamed: 0,Missing Count,Missing Percentage
Journal name,1,0.438596
Urban–Rural,1,0.438596
Population focus,1,0.438596
Sample size,1,0.438596
Funding sources,47,20.614035
-- SAMPLING & DESIGN --,38,16.666667
-- MEASUREMENT & DATA --,90,39.473684
-- CONTEXT & LOGISTICS --,174,76.315789
-- ANALYSIS & GENERALIZABILITY --,90,39.473684
-- RESEARCH CAPACITY --,221,96.929825


In [3]:
# Understand the nature of missing data
print("=== SAMPLE OF ROWS WITH MISSING CRITICAL DATA ===")

# Check rows with missing critical fields
critical_missing = df[df['Journal name'].isnull() | 
                     df['Population focus'].isnull() |
                     df['Sample size'].isnull()]

print("Rows with missing critical data:", len(critical_missing))
if len(critical_missing) > 0:
    display(critical_missing[['Study ID', 'Journal name', 'Population focus', 'Sample size']])

=== SAMPLE OF ROWS WITH MISSING CRITICAL DATA ===
Rows with missing critical data: 3


Unnamed: 0,Study ID,Journal name,Population focus,Sample size
13,Ejeliogu 2020,,Mixed child populations,194.0
117,Ikeanyi 2015,Nigerian Journal of Clinical Practice,,3442.0
227,Ogu 2023,BMC Health Services Research,Pregnant women; Mothers (postnatal); Health wo...,


In [4]:
# This is what you SHOULD document in your notebook:
print("DATA CLEANING STEP: Critical missing values identified")
print("Action Required: Manual verification in Covidence for studies:")
print(" - Study ID 13: Missing Journal name")
print(" - Study ID 117: Missing Population focus") 
print(" - Study ID 227: Missing Sample size")
print("Next: Update data in Covidence, re-export CSV, then reload data for analysis")

DATA CLEANING STEP: Critical missing values identified
Action Required: Manual verification in Covidence for studies:
 - Study ID 13: Missing Journal name
 - Study ID 117: Missing Population focus
 - Study ID 227: Missing Sample size
Next: Update data in Covidence, re-export CSV, then reload data for analysis
