In [3]:
import pandas as pd

# Absolute path on the local filesystem to the ICD-10 code list (CSV).
# Despite the variable name, this is a local file, not a remote URL.
url = "/Users/macminiadi/Documents/Programs/VSCode/Healthcare/data/icd10codes.csv"

# Parse the file with pandas' default comma separator.
df_ICD10Codes = pd.read_csv(filepath_or_buffer=url)

# Render the first five rows as the cell output to sanity-check the load
# before continuing with the analysis.
# NOTE(review): the recorded preview shows an extra "Unnamed: 0" column next to
# icd10_codes/description; it looks like a saved row index. Confirm whether it
# should be dropped before analysis.
df_ICD10Codes.head(n=5)

Unnamed: 0,icd10_codes,description
0,A000,"Cholera due to Vibrio cholerae 01, biovar chol..."
1,A001,"Cholera due to Vibrio cholerae 01, biovar eltor"
2,A009,"Cholera, unspecified"
3,A0100,"Typhoid fever, unspecified"
4,A0101,Typhoid meningitis


In [3]:
# ICD-10 data-quality report: prints an overview of `df` and runs basic
# integrity checks (missing values, duplicate rows, code uniqueness, code length).
# NOTE(review): `df` is not created in the cells visible here; presumably it is
# derived from `df_ICD10Codes` in another cell. Confirm it is defined on a
# fresh "Restart Kernel -> Run All".

# Upper bound on ICD code length used by the length check in section 5.
MAX_ICD10_CODE_LENGTH = 7

# ----------------------------
# 1. Basic Info & Overview
# ----------------------------
print("📊 Dataset shape:", df.shape)
print("\n🧾 Columns:\n", df.columns)
print("\n📈 Data types:\n", df.dtypes)

# ----------------------------
# 2. Null or Missing Values
# ----------------------------
# Runs before any string normalization so genuinely missing values are still
# visible as nulls here.
print("\n❓ Missing values per column:\n", df.isnull().sum())

# Optional: Show rows with missing data
missing_rows = df[df.isnull().any(axis=1)]
print("\n⚠️ Rows with missing data:\n", missing_rows)

# ----------------------------
# 3. Duplicates
# ----------------------------
duplicates = df.duplicated().sum()
print(f"\n📎 Duplicate rows: {duplicates}")

# ----------------------------
# 4. Uniqueness & Key Checks
# 5. Value Length/Format Checks
# ----------------------------
# Every check that touches the code column sits behind the same guard, so a
# frame without that column produces a message instead of a KeyError.
if 'icd10_codes' in df.columns:
    # Normalize first: otherwise "A000" and "A000 " would count as two
    # distinct codes and whitespace-only duplicates would go unnoticed.
    # The column is updated in place, as before, so later cells see the
    # stripped string codes.
    df['icd10_codes'] = df['icd10_codes'].astype(str).str.strip()

    unique_codes = df['icd10_codes'].nunique()
    print(f"\n🔑 Unique ICD Codes: {unique_codes}")
    if unique_codes < len(df):
        print("⚠️ Duplicate ICD codes found!")

    # Example: Ensure all ICD codes are <= MAX_ICD10_CODE_LENGTH characters
    invalid_length = df[df['icd10_codes'].str.len() > MAX_ICD10_CODE_LENGTH]
    print(f"\n🔍 ICD codes > {MAX_ICD10_CODE_LENGTH} characters: {len(invalid_length)}")
else:
    print("\n⚠️ Column 'icd10_codes' not found; skipping ICD code checks.")

📊 Dataset shape: (72184, 2)

🧾 Columns:
 Index(['icd10_codes', 'description'], dtype='object')

📈 Data types:
 icd10_codes    object
description    object
dtype: object

❓ Missing values per column:
 icd10_codes    0
description    0
dtype: int64

⚠️ Rows with missing data:
 Empty DataFrame
Columns: [icd10_codes, description]
Index: []

📎 Duplicate rows: 0

🔑 Unique ICD Codes: 72184

🔍 ICD codes > 7 characters: 0
