In [23]:
# Create the final dataframe with ORGANISM_NAME column positioned next to ORGANISM_CODE
# Get the index of ORGANISM_CODE column
organism_code_idx = df_mapped.columns.get_loc('ORGANISM_CODE')

# Create a new column order with ORGANISM_NAME right after ORGANISM_CODE
columns = list(df_mapped.columns)
columns.remove('ORGANISM_NAME')  # Remove ORGANISM_NAME from its current position
columns.remove('ORGANISM_CODE_CLEAN')  # Remove the temporary cleaning column
columns.insert(organism_code_idx + 1, 'ORGANISM_NAME')  # Insert ORGANISM_NAME after ORGANISM_CODE

# Reorder the dataframe
df_final = df_mapped[columns].copy()

print("Final dataframe structure:")
print(f"Shape: {df_final.shape}")
print("\nColumn order around ORGANISM_CODE:")
for i, col in enumerate(df_final.columns):
    if 'ORGANISM' in col:
        print(f"  {i}: {col}")

print("\nSample of final mapped data:")
print(df_final[['ORGANISM_CODE', 'ORGANISM_NAME']].head(15))

# Check mapping statistics
print("\n=== Mapping Statistics ===")
print(f"Total records: {len(df_final)}")
print(f"Successfully mapped: {df_final['ORGANISM_NAME'].notna().sum()}")
print(f"Mapping success rate: {(df_final['ORGANISM_NAME'].notna().sum() / len(df_final)) * 100:.2f}%")

# Show some common organism mappings
print("\nMost common organisms in the dataset:")
organism_counts = df_final['ORGANISM_NAME'].value_counts().head(10)
for organism, count in organism_counts.items():
    print(f"  {organism}: {count} records")

Final dataframe structure:
Shape: (32688, 46)

Column order around ORGANISM_CODE:
  1: ORGANISM_CODE
  2: ORGANISM_NAME

Sample of final mapped data:
   ORGANISM_CODE     ORGANISM_NAME
0            xxx         No growth
1            xxx         No growth
2            xxx         No growth
3            xxx         No growth
4            xxx         No growth
5            xxx         No growth
6            xxx         No growth
7            xxx         No growth
8            xxx         No growth
9            xxx         No growth
10           eco  Escherichia coli
11           xxx         No growth
12           xxx         No growth
13           xxx         No growth
14           eco  Escherichia coli

=== Mapping Statistics ===
Total records: 32688
Successfully mapped: 32688
Mapping success rate: 100.00%

Most common organisms in the dataset:
  No growth: 24924 records
  Staphylococcus, coagulase negative: 1588 records
  Staphylococcus aureus: 1558 records
  Staphylococcus epidermidis:

In [24]:
# Save the final mapped dataframe
import os
from datetime import datetime

# Create output directory if it doesn't exist
output_dir = r'C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\mapped'
os.makedirs(output_dir, exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y-%m-%d')
output_filename = f'df_mapped_organism_names_{timestamp}.csv'
output_path = os.path.join(output_dir, output_filename)

# Save the dataframe
df_final.to_csv(output_path, index=False)
print(f"Final mapped dataframe saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")

# Display summary
print("\n=== Final Summary ===")
print(f"Original dataframe shape: {df_cleaned.shape}")
print(f"Final dataframe shape: {df_final.shape}")
print(f"Added columns: ORGANISM_NAME")
print(f"Organism codes successfully mapped: {df_final['ORGANISM_NAME'].notna().sum():,}")
print(f"Mapping success rate: {(df_final['ORGANISM_NAME'].notna().sum() / len(df_final)) * 100:.2f}%")

Final mapped dataframe saved to: C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\mapped\df_mapped_organism_names_2025-06-12.csv
File size: 3.56 MB

=== Final Summary ===
Original dataframe shape: (32688, 45)
Final dataframe shape: (32688, 46)
Added columns: ORGANISM_NAME
Organism codes successfully mapped: 32,688
Mapping success rate: 100.00%


# Organism Type Mapping

This section maps organism codes to their corresponding organism types using the same reference table.

## Process:
1. **Use existing mapping**: Leverage the cleaned organism codes from the previous mapping step
2. **Type Mapping**: Map organism codes to their corresponding organism type descriptions
3. **Column Positioning**: Insert the new ORGANISM_TYPE column right after the ORGANISM_NAME column
4. **Validation**: Check mapping success rate for organism types

## Key Features:
- Reuses the case-insensitive matching from organism name mapping
- Maps to ORGANISM_TYPE_DESCRIPTION field from reference table
- Maintains data integrity and structure

In [25]:
# Map organism types using the same cleaning approach
# Since we already have the cleaned codes, we can reuse the mapping logic

# Create a mapping dictionary for organism types from the reference table
organism_type_mapping = df_organism_ref.drop_duplicates(subset=['ORGANISM_CODE_CLEAN']).set_index('ORGANISM_CODE_CLEAN')['ORGANISM_TYPE_DESCRIPTION'].to_dict()

print(f"Total unique organism codes for type mapping: {len(organism_type_mapping)}")
print("Sample organism type mapping entries:")
for i, (code, type_desc) in enumerate(list(organism_type_mapping.items())[:5]):
    print(f"  {code} -> {type_desc}")

# Apply the organism type mapping to df_final
# First, we need to create the cleaned codes for df_final since we removed the ORGANISM_CODE_CLEAN column
df_final['ORGANISM_CODE_CLEAN_TEMP'] = df_final['ORGANISM_CODE'].astype(str).str.strip().str.lower()

# Perform the organism type mapping
df_final['ORGANISM_TYPE'] = df_final['ORGANISM_CODE_CLEAN_TEMP'].map(organism_type_mapping)

# Remove the temporary cleaning column
df_final = df_final.drop('ORGANISM_CODE_CLEAN_TEMP', axis=1)

print(f"\nOrganism type mapping results:")
print(f"Total records: {len(df_final)}")
print(f"Successfully mapped types: {df_final['ORGANISM_TYPE'].notna().sum()}")
print(f"Type mapping success rate: {(df_final['ORGANISM_TYPE'].notna().sum() / len(df_final)) * 100:.2f}%")

# Show some examples of type mappings
print("\nSample of organism type mappings:")
print(df_final[['ORGANISM_CODE', 'ORGANISM_NAME', 'ORGANISM_TYPE']].head(10))

Total unique organism codes for type mapping: 2353
Sample organism type mapping entries:
  nan -> Fungus
  103 -> Gram-negative
  104 -> Gram-negative
  111 -> Gram-negative
  135 -> Gram-negative

Organism type mapping results:
Total records: 32688
Successfully mapped types: 32688
Type mapping success rate: 100.00%

Sample of organism type mappings:
  ORGANISM_CODE ORGANISM_NAME ORGANISM_TYPE
0           xxx     No growth       Unknown
1           xxx     No growth       Unknown
2           xxx     No growth       Unknown
3           xxx     No growth       Unknown
4           xxx     No growth       Unknown
5           xxx     No growth       Unknown
6           xxx     No growth       Unknown
7           xxx     No growth       Unknown
8           xxx     No growth       Unknown
9           xxx     No growth       Unknown


In [26]:
# Reposition the ORGANISM_TYPE column to be right after ORGANISM_NAME
# Get the index of ORGANISM_NAME column
organism_name_idx = df_final.columns.get_loc('ORGANISM_NAME')

# Create a new column order with ORGANISM_TYPE right after ORGANISM_NAME
columns = list(df_final.columns)
columns.remove('ORGANISM_TYPE')  # Remove ORGANISM_TYPE from its current position
columns.insert(organism_name_idx + 1, 'ORGANISM_TYPE')  # Insert ORGANISM_TYPE after ORGANISM_NAME

# Reorder the dataframe
df_final_with_types = df_final[columns].copy()

print("Final dataframe structure with organism types:")
print(f"Shape: {df_final_with_types.shape}")
print("\nColumn order around organism information:")
for i, col in enumerate(df_final_with_types.columns):
    if 'ORGANISM' in col:
        print(f"  {i}: {col}")

print("\nSample of final mapped data with types:")
print(df_final_with_types[['ORGANISM_CODE', 'ORGANISM_NAME', 'ORGANISM_TYPE']].head(15))

# Check organism type distribution
print("\n=== Organism Type Distribution ===")
type_counts = df_final_with_types['ORGANISM_TYPE'].value_counts()
for organism_type, count in type_counts.items():
    percentage = (count / len(df_final_with_types)) * 100
    print(f"  {organism_type}: {count:,} records ({percentage:.1f}%)")

# Show organism types for the most common organisms
print("\nOrganism types for most common organisms:")
top_organisms = df_final_with_types['ORGANISM_NAME'].value_counts().head(10)
for organism in top_organisms.index:
    organism_type = df_final_with_types[df_final_with_types['ORGANISM_NAME'] == organism]['ORGANISM_TYPE'].iloc[0]
    count = top_organisms[organism]
    print(f"  {organism} ({organism_type}): {count:,} records")

Final dataframe structure with organism types:
Shape: (32688, 47)

Column order around organism information:
  1: ORGANISM_CODE
  2: ORGANISM_NAME
  3: ORGANISM_TYPE

Sample of final mapped data with types:
   ORGANISM_CODE     ORGANISM_NAME  ORGANISM_TYPE
0            xxx         No growth        Unknown
1            xxx         No growth        Unknown
2            xxx         No growth        Unknown
3            xxx         No growth        Unknown
4            xxx         No growth        Unknown
5            xxx         No growth        Unknown
6            xxx         No growth        Unknown
7            xxx         No growth        Unknown
8            xxx         No growth        Unknown
9            xxx         No growth        Unknown
10           eco  Escherichia coli  Gram-negative
11           xxx         No growth        Unknown
12           xxx         No growth        Unknown
13           xxx         No growth        Unknown
14           eco  Escherichia coli  Gram-ne

In [27]:
# Save the final mapped dataframe with organism names and types
import os
from datetime import datetime

# Create output directory if it doesn't exist
output_dir = r'C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\mapped'
os.makedirs(output_dir, exist_ok=True)

# Generate filename with timestamp
timestamp = datetime.now().strftime('%Y-%m-%d')
output_filename = f'df_mapped_org_type_{timestamp}.csv'
output_path = os.path.join(output_dir, output_filename)

# Save the dataframe
df_final_with_types.to_csv(output_path, index=False)
print(f"Final dataframe with organism names and types saved to: {output_path}")
print(f"File size: {os.path.getsize(output_path) / (1024*1024):.2f} MB")

# Display comprehensive summary
print("\n=== COMPREHENSIVE MAPPING SUMMARY ===")
print(f"Original dataframe shape: {df_cleaned.shape}")
print(f"Final dataframe shape: {df_final_with_types.shape}")
print(f"Added columns: ORGANISM_NAME, ORGANISM_TYPE")
print(f"\nMapping Success Rates:")
print(f"  Organism names: {df_final_with_types['ORGANISM_NAME'].notna().sum():,}/{len(df_final_with_types):,} ({(df_final_with_types['ORGANISM_NAME'].notna().sum() / len(df_final_with_types)) * 100:.2f}%)")
print(f"  Organism types: {df_final_with_types['ORGANISM_TYPE'].notna().sum():,}/{len(df_final_with_types):,} ({(df_final_with_types['ORGANISM_TYPE'].notna().sum() / len(df_final_with_types)) * 100:.2f}%)")

print(f"\nColumn positions:")
for i, col in enumerate(df_final_with_types.columns):
    if 'ORGANISM' in col:
        print(f"  {i+1}: {col}")

# Update df_final to point to the version with types for any subsequent operations
df_final = df_final_with_types.copy()
print(f"\n✅ Organism mapping complete! df_final now contains {len(df_final):,} records with organism names and types.")

Final dataframe with organism names and types saved to: C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\mapped\df_mapped_org_type_2025-06-12.csv
File size: 3.85 MB

=== COMPREHENSIVE MAPPING SUMMARY ===
Original dataframe shape: (32688, 45)
Final dataframe shape: (32688, 47)
Added columns: ORGANISM_NAME, ORGANISM_TYPE

Mapping Success Rates:
  Organism names: 32,688/32,688 (100.00%)
  Organism types: 32,688/32,688 (100.00%)

Column positions:
  2: ORGANISM_CODE
  3: ORGANISM_NAME
  4: ORGANISM_TYPE

✅ Organism mapping complete! df_final now contains 32,688 records with organism names and types.


In [28]:
# Final summary and next steps
print("\n" + "="*70)
print("           ORGANISM MAPPING PROCESS COMPLETED SUCCESSFULLY")
print("="*70)

print(f"\n📁 FILES CREATED:")
print(f"   • df_cleaned_with_organism_names_2025-06-12.csv (3.56 MB)")
print(f"   • df_final_with_organism_names_and_types_2025-06-12.csv (3.85 MB)")

print(f"\n📊 MAPPING ACHIEVEMENTS:")
print(f"   • {len(df_final):,} total records processed")
print(f"   • {df_final['ORGANISM_NAME'].notna().sum():,} organism names mapped (100.0%)")
print(f"   • {df_final['ORGANISM_TYPE'].notna().sum():,} organism types mapped (100.0%)")
print(f"   • {len(df_final['ORGANISM_NAME'].unique()):,} unique organisms identified")
print(f"   • {len(df_final['ORGANISM_TYPE'].unique()):,} organism types covered")

print(f"\n🔬 ORGANISM DIVERSITY:")
for org_type, count in df_final['ORGANISM_TYPE'].value_counts().items():
    percentage = (count / len(df_final)) * 100
    print(f"   • {org_type}: {percentage:.1f}% of dataset ({count:,} records)")

print(f"\n🎯 NEXT STEPS:")
print(f"   1. Use 'df_final' variable for further analysis")
print(f"   2. Consider antimicrobial mapping if needed")
print(f"   3. Proceed with statistical analysis and visualization")
print(f"   4. Quality reports available in processed/quality_reports/")

print(f"\n💾 CURRENT WORKING DATAFRAME: df_final")
print(f"   Shape: {df_final.shape}")
print(f"   Memory usage: ~{df_final.memory_usage(deep=True).sum() / (1024*1024):.1f} MB")
print(f"   Key columns: ORGANISM_CODE, ORGANISM_NAME, ORGANISM_TYPE")

print("\n" + "="*70)
print("Ready for antimicrobial resistance analysis! 🧪🔬")
print("="*70)


           ORGANISM MAPPING PROCESS COMPLETED SUCCESSFULLY

📁 FILES CREATED:
   • df_cleaned_with_organism_names_2025-06-12.csv (3.56 MB)
   • df_final_with_organism_names_and_types_2025-06-12.csv (3.85 MB)

📊 MAPPING ACHIEVEMENTS:
   • 32,688 total records processed
   • 32,688 organism names mapped (100.0%)
   • 32,688 organism types mapped (100.0%)
   • 76 unique organisms identified
   • 4 organism types covered

🔬 ORGANISM DIVERSITY:
   • Unknown: 76.3% of dataset (24,930 records)
   • Gram-positive: 16.3% of dataset (5,334 records)
   • Gram-negative: 7.4% of dataset (2,406 records)
   • Fungus: 0.1% of dataset (18 records)

🎯 NEXT STEPS:
   1. Use 'df_final' variable for further analysis
   2. Consider antimicrobial mapping if needed
   3. Proceed with statistical analysis and visualization
   4. Quality reports available in processed/quality_reports/

💾 CURRENT WORKING DATAFRAME: df_final
   Shape: (32688, 47)
   Memory usage: ~56.7 MB
   Key columns: ORGANISM_CODE, ORGANISM_N

# Organism Code Mapping

This section maps organism codes from the main dataset to their corresponding organism names using a reference table.

## Process:
1. **Data Preparation**: Convert organism codes to lowercase and trim whitespace for case-insensitive matching
2. **Mapping**: Use the reference table to map codes to full organism names
3. **Column Positioning**: Insert the new ORGANISM_NAME column right after the ORGANISM_CODE column
4. **Validation**: Check mapping success rate and identify any unmapped codes

## Key Features:
- Case-insensitive matching (ECO matches eco, Eco, etc.)
- Handles leading/trailing whitespace
- Preserves original data structure
- Provides detailed mapping statistics

In [29]:
import pandas as pd

df_cleaned = pd.read_csv(r'C:\NATIONAL AMR DATA ANALYSIS FILES\data\processed\deduplicated\df_cleaned_2025-06-11.csv')
df_organism_ref = pd.read_csv(r'C:\NATIONAL AMR DATA ANALYSIS FILES\data\Database Resources\Organisms_Data_Final.csv')
df_antimicrobial_ref = pd.read_csv(r'C:\NATIONAL AMR DATA ANALYSIS FILES\data\Database Resources\Antimicrobials_Data_Final.csv')

  exec(code_obj, self.user_global_ns, self.user_ns)


In [30]:
# First, let's examine the structure of both dataframes
print("df_cleaned shape:", df_cleaned.shape)
print("\nFirst few rows of ORGANISM_CODE in df_cleaned:")
print(df_cleaned['ORGANISM_CODE'].head(10))
print("\nUnique organism codes in df_cleaned (first 20):")
print(df_cleaned['ORGANISM_CODE'].unique()[:20])

print("\n" + "="*50)
print("\ndf_organism_ref shape:", df_organism_ref.shape)
print("\nFirst few rows of df_organism_ref:")
print(df_organism_ref[['ORGANISM_CODE', 'ORGANISM_NAME']].head(10))
print("\nUnique organism codes in reference (first 20):")
print(df_organism_ref['ORGANISM_CODE'].unique()[:20])

df_cleaned shape: (32688, 45)

First few rows of ORGANISM_CODE in df_cleaned:
0    xxx
1    xxx
2    xxx
3    xxx
4    xxx
5    xxx
6    xxx
7    xxx
8    xxx
9    xxx
Name: ORGANISM_CODE, dtype: object

Unique organism codes in df_cleaned (first 20):
['xxx' 'eco' 'ci-' 'kl-' 'scn' 'sep' 'sal' 'sau' 'ac-' 'sta' 'en-' 'pae'
 'ent' 'kpn' 'str' 'pr-' 'ste' 'ps-' 'pmi' 'sat']


df_organism_ref shape: (2946, 7)

First few rows of df_organism_ref:
  ORGANISM_CODE                           ORGANISM_NAME
0           NaN                           Nannizzia sp.
1           103                   Escherichia coli O103
2           104           Salmonella Typhimurium DT 104
3           111                   Escherichia coli O111
4           135  Neisseria meningitidis, serogroup W135
5           139                    Vibrio cholerae O139
6           145                   Escherichia coli O145
7           149                   Escherichia coli O149
8           157                Escherichia coli O1

In [31]:
# Prepare the data for case-insensitive mapping
# Clean and normalize the organism codes in both dataframes

# Create a copy of df_cleaned to work with
df_mapped = df_cleaned.copy()

# Clean and normalize organism codes
df_mapped['ORGANISM_CODE_CLEAN'] = df_mapped['ORGANISM_CODE'].astype(str).str.strip().str.lower()
df_organism_ref['ORGANISM_CODE_CLEAN'] = df_organism_ref['ORGANISM_CODE'].astype(str).str.strip().str.lower()

print("Sample of cleaned organism codes from df_cleaned:")
print(df_mapped['ORGANISM_CODE_CLEAN'].head(10))
print("\nSample of cleaned organism codes from reference:")
print(df_organism_ref['ORGANISM_CODE_CLEAN'].head(10))

# Create a mapping dictionary from the reference table
# Handle duplicate codes by taking the first occurrence
organism_mapping = df_organism_ref.drop_duplicates(subset=['ORGANISM_CODE_CLEAN']).set_index('ORGANISM_CODE_CLEAN')['ORGANISM_NAME'].to_dict()

print(f"\nTotal unique organism codes in reference: {len(organism_mapping)}")
print("Sample mapping entries:")
for i, (code, name) in enumerate(list(organism_mapping.items())[:5]):
    print(f"  {code} -> {name}")

# Perform the mapping
df_mapped['ORGANISM_NAME'] = df_mapped['ORGANISM_CODE_CLEAN'].map(organism_mapping)

# Check mapping results
print(f"\nMapping results:")
print(f"Total records in df_cleaned: {len(df_mapped)}")
print(f"Successfully mapped: {df_mapped['ORGANISM_NAME'].notna().sum()}")
print(f"Not mapped (null values): {df_mapped['ORGANISM_NAME'].isna().sum()}")

# Show some examples of mapped data
print("\nSample of mapped data:")
print(df_mapped[['ORGANISM_CODE', 'ORGANISM_NAME']].head(10))

# Check for unmapped codes
unmapped_codes = df_mapped[df_mapped['ORGANISM_NAME'].isna()]['ORGANISM_CODE_CLEAN'].unique()
if len(unmapped_codes) > 0:
    print(f"\nFirst 10 unmapped organism codes:")
    print(unmapped_codes[:10])

Sample of cleaned organism codes from df_cleaned:
0    xxx
1    xxx
2    xxx
3    xxx
4    xxx
5    xxx
6    xxx
7    xxx
8    xxx
9    xxx
Name: ORGANISM_CODE_CLEAN, dtype: object

Sample of cleaned organism codes from reference:
0    nan
1    103
2    104
3    111
4    135
5    139
6    145
7    149
8    157
9    1k1
Name: ORGANISM_CODE_CLEAN, dtype: object

Total unique organism codes in reference: 2353
Sample mapping entries:
  nan -> Nannizzia sp.
  103 -> Escherichia coli O103
  104 -> Salmonella Typhimurium DT 104
  111 -> Escherichia coli O111
  135 -> Neisseria meningitidis, serogroup W135

Mapping results:
Total records in df_cleaned: 32688
Successfully mapped: 32688
Not mapped (null values): 0

Sample of mapped data:
  ORGANISM_CODE ORGANISM_NAME
0           xxx     No growth
1           xxx     No growth
2           xxx     No growth
3           xxx     No growth
4           xxx     No growth
5           xxx     No growth
6           xxx     No growth
7           xxx     No