In [3]:
import pandas as pd

df = pd.read_csv('../data/Literature_research_vibrio.csv')
df_cleaned = df.dropna(axis=1, how='all')  # Remove empty columns
df_cleaned.head()

Unnamed: 0,Medium,Growth Rate (μ),Doubling Time,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,Notes,Source
0,BHIN-like (replicate),2.70 ± 0.03 h⁻¹,15.4 min,Not specified,0.5-1.5 h,10,5,-,5,20,-,-,-,2.5,-,-,-,-,Complex medium substitute,Hoffart 2017
1,BHIN-like filtered (replicate),3.54 ± 0.22 h⁻¹,9.4 min,Not specified,~2 h,10,5,-,5,20,-,-,-,2.5,-,-,-,-,Filter sterilize (0.22 μm),Hoffart 2017
2,LB3,Variable,~12-15 min,Not specified,Overnight,10,5,-,-,30,-,-,-,-,-,-,-,-,✓ EXACT MATCH,Eilers 2016
3,LBO-like (replicate),Variable,~12-15 min,Not specified,Overnight,10,5,-,-,20,2,0.5,-,-,-,-,0.1,-,Ocean salts approximation,Eilers 2016
4,VN Minimal (aerobic),1.48 ± 0.06 h⁻¹,28 min,~15.7,5 h,-,-,-,10,15,0.25,-,5,1,1,-,0.01,Yes,✓ EXACT MATCH - Best defined,Hoffart 2017


In [4]:
# Remove Doubling Time and Growth Rate columns
df_cleaned = df_cleaned.drop(columns=['Doubling Time', 'Growth Rate (μ)'])
df_cleaned.head()


Unnamed: 0,Medium,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,Notes,Source
0,BHIN-like (replicate),Not specified,0.5-1.5 h,10,5,-,5,20,-,-,-,2.5,-,-,-,-,Complex medium substitute,Hoffart 2017
1,BHIN-like filtered (replicate),Not specified,~2 h,10,5,-,5,20,-,-,-,2.5,-,-,-,-,Filter sterilize (0.22 μm),Hoffart 2017
2,LB3,Not specified,Overnight,10,5,-,-,30,-,-,-,-,-,-,-,-,✓ EXACT MATCH,Eilers 2016
3,LBO-like (replicate),Not specified,Overnight,10,5,-,-,20,2,0.5,-,-,-,-,0.1,-,Ocean salts approximation,Eilers 2016
4,VN Minimal (aerobic),~15.7,5 h,-,-,-,10,15,0.25,-,5,1,1,-,0.01,Yes,✓ EXACT MATCH - Best defined,Hoffart 2017


In [None]:
# Add HCl and NaOH columns (not present in the literature data, filled with '-')
df_cleaned['HCl'] = '-'
df_cleaned['NaOH'] = '-'
df_cleaned.columns 



In [7]:
# Replace all '-' values with 0
# Exclude non-numeric columns like 'Medium', 'Notes', 'Source', 'Final OD₆₀₀', 'Time'
exclude_cols = ['Medium', 'Notes', 'Source', 'Final OD₆₀₀', 'Time', 'Trace Metals']
numeric_cols = [col for col in df_cleaned.columns if col not in exclude_cols]

# Replace '-' with 0 in all numeric columns
for col in numeric_cols:
    df_cleaned[col] = df_cleaned[col].replace('-', 0)
    # Also handle any other non-numeric values
    df_cleaned[col] = pd.to_numeric(df_cleaned[col], errors='coerce').fillna(0)

df_cleaned.head()


  df_cleaned[col] = df_cleaned[col].replace('-', 0)


Unnamed: 0,Medium,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,Notes,Source,HCl,NaOH
0,BHIN-like (replicate),Not specified,0.5-1.5 h,10,5.0,0.0,5.0,20.0,0.0,0.0,0.0,2.5,0.0,0,0.0,-,Complex medium substitute,Hoffart 2017,0,0
1,BHIN-like filtered (replicate),Not specified,~2 h,10,5.0,0.0,5.0,20.0,0.0,0.0,0.0,2.5,0.0,0,0.0,-,Filter sterilize (0.22 μm),Hoffart 2017,0,0
2,LB3,Not specified,Overnight,10,5.0,0.0,0.0,30.0,0.0,0.0,0.0,0.0,0.0,0,0.0,-,✓ EXACT MATCH,Eilers 2016,0,0
3,LBO-like (replicate),Not specified,Overnight,10,5.0,0.0,0.0,20.0,2.0,0.5,0.0,0.0,0.0,0,0.1,-,Ocean salts approximation,Eilers 2016,0,0
4,VN Minimal (aerobic),~15.7,5 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,✓ EXACT MATCH - Best defined,Hoffart 2017,0,0


In [9]:
# Drop rows where Final OD₆₀₀ is 0 or not specified
# First convert Final OD₆₀₀ to numeric
df_cleaned['Final OD₆₀₀_num'] = pd.to_numeric(df_cleaned['Final OD₆₀₀'].astype(str).str.replace('~', '').str.replace('Not specified', ''), errors='coerce')

# Drop rows where Final OD₆₀₀ is 0 or NaN
df_cleaned = df_cleaned[df_cleaned['Final OD₆₀₀_num'] > 0]
df_cleaned = df_cleaned.drop(columns=['Final OD₆₀₀_num'])

print(f"Rows remaining: {len(df_cleaned)}")
df_cleaned


Rows remaining: 6


Unnamed: 0,Medium,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,Notes,Source,HCl,NaOH
4,VN Minimal (aerobic),~15.7,5 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,✓ EXACT MATCH - Best defined,Hoffart 2017,0,0
5,VN Minimal (anaerobic-like),~4.4,7 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,No NaHCO₃ available,Hoffart 2017,0,0
8,MSM Fed-batch (30°C),~204,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,High cell density - glucose fed,Fink 2021,0,0
9,MSM Fed-batch (37°C),~148,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,High cell density - glucose fed,Fink 2021,0,0
13,Anaerobic-like (MOPS),~5.5,216 h,0,0.0,0.0,10.0,15.0,0.5,0.0,5.0,2.0,2.0,10,0.02,Yes,MOPS buffer substitute,Gemünde 2023,0,0
14,VN Resting cells,~74,2 h,0,0.0,0.0,42.5,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,"✓ EXACT - High glucose, 20 g/L biomass",Hoffart 2017,0,0


In [11]:
# Remove Notes and Source columns
df_cleaned = df_cleaned.drop(columns=['Notes', 'Source'])
df_cleaned.head()


Unnamed: 0,Medium,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,HCl,NaOH
4,VN Minimal (aerobic),~15.7,5 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0
5,VN Minimal (anaerobic-like),~4.4,7 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0
8,MSM Fed-batch (30°C),~204,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0
9,MSM Fed-batch (37°C),~148,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0
13,Anaerobic-like (MOPS),~5.5,216 h,0,0.0,0.0,10.0,15.0,0.5,0.0,5.0,2.0,2.0,10,0.02,Yes,0,0


In [12]:
# Remove Medium column
df_cleaned = df_cleaned.drop(columns=['Medium'])
df_cleaned.head()


Unnamed: 0,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,HCl,NaOH
4,~15.7,5 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0
5,~4.4,7 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0
8,~204,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0
9,~148,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0
13,~5.5,216 h,0,0.0,0.0,10.0,15.0,0.5,0.0,5.0,2.0,2.0,10,0.02,Yes,0,0


In [13]:
# Create a growth rate column (Final OD₆₀₀ / Time)
# Convert Final OD₆₀₀ to numeric
df_cleaned['Final OD₆₀₀_num'] = pd.to_numeric(df_cleaned['Final OD₆₀₀'].astype(str).str.replace('~', ''), errors='coerce')

# Convert Time to numeric hours (extract numbers and assume hours)
# Handle cases like "5 h", "12 h", "~2 h", "Overnight", etc.
df_cleaned['Time_num'] = df_cleaned['Time'].astype(str).str.extract(r'(\d+\.?\d*)').astype(float)

# For Time values like "Overnight" or "Several h", set a default value (e.g., 12 hours)
df_cleaned.loc[df_cleaned['Time'].str.contains('Overnight', case=False, na=False), 'Time_num'] = 12
df_cleaned.loc[df_cleaned['Time'].str.contains('Several', case=False, na=False), 'Time_num'] = 24

# Calculate growth rate
df_cleaned['growth rate'] = df_cleaned['Final OD₆₀₀_num'] / df_cleaned['Time_num']

# Remove temporary columns
df_cleaned = df_cleaned.drop(columns=['Final OD₆₀₀_num', 'Time_num'])

df_cleaned.head()


Unnamed: 0,Final OD₆₀₀,Time,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,HCl,NaOH,growth rate
4,~15.7,5 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0,3.14
5,~4.4,7 h,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0,0.628571
8,~204,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0,17.0
9,~148,12 h,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0,12.333333
13,~5.5,216 h,0,0.0,0.0,10.0,15.0,0.5,0.0,5.0,2.0,2.0,10,0.02,Yes,0,0,0.025463


In [20]:
# Remove Final OD₆₀₀ and Time columns
df_cleaned = df_cleaned.drop(columns=['Final OD₆₀₀', 'Time'])
df_cleaned.head()


KeyError: "['Final OD₆₀₀', 'Time'] not found in axis"

In [21]:
# Add 'cell_dilution' column with value 1 for every row if it doesn't exist
if 'cell_dilution' not in df_cleaned.columns:
    df_cleaned['cell_dilution'] = 1.0

In [22]:
df_cleaned.head()

Unnamed: 0,Tryptone,Yeast Extract,Glycerol,Glucose,NaCl,MgSO₄,KCl,(NH₄)₂SO₄,K₂HPO₄,KH₂PO₄,MOPS,CaCl₂,Trace Metals,HCl,NaOH,growth rate,cell_dilution
4,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0,3.14,1.0
5,0,0.0,0.0,10.0,15.0,0.25,0.0,5.0,1.0,1.0,0,0.01,Yes,0,0,0.628571,1.0
8,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0,17.0,1.0
9,0,0.0,0.0,0.0,17.0,0.5,0.0,5.0,2.0,2.0,0,0.02,Yes,0,0,12.333333,1.0
13,0,0.0,0.0,10.0,15.0,0.5,0.0,5.0,2.0,2.0,10,0.02,Yes,0,0,0.025463,1.0


In [23]:
# Save the cleaned dataframe as CSV
df_cleaned.to_csv('../data/litterature_research_cleanup.csv', index=False)
print("DataFrame saved to '../data/litterature_research_cleanup.csv'")
print(f"Shape: {df_cleaned.shape}")


DataFrame saved to '../data/litterature_research_cleanup.csv'
Shape: (6, 17)
