In [None]:
import pandas as pd

# Raw S/I/R susceptibility table with isolate metadata.
file_path = '12 Abs SIR with metadata 1-85.csv'

# The file is not UTF-8: species names carry single bytes such as 0x98 (see the
# 'E. roggenkampii\x98' clean-up further down), which pandas' default decoder
# rejects with UnicodeDecodeError. Decode it as ISO-8859-1 instead.
data = pd.read_csv(file_path, encoding='ISO-8859-1')
data.head()

In [None]:
# Read the table again, decoding it explicitly as ISO-8859-1.
data = pd.read_csv(
    file_path,
    encoding='ISO-8859-1',
)
data.head()

In [None]:
# Scoring system: resistant = 2, intermediate = 1, susceptible = 0
scoring_system = {'R': 2, 'I': 1, 'S': 0}

# Select only the antibiotic columns (everything after the first five columns,
# which are assumed to be metadata -- TODO confirm against the CSV header).
# 'Total_Score' is appended to `data` below, so it is dropped here: otherwise
# re-running this cell would treat the previous total as one more antibiotic
# and add it into the new total.
antibiotic_columns = data.columns[5:].drop('Total_Score', errors='ignore')

# Translate S/I/R calls into numeric scores
scored_data = data[antibiotic_columns].replace(scoring_system)

# Sum the scores for each isolate
data['Total_Score'] = scored_data.sum(axis=1)

# Rank the isolates in descending order of total score
ranked_data = data.sort_values('Total_Score', ascending=False)
ranked_data[['Strain Number', 'Total_Score']].head()

In [None]:
import matplotlib.pyplot as plt

# Label each isolate with its sequenced species name when one is recorded,
# otherwise with its strain number.
species = ranked_data['Genome Sequencing']
is_sequenced = (species.notna() & (species != 'Not Sequenced')).tolist()
labels = [
    str(name) if sequenced else str(strain)
    for name, strain, sequenced in zip(species, ranked_data['Strain Number'], is_sequenced)
]

# Plot the resistance scores.
# Bars are placed at integer positions: several isolates can share a species
# name, and passing repeated strings as x would stack them on one position.
positions = list(range(len(labels)))
plt.figure(figsize=(15, 5))
plt.bar(positions, ranked_data['Total_Score'])
_, tick_texts = plt.xticks(positions, labels, rotation=90)

# Italicise species names through the tick font style. The previous
# '\textit{...}' string literal started with a tab escape ('\t'), so the
# labels came out as "<tab>extit{...}" instead of italic text.
for tick_text, sequenced in zip(tick_texts, is_sequenced):
    if sequenced:
        tick_text.set_fontstyle('italic')

plt.xlabel('Isolates')
plt.ylabel('Resistance Score')
plt.title('Resistance Score of Bacterial Isolates')
plt.tight_layout()
plt.show()

In [None]:
# Bars are labelled with the strain number rendered as text.
labels = ranked_data['Strain Number'].astype(str)

# Bar chart of the total resistance score per strain.
plt.figure(figsize=(20, 10))
plt.bar(x=labels, height=ranked_data['Total_Score'])
plt.xticks(rotation=90)
plt.xlabel(xlabel='Strain Number')
plt.ylabel(ylabel='Resistance Score')

plt.tight_layout()
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Strain numbers, as text, label the bars.
labels = ranked_data['Strain Number'].astype(str)

# Bars sit at consecutive integer positions; the width sets the gap between them.
bar_width = 0.8  # Adjust this value as needed
bar_positions = range(len(labels))

# Bar chart of the total resistance score per strain.
plt.figure(figsize=(20, 10))
plt.bar(
    bar_positions,
    ranked_data['Total_Score'],
    width=bar_width,
    tick_label=labels,
)

# Larger fonts for the tick labels and the axis titles.
plt.xticks(fontsize=15, rotation=90)
plt.yticks(fontsize=14)
plt.xlabel('Strain Number', fontsize=20)
plt.ylabel('Resistance Score', fontsize=20)

plt.tight_layout()
plt.show()


In [None]:
# Keep strains 36-85 (both ends included), the range treated here as genome-sequenced.
genome_sequenced_data = ranked_data[ranked_data['Strain Number'].between(36, 85)]

# Mean total score per species, as a two-column frame.
average_scores = (
    genome_sequenced_data
    .groupby('Genome Sequencing')['Total_Score']
    .mean()
    .reset_index()
)

# Highest average score first.
average_scores_sorted = average_scores.sort_values(by='Total_Score', ascending=False)
average_scores_sorted.head()

In [None]:
import matplotlib.pyplot as plt

# Bar chart of the mean resistance score per species (uses average_scores_sorted).

plt.figure(figsize=(20, 10))
plt.bar(
    x=average_scores_sorted['Genome Sequencing'],
    height=average_scores_sorted['Total_Score'],
)

# Large fonts for tick labels and axis titles.
plt.xticks(fontsize=20, rotation=90)
plt.yticks(fontsize=20)
plt.xlabel('Species', fontsize=25)
plt.ylabel('Average Resistance Score', fontsize=20)

plt.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Bar chart of the mean resistance score per species, with italic species names.

plt.figure(figsize=(20, 10))
plt.bar(average_scores_sorted['Genome Sequencing'], average_scores_sorted['Total_Score'])

# Italicise the species names through the tick-label font style.
# Wrapping each name in "$...$" hands it to mathtext, which ignores spaces
# (so "E. roggenkampii" is drawn as "E.roggenkampii") and gives special
# meaning to characters such as "_" and "^".
species_names = average_scores_sorted['Genome Sequencing']
plt.xticks(range(len(species_names)), species_names, rotation=90, fontsize=20, fontstyle='italic')

plt.yticks(fontsize=20)  # Larger y-axis tick labels
plt.xlabel('Species', fontsize=25)
plt.ylabel('Average Resistance Score', fontsize=20)

plt.tight_layout()
plt.show()



In [None]:
# Pull out the stored spelling of the first species name matching 'E. roggenkampii',
# so that any hidden extra characters show up in the repr.
is_roggenkampii = average_scores_sorted['Genome Sequencing'].str.contains('E. roggenkampii')
roggenkampii_name = average_scores_sorted.loc[is_roggenkampii, 'Genome Sequencing'].iloc[0]
roggenkampii_name

In [None]:
# Remove the special character from the species name *before* averaging.
# Renaming the already-aggregated rows could leave two separate
# 'E. roggenkampii' rows (one per spelling), each with its own mean and both
# drawn at the same x position; cleaning first pools them into one group.
cleaned_species = genome_sequenced_data['Genome Sequencing'].replace('E. roggenkampii\x98', 'E. roggenkampii')
average_scores_sorted = (
    genome_sequenced_data
    .assign(**{'Genome Sequencing': cleaned_species})
    .groupby('Genome Sequencing')['Total_Score']
    .mean()
    .reset_index()
    .sort_values('Total_Score', ascending=False)
)

# Plot the average resistance scores again
plt.figure(figsize=(15, 5))
plt.bar(average_scores_sorted['Genome Sequencing'], average_scores_sorted['Total_Score'])
plt.xticks(rotation=90)
plt.xlabel('Species')
plt.ylabel('Average Resistance Score')
plt.title('Average Resistance Score of Bacterial Species (Strains 36-85)')
plt.tight_layout()
plt.show()

In [None]:
# Read the updated CSV (ISO-8859-1 encoded) and preview its first rows.
updated_file_path = '12 Abs SIR with metadata 1-85 updated.csv'
updated_data = pd.read_csv(
    updated_file_path,
    encoding='ISO-8859-1',
)
updated_data.head()

In [None]:
from scipy.stats import chi2_contingency

# Drop rows whose 'Type of Isolate' is 'Type Strain'.
data_without_type_strain = updated_data.loc[updated_data['Type of Isolate'] != 'Type Strain']

# antibiotic -> chi-squared p-value
p_values = {}

for antibiotic in antibiotic_columns:
    # Resistance call (rows) against geographical location (columns).
    contingency_table = pd.crosstab(
        index=data_without_type_strain[antibiotic],
        columns=data_without_type_strain['Geographical location'],
    )
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    p_values[antibiotic] = p

# Two-column frame of the results, one row per antibiotic.
p_values_df = pd.DataFrame(
    {
        'Antibiotic': list(p_values.keys()),
        'P-Value': list(p_values.values()),
    }
)
p_values_df

In [None]:
# Bar chart of the per-antibiotic p-values on a log axis; bars below 0.05 are red.
bar_colors = []
for p in p_values_df['P-Value']:
    bar_colors.append('red' if p < 0.05 else 'blue')

plt.figure(figsize=(15, 5))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=bar_colors)
plt.axhline(y=0.05, linestyle='--', color='gray')  # significance threshold
plt.xticks(rotation=90)
plt.xlabel('Antibiotic')
plt.ylabel('P-Value')
plt.title('P-Values for Association Between Resistance Profile and Geographical Location')
plt.yscale('log')
plt.tight_layout()
plt.show()

In [None]:
# Antibiotics for which every isolate carries the same (single) resistance call.
independent_antibiotics = [
    antibiotic
    for antibiotic in antibiotic_columns
    if data_without_type_strain[antibiotic].nunique() == 1
]

# One row per such antibiotic, flagged 'Yes'.
independent_antibiotics_df = pd.DataFrame({'Antibiotic': independent_antibiotics})
independent_antibiotics_df['Independence'] = 'Yes'
independent_antibiotics_df

In [None]:
# Display the antibiotics flagged as 'independent' earlier in the notebook: those for
# which all isolates (type strain excluded) share one resistance category. With no
# variation there is nothing to associate with geographical location.
independent_antibiotics_df

In [None]:
# Keep every row whose 'Type of Isolate' is not 'Type Strain'.
is_not_type_strain = updated_data['Type of Isolate'].ne('Type Strain')
data_without_type_strain = updated_data[is_not_type_strain]

# Preview the result to confirm the type strain is gone.
data_without_type_strain.head()

In [None]:
# For each antibiotic, check whether the same resistance categories are observed at
# every geographical location. This is a coarse screen, not a statistical test.
independence_results = []
for antibiotic in antibiotic_columns:
    # Set of categories seen at each location (missing calls ignored).
    # The sets themselves are compared, not just how many categories each
    # location has: counting alone reports "all R here, all S there" as
    # independent, because both locations have exactly one category.
    categories_by_location = {
        location: frozenset(calls.dropna())
        for location, calls in data_without_type_strain.groupby('Geographical location')[antibiotic]
    }
    is_independent = 'Yes' if len(set(categories_by_location.values())) == 1 else 'No'
    independence_results.append((antibiotic, is_independent))

# Convert the results to a DataFrame for easy visualization
independence_results_df = pd.DataFrame(independence_results, columns=['Antibiotic', 'Independence'])
independence_results_df

In [None]:
# Chi-squared test of independence (resistance call vs. geographical location)
# for each antibiotic. Antibiotics with no variation get a text marker instead of a p-value.
p_values_results = []
for antibiotic in antibiotic_columns:
    # Contingency table: resistance call (rows) x geographical location (columns)
    contingency_table = pd.crosstab(data_without_type_strain[antibiotic], data_without_type_strain['Geographical location'])
    # Decide applicability *before* testing: with fewer than two observed
    # categories there is nothing to test, and an empty table would make
    # chi2_contingency raise.
    if contingency_table.shape[0] > 1:
        chi2, p, dof, expected = chi2_contingency(contingency_table)
        p_values_results.append((antibiotic, p))
    else:
        p_values_results.append((antibiotic, 'Independent (No Variation)'))

# Convert the results to a DataFrame for easy visualization
p_values_results_df = pd.DataFrame(p_values_results, columns=['Antibiotic', 'P-Value'])
p_values_results_df

In [None]:
# Bar chart of the p-values; entries that are not floats (the "no variation"
# text marker) are drawn as 1. Bars below 0.05 are red, the rest blue.
plotted_p_values = p_values_results_df['P-Value'].apply(lambda x: x if isinstance(x, float) else 1)
bar_colors = ['red' if p < 0.05 else 'blue' for p in plotted_p_values]

plt.figure(figsize=(20,10))
plt.bar(p_values_results_df['Antibiotic'], plotted_p_values, color=bar_colors)
plt.axhline(y=0.05, linestyle='--', color='gray')  # significance threshold
plt.xticks(rotation=90)
plt.xlabel('Antibiotic')
plt.ylabel('P-Value')
plt.title('P-Values for Association Between Resistance Profile and Geographical Location')

plt.tight_layout()
plt.show()

In [None]:
# Percentage of isolates in each 'Tigecyclin 15' category, per geographical location
# (rows: locations, columns: categories, absent combinations shown as 0).
tigecyclin_data = (
    data_without_type_strain
    .groupby('Geographical location')['Tigecyclin 15']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    .mul(100)
)
tigecyclin_data

In [None]:
# Chi-squared test of independence: 'Tigecyclin 15' call vs. geographical location.
contingency_table_tigecyclin = pd.crosstab(
    index=data_without_type_strain['Tigecyclin 15'],
    columns=data_without_type_strain['Geographical location'],
)
(
    chi2_tigecyclin,
    p_tigecyclin,
    dof_tigecyclin,
    expected_tigecyclin,
) = chi2_contingency(contingency_table_tigecyclin)
p_tigecyclin

In [None]:
# Percentage of isolates in each 'Aztreonam 30' category, per geographical location
# (rows: locations, columns: categories, absent combinations shown as 0).
aztreonam_data = (
    data_without_type_strain
    .groupby('Geographical location')['Aztreonam 30']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    .mul(100)
)
aztreonam_data

In [None]:
# Chi-squared test of independence: 'Aztreonam 30' call vs. geographical location.
contingency_table_aztreonam = pd.crosstab(
    index=data_without_type_strain['Aztreonam 30'],
    columns=data_without_type_strain['Geographical location'],
)
(
    chi2_aztreonam,
    p_aztreonam,
    dof_aztreonam,
    expected_aztreonam,
) = chi2_contingency(contingency_table_aztreonam)
p_aztreonam

In [None]:
# Import geopy to get latitude and longitude coordinates
from geopy.geocoders import Nominatim


def get_coordinates(location):
    """Geocode a place name with Nominatim.

    Args:
        location: Free-text place name passed to the geocoder.

    Returns:
        A ``(latitude, longitude)`` tuple, or ``(nan, nan)`` when the
        geocoder finds no match (``geocode`` returns ``None`` in that case).
    """
    geolocator = Nominatim(user_agent="geoapi")
    location_obj = geolocator.geocode(location)
    if location_obj is None:
        return float('nan'), float('nan')
    return location_obj.latitude, location_obj.longitude


# Prepare data for Tigecyclin 15
tigecyclin_data['Location'] = tigecyclin_data.index
# One geocoder request per location; latitude and longitude come from the same answer.
location_coordinates = [get_coordinates(name) for name in tigecyclin_data['Location']]
tigecyclin_data['Latitude'] = [latitude for latitude, _ in location_coordinates]
tigecyclin_data['Longitude'] = [longitude for _, longitude in location_coordinates]
# A location counts as 'High' when more than half of its isolates are resistant.
tigecyclin_data['Resistance Level'] = tigecyclin_data['R'].apply(lambda x: 'High' if x > 50 else 'Low')
tigecyclin_data

In [None]:
# Install the geopy library to work with geographical coordinates
!pip install -q geopy

In [None]:
# Manually specified (latitude, longitude) for each location
coordinates = {
    'SA Limpopo province': (-23.4013, 29.4179),
    'Switzerland': (46.8182, 8.2275),
    'USA (various states)': (37.0902, -95.7129)
}

# Prepare data for Tigecyclin 15 using the specified coordinates
tigecyclin_data['Location'] = tigecyclin_data.index
# Look names up with surrounding whitespace removed, and map rather than index:
# a name with a stray space, or one without manual coordinates, then yields NaN
# instead of aborting the cell with a KeyError.
location_names = tigecyclin_data['Location'].str.strip()
tigecyclin_data['Latitude'] = location_names.map({name: lat for name, (lat, lon) in coordinates.items()})
tigecyclin_data['Longitude'] = location_names.map({name: lon for name, (lat, lon) in coordinates.items()})
# A location counts as 'High' when more than half of its isolates are resistant.
tigecyclin_data['Resistance Level'] = tigecyclin_data['R'].apply(lambda x: 'High' if x > 50 else 'Low')
tigecyclin_data

In [None]:
# Trim extra spaces from location names
tigecyclin_data.index = tigecyclin_data.index.str.strip()

# Prepare data for Tigecyclin 15 using the specified coordinates
tigecyclin_data['Location'] = tigecyclin_data.index
# Map instead of indexing the dict: a location without manual coordinates
# then gets NaN rather than raising KeyError.
tigecyclin_data['Latitude'] = tigecyclin_data['Location'].map({name: lat for name, (lat, lon) in coordinates.items()})
tigecyclin_data['Longitude'] = tigecyclin_data['Location'].map({name: lon for name, (lat, lon) in coordinates.items()})
# A location counts as 'High' when more than half of its isolates are resistant.
tigecyclin_data['Resistance Level'] = tigecyclin_data['R'].apply(lambda x: 'High' if x > 50 else 'Low')
tigecyclin_data

In [None]:
# Filter out the 'Type Strain' row. Copy so the column assignments below act on
# an independent frame rather than on a slice of tigecyclin_data.
tigecyclin_data_filtered = tigecyclin_data.loc[tigecyclin_data.index != 'Type Strain'].copy()

# Prepare data for Tigecyclin 15 using the specified coordinates
tigecyclin_data_filtered['Location'] = tigecyclin_data_filtered.index
location_names = tigecyclin_data_filtered['Location'].str.strip()
# Split each (latitude, longitude) pair into its own column; previously the
# whole tuple was stored under 'Latitude' and no 'Longitude' was set.
tigecyclin_data_filtered['Latitude'] = location_names.map({name: lat for name, (lat, lon) in coordinates.items()})
tigecyclin_data_filtered['Longitude'] = location_names.map({name: lon for name, (lat, lon) in coordinates.items()})
tigecyclin_data_filtered['Resistance Level'] = tigecyclin_data_filtered['R'].apply(lambda x: 'High' if x > 50 else 'Low')
tigecyclin_data_filtered

In [None]:
# Extract state information for the USA and obtain coordinates.
# Copy so the assignments below do not act on a slice of tigecyclin_data.
usa_states_data = tigecyclin_data.loc[tigecyclin_data.index.str.contains('USA')].copy()
# expand=False returns the captured text in row order. The default returns a
# DataFrame with a fresh 0..n-1 index, which aligns with none of the location
# labels and would fill 'State' with NaN.
usa_states_data['State'] = usa_states_data.index.str.extract(r'\((.*?)\)', expand=False)
# One geocoder request per state; both coordinates come from the same answer.
state_coordinates = pd.DataFrame(
    [get_coordinates(state) for state in usa_states_data['State']],
    columns=['Latitude', 'Longitude'],
)
usa_states_data['Latitude'] = state_coordinates['Latitude'].to_numpy()
usa_states_data['Longitude'] = state_coordinates['Longitude'].to_numpy()
usa_states_data['Resistance Level'] = usa_states_data['R'].apply(lambda x: 'High' if x > 50 else 'Low')
usa_states_data

In [None]:
# Modify state names to include 'USA' and obtain coordinates.
# Work on a copy so the assignments never target a slice of tigecyclin_data.
usa_states_data = usa_states_data.copy()
usa_states_data['State'] = usa_states_data['Location'].apply(lambda x: x.replace('USA ', '') + ', USA')
# One geocoder request per state instead of separate latitude and longitude lookups.
state_coordinates = [get_coordinates(state) for state in usa_states_data['State']]
usa_states_data['Latitude'] = [latitude for latitude, _ in state_coordinates]
usa_states_data['Longitude'] = [longitude for _, longitude in state_coordinates]
usa_states_data

In [None]:
# plotly is first imported in a later cell; import it here so this cell
# does not fail with NameError on a fresh kernel.
import plotly.express as px

# Combine USA states data with other locations.
# Copy so the assignments below do not act on a slice of tigecyclin_data.
other_locations_data = tigecyclin_data.loc[~tigecyclin_data.index.str.contains('USA|Type Strain')].copy()
other_locations_data['State'] = other_locations_data.index
# Map instead of indexing the dict: a location without manual coordinates
# gets NaN rather than raising KeyError.
location_names = other_locations_data['State'].str.strip()
other_locations_data['Latitude'] = location_names.map({name: lat for name, (lat, lon) in coordinates.items()})
other_locations_data['Longitude'] = location_names.map({name: lon for name, (lat, lon) in coordinates.items()})
other_locations_data['Resistance Level'] = other_locations_data['R'].apply(lambda x: 'High' if x > 50 else 'Low')

# Concatenate with USA states data
final_tigecyclin_data = pd.concat([usa_states_data, other_locations_data])

# Plot the map
fig = px.scatter_geo(final_tigecyclin_data, lat='Latitude', lon='Longitude',
                     text='State', color='Resistance Level',
                     projection='natural earth', title='Resistance Levels for Tigecyclin 15')
fig.show()

In [None]:
import plotly.express as px

# World map with one marker per location, coloured by resistance level.
fig = px.scatter_geo(
    final_tigecyclin_data,
    lat='Latitude',
    lon='Longitude',
    color='Resistance Level',
    text='State',
    title='Resistance Levels for Tigecyclin 15',
    projection='natural earth',
)
fig.show()

In [None]:
# Prepare data for Tigecyclin 15 and Aztreonam 30 resistance across locations.
# Strip the location labels on both sides before combining: the tigecyclin
# index was trimmed earlier while the aztreonam index was not, so aligning
# them as-is leaves NaN wherever a label carried stray spaces.
tigecyclin_resistance = tigecyclin_data['R'].rename(index=str.strip)
aztreonam_resistance = aztreonam_data['R'].rename(index=str.strip)
resistance_data = pd.DataFrame({
    'Tigecyclin 15 Resistance': tigecyclin_resistance,
    'Aztreonam 30 Resistance': aztreonam_resistance.reindex(tigecyclin_resistance.index),
})
# Name the index 'Location' before resetting it. The index already has a name,
# so reset_index() never produces an 'index' column to rename, and melt()
# would fail to find 'Location'.
resistance_data = resistance_data.rename_axis('Location').reset_index()
resistance_data = resistance_data.melt(id_vars=['Location'], value_vars=['Tigecyclin 15 Resistance', 'Aztreonam 30 Resistance'], var_name='Antibiotic', value_name='Resistance')

# Plot the bar chart
fig = px.bar(resistance_data, x='Location', y='Resistance', color='Antibiotic', barmode='group', title='Resistance Percentage for Tigecyclin 15 and Aztreonam 30 Across Locations')
fig.show()

In [None]:
# Prepare data for Tigecyclin 15 and Aztreonam 30 resistance across locations.
# Strip the location labels on both sides before combining: the tigecyclin
# index was trimmed earlier while the aztreonam index was not, so aligning
# them as-is leaves NaN wherever a label carried stray spaces.
tigecyclin_resistance = tigecyclin_data['R'].rename(index=str.strip)
aztreonam_resistance = aztreonam_data['R'].rename(index=str.strip)
resistance_data = pd.DataFrame({
    'Tigecyclin 15 Resistance': tigecyclin_resistance,
    'Aztreonam 30 Resistance': aztreonam_resistance.reindex(tigecyclin_resistance.index),
})
resistance_data['Location'] = resistance_data.index
# Long format: one row per (location, antibiotic) pair.
resistance_data = resistance_data.melt(id_vars=['Location'], value_vars=['Tigecyclin 15 Resistance', 'Aztreonam 30 Resistance'], var_name='Antibiotic', value_name='Resistance')

# Plot the bar chart
fig = px.bar(resistance_data, x='Location', y='Resistance', color='Antibiotic', barmode='group', title='Resistance Percentage for Tigecyclin 15 and Aztreonam 30 Across Locations')
fig.show()

In [None]:
# Line chart: resistance percentage per location, one line per antibiotic.
fig = px.line(
    resistance_data,
    x='Location',
    y='Resistance',
    color='Antibiotic',
    title='Resistance Trend for Tigecyclin 15 and Aztreonam 30 Across Locations',
)
fig.show()

In [None]:
# Grouped bar chart: resistance percentage per location, one bar per antibiotic.
fig = px.bar(
    resistance_data,
    x='Location',
    y='Resistance',
    color='Antibiotic',
    barmode='group',
    title='Resistance Comparison for Tigecyclin 15 and Aztreonam 30 Across Locations',
)
fig.show()

In [None]:
# Prepare data for scatter plot: one row per location, one column per antibiotic.
# Strip the location labels on both sides before combining: the tigecyclin
# index was trimmed earlier while the aztreonam index was not, so aligning
# them as-is leaves NaN wherever a label carried stray spaces.
tigecyclin_resistance = tigecyclin_data['R'].rename(index=str.strip)
aztreonam_resistance = aztreonam_data['R'].rename(index=str.strip)
scatter_data = pd.DataFrame({
    'Tigecyclin 15 Resistance': tigecyclin_resistance,
    'Aztreonam 30 Resistance': aztreonam_resistance.reindex(tigecyclin_resistance.index),
})
# Name the index 'Location' before turning it into a column; the index already
# has a name, so reset_index() alone never yields an 'index' column to rename.
scatter_data = scatter_data.rename_axis('Location').reset_index()

# Plot the scatter plot with trend line
fig = px.scatter(scatter_data, x='Tigecyclin 15 Resistance', y='Aztreonam 30 Resistance', trendline='ols', title='Correlation Between Resistance Levels for Tigecyclin 15 and Aztreonam 30')
fig.show()

In [None]:
# Exclude both type strains from the per-location percentage tables
excluded_type_strains_data_tigecyclin = tigecyclin_data[~tigecyclin_data.index.str.contains('Type Strain')]
excluded_type_strains_data_aztreonam = aztreonam_data[~aztreonam_data.index.str.contains('Type Strain')]

# The chi-square test needs observed counts. The tables above hold row
# percentages (and, for tigecyclin, text columns such as 'Location' added by
# the mapping cells), so the counts are rebuilt from the isolate-level data.
isolates = data_without_type_strain[~data_without_type_strain['Geographical location'].str.contains('Type Strain', na=False)]

# Perform Chi-square test for Tigecyclin 15
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Tigecyclin 15'])
)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Aztreonam 30'])
)

p_tigecyclin, p_aztreonam

In [None]:
# Select only the resistance-category columns (excluding metadata columns).
# Selecting by label keeps exactly the S/I/R columns that exist and gives the
# same result on every re-run; slicing by position (iloc[:, 2:]) depends on
# column order and trims two more columns each time the cell runs.
excluded_type_strains_data_tigecyclin = excluded_type_strains_data_tigecyclin.filter(items=['I', 'R', 'S'])
excluded_type_strains_data_aztreonam = excluded_type_strains_data_aztreonam.filter(items=['I', 'R', 'S'])

# The chi-square test needs observed counts, not the row percentages held in
# the tables above, so the counts are rebuilt from the isolate-level data.
isolates = data_without_type_strain[~data_without_type_strain['Geographical location'].str.contains('Type Strain', na=False)]

# Perform Chi-square test for Tigecyclin 15
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Tigecyclin 15'])
)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Aztreonam 30'])
)

p_tigecyclin, p_aztreonam

In [None]:
# Display the first few rows of the data to understand the structure.
# The cell's value is a tuple of two frames, so they are shown with the plain
# tuple repr rather than as rendered tables.
tigecyclin_data.head(), aztreonam_data.head()

In [None]:
# Exclude both type strains from the per-location percentage tables.
# errors='ignore' keeps the cell running when no 'Type Strain' row is present.
excluded_type_strains_data_tigecyclin = tigecyclin_data.drop(index='Type Strain', errors='ignore')
excluded_type_strains_data_aztreonam = aztreonam_data.drop(index='Type Strain', errors='ignore')

# The chi-square test needs observed counts. The tables above hold row
# percentages (and, for tigecyclin, text columns such as 'Location' added by
# the mapping cells), so the counts are rebuilt from the isolate-level data.
isolates = data_without_type_strain[~data_without_type_strain['Geographical location'].str.contains('Type Strain', na=False)]

# Perform Chi-square test for Tigecyclin 15
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Tigecyclin 15'])
)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Aztreonam 30'])
)

p_tigecyclin, p_aztreonam

In [None]:
# Exclude strains 83 and 84 (columns 84 and 85) from the dataset
# NOTE(review): tigecyclin_data / aztreonam_data are built earlier as
# location x resistance-category percentage tables, so integer column labels
# 84 and 85 are not expected to exist and drop() would raise KeyError --
# TODO confirm; individual strains can only be removed from isolate-level data.
excluded_type_strains_data_tigecyclin = tigecyclin_data.drop(columns=[84, 85])
excluded_type_strains_data_aztreonam = aztreonam_data.drop(columns=[84, 85])

# Perform Chi-square test for Tigecyclin 15
# NOTE(review): the input holds percentages, and tigecyclin_data also carries
# text columns added by the mapping cells; chi2_contingency expects observed counts.
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(excluded_type_strains_data_tigecyclin)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(excluded_type_strains_data_aztreonam)

# Show both p-values as the cell output
p_tigecyclin, p_aztreonam

In [None]:
# Exclude strains 83 and 84 (columns 83 and 84) from the dataset
# NOTE(review): tigecyclin_data / aztreonam_data are built earlier as
# location x resistance-category percentage tables, so integer column labels
# 83 and 84 are not expected to exist and drop() would raise KeyError --
# TODO confirm; individual strains can only be removed from isolate-level data.
excluded_type_strains_data_tigecyclin = tigecyclin_data.drop(columns=[83, 84])
excluded_type_strains_data_aztreonam = aztreonam_data.drop(columns=[83, 84])

# Perform Chi-square test for Tigecyclin 15
# NOTE(review): the input holds percentages, and tigecyclin_data also carries
# text columns added by the mapping cells; chi2_contingency expects observed counts.
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(excluded_type_strains_data_tigecyclin)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(excluded_type_strains_data_aztreonam)

# Show both p-values as the cell output
p_tigecyclin, p_aztreonam

In [None]:
# Display the first few rows of the data to understand the structure.
# The cell's value is a tuple of two frames, so they are shown with the plain
# tuple repr rather than as rendered tables.
tigecyclin_data.head(), aztreonam_data.head()

In [None]:
# Exclude the 'Type Strain' row from the per-location percentage tables.
# errors='ignore' keeps the cell running when no 'Type Strain' row is present.
excluded_type_strains_data_tigecyclin = tigecyclin_data.drop(index='Type Strain', errors='ignore')
excluded_type_strains_data_aztreonam = aztreonam_data.drop(index='Type Strain', errors='ignore')

# The chi-square test needs observed counts. The tables above hold row
# percentages (and, for tigecyclin, text columns such as 'Location' added by
# the mapping cells), so the counts are rebuilt from the isolate-level data.
isolates = data_without_type_strain[~data_without_type_strain['Geographical location'].str.contains('Type Strain', na=False)]

# Perform Chi-square test for Tigecyclin 15
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Tigecyclin 15'])
)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Aztreonam 30'])
)

p_tigecyclin, p_aztreonam

In [None]:
# Remove the row labelled 'Type Strain' from each per-location table.
excluded_type_strains_data_tigecyclin = tigecyclin_data.drop(labels='Type Strain', axis='index')
excluded_type_strains_data_aztreonam = aztreonam_data.drop(labels='Type Strain', axis='index')

# Preview both trimmed tables (shown together as a tuple).
excluded_type_strains_data_tigecyclin.head(), excluded_type_strains_data_aztreonam.head()

In [None]:
# Rounded percentage tables per location (these are what the stacked bar
# charts further down draw, labelled 'Percentage').
contingency_table_tigecyclin = excluded_type_strains_data_tigecyclin[['R', 'S']].round().astype(int)
contingency_table_aztreonam = excluded_type_strains_data_aztreonam[['I', 'R', 'S']].round().astype(int)

# The chi-square test itself runs on observed isolate counts. Feeding it
# rounded percentages treats every location as if it had about 100 isolates,
# which makes the statistic and p-value meaningless.
isolates = data_without_type_strain[~data_without_type_strain['Geographical location'].str.contains('Type Strain', na=False)]

# Perform Chi-square test for Tigecyclin 15
chi2_tigecyclin, p_tigecyclin, dof_tigecyclin, expected_tigecyclin = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Tigecyclin 15'])
)

# Perform Chi-square test for Aztreonam 30
chi2_aztreonam, p_aztreonam, dof_aztreonam, expected_aztreonam = chi2_contingency(
    pd.crosstab(isolates['Geographical location'], isolates['Aztreonam 30'])
)

p_tigecyclin, p_aztreonam

In [None]:
# Reproducing Figure 2 (Excluding Type Strains) for Tigecyclin.
# Drawn with matplotlib: `sns` is never imported in this notebook, so the
# seaborn call raised NameError on a fresh kernel.
plt.figure(figsize=(10, 6))
plt.bar(
    excluded_type_strains_data_tigecyclin.index.astype(str),
    excluded_type_strains_data_tigecyclin['R'],
    color='red',
)
plt.xticks(rotation=90)
plt.xlabel('Geographical location')
plt.ylabel('Resistance Percentage')
plt.title('Resistance to Tigecyclin 15 Across Different Geographical Locations (Excluding Type Strains)')
plt.show()

In [None]:
# Two stacked bar charts, one above the other: Tigecyclin on top, Aztreonam below.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 12))

# Top panel: Tigecyclin resistance profile per geographical location.
contingency_table_tigecyclin.plot.bar(stacked=True, ax=axes[0], title='Tigecyclin Resistance Profile')
axes[0].set_ylabel('Percentage')

# Bottom panel: Aztreonam resistance profile per geographical location.
contingency_table_aztreonam.plot.bar(stacked=True, ax=axes[1], title='Aztreonam Resistance Profile')
axes[1].set_ylabel('Percentage')

plt.tight_layout()

In [None]:
# Same two stacked bar charts as before, without titles and with the legend
# pinned to the upper-right corner of each panel.
fig, axes = plt.subplots(nrows=2, ncols=1, figsize=(12, 12))

# Top panel: Tigecyclin resistance profile per geographical location.
contingency_table_tigecyclin.plot.bar(stacked=True, ax=axes[0])
axes[0].set_ylabel('Percentage')
axes[0].legend(loc='upper right')

# Bottom panel: Aztreonam resistance profile per geographical location.
contingency_table_aztreonam.plot.bar(stacked=True, ax=axes[1])
axes[1].set_ylabel('Percentage')
axes[1].legend(loc='upper right')

plt.tight_layout()

In [None]:
# List of (antibiotic, p-value) pairs
p_values = []

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Iterate through the antibiotics data (excluding Amikacin and type strains).
# NOTE(review): `antibiotics_data` is not defined in the visible part of the
# notebook; presumably a dict of antibiotic -> location x category table -- confirm.
# The loop variable is deliberately not called `data`, which would overwrite
# the raw DataFrame loaded at the top of the notebook.
for antibiotic, location_table in antibiotics_data.items():
    if antibiotic != 'Amikacin 30':
        # Exclude the 'Type Strain' row
        data_excluded_type_strains = location_table.drop(index='Type Strain')
        # Perform Chi-square test
        chi2, p, dof, expected = chi2_contingency(data_excluded_type_strains)
        p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (significant bars in red)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.title('P-Values for Chi-Square Test (Excluding Amikacin)')
plt.tight_layout()

In [None]:
# List of (antibiotic, p-value) pairs
p_values = []

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Iterate through the antibiotics data (excluding Amikacin and type strains).
# NOTE(review): `antibiotics_columns` is not defined in the visible part of the
# notebook (`antibiotic_columns` is) -- confirm which name is intended. This
# cell also expects `resistance_data` to have one column per antibiotic plus 'Location'.
for antibiotic in antibiotics_columns:
    if antibiotic != 'Amikacin 30':
        # Count isolates per (location, resistance category). crosstab counts
        # rows directly; pivot_table without a `values` column has nothing to
        # aggregate. The table is not stored in `data`, which would overwrite
        # the raw DataFrame loaded at the top of the notebook.
        location_counts = pd.crosstab(resistance_data['Location'], resistance_data[antibiotic])
        # Exclude the 'Type Strain' row
        data_excluded_type_strains = location_counts.drop(index='Type Strain')
        # Perform Chi-square test
        chi2, p, dof, expected = chi2_contingency(data_excluded_type_strains)
        p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (significant bars in red)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.title('P-Values for Chi-Square Test (Excluding Amikacin)')
plt.tight_layout()

In [None]:
# List of (antibiotic, p-value) pairs
p_values = []

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Iterate through the antibiotics data (excluding Amikacin and type strains).
# NOTE(review): `antibiotics_excluding_type_strains` is not defined in the
# visible part of the notebook; the loop assumes 'Location' is its last column -- confirm.
for antibiotic in antibiotics_excluding_type_strains.columns[:-1]:
    if antibiotic != 'Amikacin 30':
        # Count isolates per (location, resistance category). crosstab counts
        # rows directly; pivot_table without a `values` column has nothing to
        # aggregate. The table is not stored in `data`, which would overwrite
        # the raw DataFrame loaded at the top of the notebook.
        location_counts = pd.crosstab(
            antibiotics_excluding_type_strains['Location'],
            antibiotics_excluding_type_strains[antibiotic],
        )
        # Perform Chi-square test
        chi2, p, dof, expected = chi2_contingency(location_counts)
        p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (significant bars in red)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.title('P-Values for Chi-Square Test (Excluding Amikacin)')
plt.tight_layout()

In [None]:
# Excluding the type strains (83 and 84) and Amikacin.
# NOTE(review): `resistance_data_excluding_type_strains` is not defined in the
# visible part of the notebook -- confirm where it comes from.
data_for_chi_square = resistance_data_excluding_type_strains.drop(columns=['Amikacin 30'])

# List of (antibiotic, p-value) pairs
p_values = []

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Iterate through the antibiotic columns (assumes 'Location' is the last column -- confirm)
for antibiotic in data_for_chi_square.columns[:-1]:
    # Count isolates per (location, resistance category). crosstab counts rows
    # directly; pivot_table without a `values` column has nothing to
    # aggregate. The table is not stored in `data`, which would overwrite the
    # raw DataFrame loaded at the top of the notebook.
    location_counts = pd.crosstab(data_for_chi_square['Location'], data_for_chi_square[antibiotic])
    # Perform Chi-square test
    chi2, p, dof, expected = chi2_contingency(location_counts)
    p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (significant bars in red)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.title('P-Values for Chi-Square Test (Excluding Amikacin)')
plt.tight_layout()

In [None]:
# Excluding the type strains (83 and 84) and Amikacin.
# NOTE(review): drop(index=[83, 84]) removes rows whose *index label* is 83 or
# 84; that matches the strain numbers only if the index is the strain number --
# confirm. `resistance_data` is also expected to be an isolate-level table here.
data_for_chi_square = resistance_data.drop(index=[83, 84], columns=['Amikacin 30'])

# List of (antibiotic, p-value) pairs
p_values = []

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Iterate through the antibiotic columns (assumes 'Location' is the last column -- confirm)
for antibiotic in data_for_chi_square.columns[:-1]:
    # Count isolates per (location, resistance category). crosstab counts rows
    # directly; pivot_table without a `values` column has nothing to
    # aggregate. The table is not stored in `data`, which would overwrite the
    # raw DataFrame loaded at the top of the notebook.
    location_counts = pd.crosstab(data_for_chi_square['Location'], data_for_chi_square[antibiotic])
    # Perform Chi-square test
    chi2, p, dof, expected = chi2_contingency(location_counts)
    p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (significant bars in red)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.tight_layout()

In [None]:
# Rows whose 'Strain' value is 83 or 84 (the type strains).
type_strains_rows = resistance_data[resistance_data['Strain'].isin([83, 84])]
type_strains_rows

In [None]:
# Identifying the rows corresponding to the type strains (83 and 84)
# NOTE(review): .loc selects by index *label*, so this returns the rows labelled
# 83 and 84 and raises KeyError if either label is absent; whether those labels
# coincide with the strain numbers depends on how resistance_data is indexed -- confirm.
type_strains_rows = resistance_data.loc[[83, 84]]
type_strains_rows

In [None]:
# Excluding the specified strains (77, 83, and 84) and Amikacin.
# NOTE(review): drop(index=[...]) removes rows by *index label*; that matches
# the strain numbers only if the index is the strain number -- confirm.
data_for_chi_square = resistance_data.drop(index=[77, 83, 84], columns=['Amikacin 30'])

# List of (antibiotic, p-value) pairs
p_values = []

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Iterate through the antibiotic columns (assumes 'Location' is the last column -- confirm)
for antibiotic in data_for_chi_square.columns[:-1]:
    # Count isolates per (location, resistance category). crosstab counts rows
    # directly; pivot_table without a `values` column has nothing to
    # aggregate. The table is not stored in `data`, which would overwrite the
    # raw DataFrame loaded at the top of the notebook.
    location_counts = pd.crosstab(data_for_chi_square['Location'], data_for_chi_square[antibiotic])
    # Perform Chi-square test
    chi2, p, dof, expected = chi2_contingency(location_counts)
    p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (significant bars in red)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.title('P-Values for Chi-Square Test (Excluding Amikacin)')
plt.tight_layout()

In [None]:
# Identifying the correct index values for the specified strains (77, 83, and 84):
# rows are located through the 'Strain' column and then dropped by their index labels
strains_to_exclude = [77, 83, 84]
index_to_exclude = resistance_data[resistance_data['Strain'].isin(strains_to_exclude)].index

# Excluding the specified strains and Amikacin
data_for_chi_square = resistance_data.drop(index=index_to_exclude, columns=['Amikacin 30'])

# Cut-off p-value for significance
p_value_cutoff = 0.05

# Columns that identify or group isolates; they must never be tested as antibiotics
metadata_columns = ['Strain', 'Location']

# Every column except the last two, minus the metadata columns wherever they sit
antibiotic_columns_to_test = [
    column for column in data_for_chi_square.columns[:-2] if column not in metadata_columns
]

# List to store p-values for each antibiotic
p_values = []

# Iterate through the antibiotics data (excluding Amikacin and specified strains)
for antibiotic in antibiotic_columns_to_test:
    # Location x resistance-category table of isolate counts.
    # Deliberately NOT named `data`, so the DataFrame loaded at the top of the
    # notebook is not overwritten by this loop.
    contingency_table = pd.crosstab(data_for_chi_square['Location'], data_for_chi_square[antibiotic])
    # Perform Chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    p_values.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_df = pd.DataFrame(p_values, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (red = below the cut-off, blue = otherwise)
plt.figure(figsize=(10, 6))
plt.bar(p_values_df['Antibiotic'], p_values_df['P-Value'], color=['red' if p < p_value_cutoff else 'blue' for p in p_values_df['P-Value']])
plt.axhline(y=p_value_cutoff, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend()
plt.title('P-Values for Chi-Square Test (Excluding Amikacin)')
plt.tight_layout()

In [None]:
# Loading the new dataset for Chi-square analysis.
# The path is relative to the notebook's working directory rather than an absolute
# '/content/...' location, so the cell does not depend on one specific machine layout.
chi_square_data_path = 'Enterobacter Chi-Square .csv'
chi_square_data = pd.read_csv(chi_square_data_path)

# Displaying the first few rows of the new dataset
chi_square_data.head()

In [None]:
# Read the Chi-square dataset from a path relative to the working directory
chi_square_data_path = 'Enterobacter Chi-Square .csv'
chi_square_data = pd.read_csv(filepath_or_buffer=chi_square_data_path)

# Preview the first five rows
chi_square_data.head(5)

In [None]:
# Cut-off p-value for significance
p_value_cutoff_chi_square = 0.05

# List to store p-values for each antibiotic
p_values_chi_square = []

# Iterate through the antibiotics data (columns from position 4 onwards)
for antibiotic in chi_square_data.columns[4:]:
    # 'Location' is the grouping variable; skip it if it falls inside the slice
    if antibiotic == 'Location':
        continue
    # Location x resistance-category table of isolate counts.
    # Deliberately NOT named `data`, so the DataFrame loaded at the top of the
    # notebook is not overwritten by this loop.
    contingency_table = pd.crosstab(chi_square_data['Location'], chi_square_data[antibiotic])
    # Perform Chi-square test
    chi2, p, dof, expected = chi2_contingency(contingency_table)
    p_values_chi_square.append((antibiotic, p))

# Create a DataFrame to store the p-values
p_values_chi_square_df = pd.DataFrame(p_values_chi_square, columns=['Antibiotic', 'P-Value'])

# Plotting the p-values with the cut-off line (red = below the cut-off, blue = otherwise)
plt.figure(figsize=(10, 6))
plt.bar(p_values_chi_square_df['Antibiotic'], p_values_chi_square_df['P-Value'], color=['red' if p < p_value_cutoff_chi_square else 'blue' for p in p_values_chi_square_df['P-Value']])
plt.axhline(y=p_value_cutoff_chi_square, color='green', linestyle='--', label=f'P-Value Cut-off ({p_value_cutoff_chi_square})')
plt.ylabel('P-Value')
plt.xticks(rotation=90)
plt.legend(loc='upper right')
# A real axis label is used instead of free-floating text below the axes:
# tight_layout reserves room for axis labels, so this one cannot be clipped
plt.xlabel("Antibiotics")

plt.tight_layout()

In [None]:
# Percentage of isolates in each Tigecyclin category, per geographical location
# (rows = Location, columns = category, every row sums to 100)
contingency_table_tigecyclin_new = (
    chi_square_data.groupby('Location')['Tigecyclin 15']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    * 100
)

# Plotting the stacked bar graph (without title).
# The axes are created explicitly and handed to pandas: DataFrame.plot opens its own
# figure when no `ax` is given, which would leave the 12x6 figure empty.
fig, ax = plt.subplots(figsize=(12, 6))
contingency_table_tigecyclin_new.plot(kind='bar', stacked=True, ax=ax)
ax.set_ylabel('Percentage')
ax.legend(loc='upper right')
ax.set_title('')
fig.tight_layout()
plt.show()


In [None]:
import matplotlib.pyplot as plt

# Percentage of isolates in each Tigecyclin category, per geographical location
# (rows = Location, columns = category, every row sums to 100)
contingency_table_tigecyclin_new = (
    chi_square_data.groupby('Location')['Tigecyclin 15']
    .value_counts(normalize=True)
    .unstack()
    .fillna(0)
    * 100
)

# Define colors for 'R' and 'S'; any other category found in the data gets a neutral
# grey instead of raising a KeyError
colors = {'R': 'coral', 'S': 'lightblue'}
fallback_color = 'lightgray'
bar_colors = [colors.get(column, fallback_color) for column in contingency_table_tigecyclin_new.columns]

# Plotting the stacked bar graph (without title).
# The axes are created explicitly and handed to pandas: DataFrame.plot opens its own
# figure when no `ax` is given, which would leave the 12x6 figure empty.
fig, ax = plt.subplots(figsize=(12, 6))
contingency_table_tigecyclin_new.plot(kind='bar', stacked=True, color=bar_colors, ax=ax)
ax.set_ylabel('Percentage')
ax.legend(loc='upper right')
ax.set_title('')
fig.tight_layout()
plt.show()


In [None]:
# Read the updated SIR + metadata table used for the correlation analysis
correlation_data_path = '12 Abs SIR with metadata 1-85 updated.csv'
correlation_data = pd.read_csv(filepath_or_buffer=correlation_data_path)

# Preview the first five rows
correlation_data.head(5)

In [None]:
# seaborn draws the heatmap at the end of this cell, so make sure it is in scope here
import seaborn as sns

# Mapping resistance profile to numerical values (R=2, I=1, S=0)
resistance_mapping = {'R': 2, 'I': 1, 'S': 0}
antibiotics_columns = correlation_data.columns[6:]

# Excluding Amikacin if all outcomes are 'S'
if correlation_data['Amikacin 30'].nunique() == 1 and correlation_data['Amikacin 30'].iloc[0] == 'S':
    antibiotics_columns = antibiotics_columns.drop('Amikacin 30')

# Applying the mapping to the antibiotics columns.
# Columns that are already numeric are left alone: `.map` turns every value that is
# not a key of the mapping into NaN, so mapping a column twice would wipe it out.
for col in antibiotics_columns:
    if not pd.api.types.is_numeric_dtype(correlation_data[col]):
        correlation_data[col] = correlation_data[col].map(resistance_mapping)

# Calculating the correlation matrix
correlation_matrix = correlation_data[antibiotics_columns].corr()

# Plotting the correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap for Antibiotics Resistance')
plt.show()

In [None]:
# seaborn draws the heatmap at the end of this cell, so make sure it is in scope here
import seaborn as sns

# Excluding Amikacin if all outcomes are 'S'
if correlation_data['Amikacin 30'].nunique() == 1 and correlation_data['Amikacin 30'].iloc[0] == 'S':
    antibiotics_columns = [col for col in antibiotics_columns if col != 'Amikacin 30']

# Applying the mapping to the antibiotics columns.
# Columns that are already numeric are left alone: `.map` turns every value that is
# not a key of the mapping into NaN, so re-mapping columns that an earlier cell has
# already converted would replace all of their values with NaN.
for col in antibiotics_columns:
    if not pd.api.types.is_numeric_dtype(correlation_data[col]):
        correlation_data[col] = correlation_data[col].map(resistance_mapping)

# Calculating the correlation matrix
correlation_matrix = correlation_data[antibiotics_columns].corr()

# Plotting the correlation heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Correlation Heatmap for Antibiotics Resistance')
plt.show()

In [None]:
# Importing Seaborn library
import seaborn as sns

# Annotated heatmap of the pairwise correlation matrix
heatmap_options = dict(annot=True, cmap='coolwarm', linewidths=.5)

plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, **heatmap_options)
plt.title('Correlation Heatmap for Antibiotics Resistance')
plt.show()

In [None]:
from scipy.stats import pearsonr

# Function to calculate p-value for each correlation
def calculate_pvalues(df):
    """Return the matrix of Pearson-correlation p-values for every pair of columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to compare pairwise.

    Returns
    -------
    pandas.DataFrame
        Square float frame whose index AND columns are ``df.columns``; entry
        ``[a, b]`` is the two-sided p-value reported by ``scipy.stats.pearsonr``
        for columns ``a`` and ``b``. The frame therefore has the same labels as
        ``df.corr()`` and can be compared against it cell by cell.
    """
    names = df.columns
    # Labelled by column name on both axes (not by the observation index of `df`),
    # otherwise the result cannot line up with the correlation matrix.
    pvalues = pd.DataFrame(index=names, columns=names, dtype=float)
    for row_name in names:
        for col_name in names:
            # pearsonr returns (statistic, p-value); only the p-value is kept
            pvalues.loc[row_name, col_name] = pearsonr(df[row_name], df[col_name])[1]
    return pvalues

# Pairwise p-values for the numeric antibiotic columns
antibiotic_scores = correlation_data[antibiotics_columns]
p_values_matrix = calculate_pvalues(antibiotic_scores)

# Threshold below which a correlation counts as significant
significance_level = 0.05

# True where the p-value is below the threshold
significant_mask = p_values_matrix < significance_level

# Heatmap of the correlations; seaborn hides every cell where `mask` is True,
# so the inverted mask blanks out the non-significant cells
hidden_cells = ~significant_mask
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, mask=hidden_cells)
plt.title('Correlation Heatmap for Antibiotics Resistance (Significant Correlations Highlighted)')
plt.show()

In [None]:
# Function to calculate p-value for each correlation (ensuring alignment with correlation matrix)
def calculate_pvalues_aligned(df):
    """Return Pearson-correlation p-values labelled like ``df.corr()``.

    Parameters
    ----------
    df : pandas.DataFrame
        Numeric columns to compare pairwise.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed by ``df.columns`` on both axes; the entry in
        row ``col2`` / column ``col1`` is the two-sided p-value reported by
        ``scipy.stats.pearsonr`` for that pair of columns.
    """
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for col1 in df.columns:
        for col2 in df.columns:
            # Single .loc assignment instead of chained `pvalues[col1][col2] = ...`:
            # with pandas Copy-on-Write the chained form writes to a temporary
            # object and leaves `pvalues` untouched.
            pvalues.loc[col2, col1] = pearsonr(df[col1], df[col2])[1]
    return pvalues

# Pairwise p-values, labelled by antibiotic on both axes
antibiotic_scores = correlation_data[antibiotics_columns]
p_values_matrix_aligned = calculate_pvalues_aligned(antibiotic_scores)

# True where the p-value is below the significance level
significant_mask_aligned = p_values_matrix_aligned < significance_level

# Heatmap of the correlations; seaborn hides every cell where `mask` is True,
# so the inverted mask blanks out the non-significant cells
hidden_cells_aligned = ~significant_mask_aligned
plt.figure(figsize=(12, 10))
sns.heatmap(correlation_matrix, annot=True, cmap='coolwarm', linewidths=.5, mask=hidden_cells_aligned)
plt.title('Correlation Heatmap for Antibiotics Resistance (Significant Correlations Highlighted)')
plt.show()

In [None]:
from scipy.stats import chi2_contingency
import numpy as np

# Function to calculate Cramer's V for a given contingency table
def cramers_v(contingency_table):
    """Bias-corrected Cramér's V for a contingency table.

    Parameters
    ----------
    contingency_table : pandas.DataFrame
        Table of observed counts (e.g. the output of ``pd.crosstab``).

    Returns
    -------
    float
        ``sqrt(phi2_corr / min(k_corr - 1, r_corr - 1))``, where ``phi2_corr``,
        ``r_corr`` and ``k_corr`` are the values computed in the body below.
        ``nan`` is returned for degenerate tables (fewer than two observations,
        or only one row/column) for which that ratio would be a division by zero.

    Notes
    -----
    The chi-square statistic comes from ``chi2_contingency`` with its default
    arguments.
    """
    chi2, _, _, _ = chi2_contingency(contingency_table)
    n = contingency_table.sum().sum()
    r, k = contingency_table.shape

    # The corrections below divide by (n - 1)
    if n <= 1:
        return np.nan

    phi2 = chi2 / n
    phi2_corr = max(0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    r_corr = r - ((r - 1) ** 2) / (n - 1)
    k_corr = k - ((k - 1) ** 2) / (n - 1)

    denominator = min((k_corr - 1), (r_corr - 1))
    # A single row or column leaves nothing to associate: report "undefined"
    # explicitly instead of evaluating 0 / 0 and emitting a RuntimeWarning
    if denominator <= 0:
        return np.nan
    return np.sqrt(phi2_corr / denominator)

# Calculating Cramer's V for each pair of antibiotics
cramers_v_matrix = pd.DataFrame(index=antibiotics_columns, columns=antibiotics_columns, dtype=float)
for col1 in antibiotics_columns:
    for col2 in antibiotics_columns:
        contingency_table = pd.crosstab(correlation_data[col1], correlation_data[col2])
        # Single .loc assignment instead of chained `matrix[col1][col2] = ...`:
        # with pandas Copy-on-Write the chained form writes to a temporary object
        # and the matrix would stay empty.
        cramers_v_matrix.loc[col2, col1] = cramers_v(contingency_table)

# Plotting the Cramer's V heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(cramers_v_matrix.astype(float), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Cramer\'s V Heatmap for Antibiotics Resistance')
plt.show()

In [None]:
# Setting the diagonal values to 1 (perfect association for the same antibiotic).
# The diagonal is written into an explicit copy and the frame is rebuilt from it:
# filling `cramers_v_matrix.values` in place only works while `.values` happens to be
# a writable view of the frame, which pandas does not guarantee.
cramers_v_values = cramers_v_matrix.to_numpy(dtype=float, copy=True)
np.fill_diagonal(cramers_v_values, 1)
cramers_v_matrix = pd.DataFrame(
    cramers_v_values,
    index=cramers_v_matrix.index,
    columns=cramers_v_matrix.columns,
)

# Plotting the corrected Cramer's V heatmap
plt.figure(figsize=(12, 10))
sns.heatmap(cramers_v_matrix.astype(float), annot=True, cmap='coolwarm', linewidths=.5)
plt.title('Cramer\'s V Heatmap for Antibiotics Resistance')
plt.show()

In [None]:
# Function to calculate p-values for Cramer's V (aligned with Cramer's V matrix)
def calculate_pvalues_cramers_v(df):
    """Return chi-square p-values for every pair of columns of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Categorical (or discretely coded) columns to cross-tabulate pairwise.

    Returns
    -------
    pandas.DataFrame
        Square float frame indexed by ``df.columns`` on both axes; each entry is
        the p-value returned by ``chi2_contingency`` for the cross-tabulation of
        the two columns.
    """
    pvalues = pd.DataFrame(index=df.columns, columns=df.columns, dtype=float)
    for col1 in df.columns:
        for col2 in df.columns:
            # Cross-tabulate the columns of the frame that was passed in, so the
            # function does not depend on a notebook-level variable
            contingency_table = pd.crosstab(df[col1], df[col2])
            # Single .loc assignment instead of chained `pvalues[col1][col2] = ...`,
            # which does not update the frame under pandas Copy-on-Write
            pvalues.loc[col2, col1] = chi2_contingency(contingency_table)[1]
    return pvalues

# Calculating p-values for the Cramer's V matrix (aligned)
p_values_matrix_cramers_v = calculate_pvalues_cramers_v(correlation_data[antibiotics_columns])

# Creating a mask for significant correlations (aligned with Cramer's V)
below_threshold = p_values_matrix_cramers_v < significance_level

# Setting diagonal values to True so the diagonal is always shown.
# The diagonal is written into an explicit boolean copy and the mask is rebuilt from it:
# filling `mask.values` in place only works while `.values` happens to be a writable
# view of the frame, which pandas does not guarantee.
mask_values = below_threshold.to_numpy(dtype=bool, copy=True)
np.fill_diagonal(mask_values, True)
significant_mask_cramers_v = pd.DataFrame(
    mask_values,
    index=below_threshold.index,
    columns=below_threshold.columns,
)

# Plotting the Cramer's V heatmap with significant correlations highlighted (aligned);
# seaborn hides the cells where `mask` is True, hence the inversion
plt.figure(figsize=(12, 10))
sns.heatmap(cramers_v_matrix.astype(float), annot=True, cmap='coolwarm', linewidths=.5, mask=~significant_mask_cramers_v)
plt.title('Cramer\'s V Heatmap for Antibiotics Resistance (Significant Correlations Highlighted)')
plt.show()

In [None]:
# Keep Cramer's V only where the mask is True; every other cell becomes NaN
significant_cramers_v_matrix = cramers_v_matrix.where(significant_mask_cramers_v)

# Heatmap of the remaining values, drawn without a colour bar
significant_values = significant_cramers_v_matrix.astype(float)
plt.figure(figsize=(12, 10))
sns.heatmap(significant_values, annot=True, cmap='coolwarm', linewidths=.5, cbar=False)
plt.title("Cramer's V Heatmap for Antibiotics Resistance (Only Significant Correlations)")
plt.show()