In [None]:
import pandas as pd

# Location of the input CSV, relative to the notebook's working directory.
file_path = '12 Abs SIR with metadata 1-85 updated.csv'

# Parse the CSV into the DataFrame that the following cells analyse.
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the top five rows as a quick check that the file parsed sensibly.
data.head(n=5)

In [None]:
# Treat every column from position 6 onward as an antibiotic result column.
# NOTE(review): a later cell slices from position 5 in order to include
# "Amikacin 30"; confirm whether starting at 6 here is intentional.
antibiotics_columns = data.columns[6:]

# Success rate of an antibiotic = share of its non-missing entries equal to 'S'
# (0 when 'S' never occurs in that column).
success_rate = []
for antibiotic in antibiotics_columns:
    category_shares = data[antibiotic].value_counts(normalize=True)
    success_rate.append(category_shares.get('S', 0))

# Tabulate the rates, highest first, for display and for the plots that follow.
success_rate_df = pd.DataFrame(
    {'Antibiotic': antibiotics_columns, 'Success Rate': success_rate}
).sort_values(by='Success Rate', ascending=False)
success_rate_df

In [None]:
import seaborn as sns
import matplotlib.pyplot as plt

# Horizontal bar chart: one bar per antibiotic, bar length = success rate.
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(data=success_rate_df, x='Success Rate', y='Antibiotic', palette='viridis', ax=ax)
ax.set_title('Success Rate of Different Antibiotics')
ax.set_xlabel('Success Rate')
ax.set_ylabel('Antibiotic')
plt.show()

In [None]:
# Show all column labels, in order, to find where the "Amikacin 30" column sits
# relative to the slice used to select the antibiotic columns.
data.columns

In [None]:
# Take every column from position 5 onward as an antibiotic result column,
# which brings "Amikacin 30" into the selection.
antibiotics_columns = data.columns[5:]


def _share_susceptible(column):
    """Return the fraction of non-missing values in `column` equal to 'S' (0 if none)."""
    return column.value_counts(normalize=True).get('S', 0)


# One success rate per antibiotic, in the same order as antibiotics_columns.
success_rate = [_share_susceptible(data[name]) for name in antibiotics_columns]

# Assemble the summary table and order it from highest to lowest success rate.
success_rate_df = pd.DataFrame({'Antibiotic': antibiotics_columns, 'Success Rate': success_rate})
success_rate_df = success_rate_df.sort_values('Success Rate', ascending=False)
success_rate_df

In [None]:
# Same horizontal bar chart as before, now drawn from the table that includes
# "Amikacin 30".
fig, ax = plt.subplots(figsize=(10, 6))
sns.barplot(x='Success Rate', y='Antibiotic', data=success_rate_df, palette='viridis', ax=ax)
ax.set(
    title='Success Rate of Different Antibiotics (Including Amikacin 30)',
    xlabel='Success Rate',
    ylabel='Antibiotic',
)
plt.show()

In [None]:
# Bar chart of the success rates with every bar annotated with its value in percent.
plt.figure(figsize=(10, 6))
barplot = sns.barplot(x='Success Rate', y='Antibiotic', data=success_rate_df, palette='viridis')

# Label placement, in x-axis units (success rate as a fraction).
min_width_for_inside_label = 0.1  # bars shorter than this get the label outside
label_offset = 0.01               # gap between the bar end and the label

for bar in barplot.patches:
    rate = bar.get_width()
    y_center = bar.get_y() + bar.get_height() / 2
    if rate >= min_width_for_inside_label:
        # Enough room: right-align the label just inside the end of the bar.
        x_text, h_align = rate - label_offset, 'right'
    else:
        # A success rate of 0 is possible (antibiotics with no 'S' default to 0).
        # Anchoring inside such a bar would push the label left of the axis, so
        # it is placed just past the bar end instead.
        x_text, h_align = rate + label_offset, 'left'
    # va='center' centres the text on the bar; matplotlib's default baseline
    # alignment would draw it visibly off-centre.
    barplot.text(x_text, y_center, '{:1.2f}%'.format(rate * 100), ha=h_align, va='center')

plt.xlabel('Success Rate')
plt.ylabel('Antibiotic')
plt.show()

In [None]:
from wordcloud import WordCloud

# Keep only the text before the first space of each antibiotic label, which
# drops the dosage part of the label.
antibiotic_names_without_dosages = [
    label.partition(' ')[0] for label in success_rate_df['Antibiotic']
]

# Pair each shortened name with its success rate; the rate acts as the word's
# weight in the cloud.
antibiotics_success_rate_cleaned = dict(
    zip(antibiotic_names_without_dosages, success_rate_df['Success Rate'])
)

# Build the cloud on a white 800x400 canvas from the name -> rate weights.
wordcloud_cleaned = WordCloud(background_color='white', width=800, height=400)
wordcloud_cleaned.generate_from_frequencies(antibiotics_success_rate_cleaned)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_cleaned, interpolation='bilinear')
ax.axis('off')

plt.show()

In [None]:
!pip install -q wordcloud

In [None]:
# Path of the CSV listing the strains that have genome sequencing.
# The spelling and the space before ".csv" are kept verbatim; presumably they
# match the file name on disk.
file_path_strains = '12 antitbiotics - strains with genome sequencing .csv'

# Decode the file as ISO-8859-1 instead of pandas' default UTF-8.
# NOTE(review): this notebook re-reads the same file with this encoding, which
# suggests the default decoding fails on it; confirm the file's real encoding.
data_strains = pd.read_csv(file_path_strains, encoding='ISO-8859-1')

# Display the first few rows of the dataset
data_strains.head()

In [None]:
# Re-read the strains CSV, decoding it as ISO-8859-1 rather than the default encoding.
data_strains = pd.read_csv(filepath_or_buffer=file_path_strains, encoding='ISO-8859-1')

# Preview the top five rows of the freshly loaded table.
data_strains.head(n=5)

In [None]:
# Numeric score assigned to each S/I/R call (S=0, I=1, R=2).
resistance_mapping = {'S': 0, 'I': 1, 'R': 2}

# Antibiotic columns are selected by position (everything from index 3 onward).
# Columns that this notebook derives and appends to the frame are removed from
# the selection, so re-running the cell does not treat them as antibiotics.
derived_columns = ['Average Resistance Score', 'Species_Cleaned']
antibiotic_columns_strains = data_strains.columns[3:].drop(derived_columns, errors='ignore')

# Encode each antibiotic column in place. A column that is already numeric was
# encoded by an earlier run of this cell; mapping it again would turn every
# score into NaN, so such columns are left untouched.
for col in antibiotic_columns_strains:
    if not pd.api.types.is_numeric_dtype(data_strains[col]):
        data_strains[col] = data_strains[col].map(resistance_mapping)

# Row-wise mean of the encoded antibiotic columns = average resistance score
# of that strain (missing values are skipped by mean()).
data_strains['Average Resistance Score'] = data_strains[antibiotic_columns_strains].mean(axis=1)

# Strain name -> average resistance score. If a name occurs on several rows,
# the last row's score is the one kept.
strain_resistance_dict = dict(zip(data_strains['Species'], data_strains['Average Resistance Score']))

# Build the word cloud, weighting each strain name by its average resistance score.
# Every option is passed exactly once: repeating a keyword argument in a call
# is a SyntaxError, which prevents the whole cell from running.
# NOTE(review): font_path=None does not select an italic font, so nothing here
# renders the names in italics despite the title below; pass the path of an
# italic font file via font_path if italics are required.
wordcloud_strains = WordCloud(
    width=800,
    height=400,
    background_color='white',
    font_step=1,
    font_path=None,
    regexp=None,
    collocations=False,
    colormap='viridis',
    normalize_plurals=False,
    contour_width=0,
    contour_color='black',
    repeat=False,
    include_numbers=False,
    min_font_size=4,
    max_font_size=None,
    max_words=200,
    min_word_length=0,
    collocation_threshold=30,
    random_state=None,
    prefer_horizontal=0.9,
    mask=None,
    scale=1,
    color_func=None,
    mode='RGB',
    relative_scaling='auto',
    stopwords=None,
)
wordcloud_strains.generate_from_frequencies(strain_resistance_dict)

# Display the word cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_strains, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Strain Names (in Italics)')
plt.show()

In [None]:
# Word cloud options, each given once, collected in one place for readability.
wordcloud_settings = dict(
    width=800,
    height=400,
    background_color='white',
    font_step=1,
    collocations=False,
    colormap='viridis',
    normalize_plurals=False,
    contour_width=0,
    contour_color='black',
    repeat=False,
    include_numbers=False,
    min_font_size=4,
    max_font_size=None,
    max_words=200,
    min_word_length=0,
    collocation_threshold=30,
    random_state=None,
    prefer_horizontal=0.9,
    mask=None,
    scale=1,
    color_func=None,
    mode='RGB',
    relative_scaling='auto',
    stopwords=None,
)

# Build the cloud from the strain name -> average resistance score weights.
wordcloud_strains = WordCloud(**wordcloud_settings)
wordcloud_strains.generate_from_frequencies(strain_resistance_dict)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_strains, interpolation='bilinear')
ax.axis('off')
ax.set_title('Word Cloud of Strain Names')
plt.show()

In [None]:
# Table of strain names with their average resistance score, highest score first.
score_columns = ['Species', 'Average Resistance Score']
strain_scores = data_strains[score_columns]
strain_scores.sort_values(by='Average Resistance Score', ascending=False)

In [None]:
# Remove every run of characters other than ASCII letters, digits, whitespace
# and '.' from the strain names.
# regex=True is required: pandas >= 2.0 treats the pattern as a literal string
# by default, which would leave the names unchanged. The raw string keeps '\s'
# from being read as an (invalid) Python string escape.
data_strains['Species_Cleaned'] = data_strains['Species'].str.replace(
    r'[^a-zA-Z0-9\s.]+', '', regex=True
)

# Cleaned strain name -> average resistance score. If a cleaned name occurs on
# several rows, the last row's score is the one kept.
strain_resistance_dict_cleaned = dict(
    zip(data_strains['Species_Cleaned'], data_strains['Average Resistance Score'])
)

# Generate the word cloud with cleaned strain names
wordcloud_strains_cleaned = WordCloud(
    width=800,
    height=400,
    background_color='white',
    collocations=False,
    colormap='viridis',
    normalize_plurals=False,
    contour_width=0,
    contour_color='black',
    repeat=False,
    include_numbers=False,
    min_font_size=4,
    max_font_size=None,
    max_words=200,
    min_word_length=0,
    collocation_threshold=30,
    random_state=None,
    prefer_horizontal=0.9,
    mask=None,
    scale=1,
    color_func=None,
    mode='RGB',
    relative_scaling='auto',
    stopwords=None,
)
wordcloud_strains_cleaned.generate_from_frequencies(strain_resistance_dict_cleaned)

# Display the word cloud as an image with the axes hidden.
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud_strains_cleaned, interpolation='bilinear')
plt.axis('off')
plt.title('Word Cloud of Cleaned Strain Names')
plt.show()

In [None]:
# Collapse repeated strain names: one row per cleaned name, holding the mean of
# that name's average resistance scores.
unique_strain_resistance = (
    data_strains
    .groupby('Species_Cleaned')['Average Resistance Score']
    .mean()
    .reset_index()
)

# Unique cleaned strain name -> mean average resistance score.
unique_strain_resistance_dict = dict(zip(
    unique_strain_resistance['Species_Cleaned'],
    unique_strain_resistance['Average Resistance Score'],
))

# Word cloud options, each given once, collected in one place for readability.
unique_wordcloud_settings = dict(
    width=800,
    height=400,
    background_color='white',
    collocations=False,
    colormap='viridis',
    normalize_plurals=False,
    contour_width=0,
    contour_color='black',
    repeat=False,
    include_numbers=False,
    min_font_size=4,
    max_font_size=None,
    max_words=200,
    min_word_length=0,
    collocation_threshold=30,
    random_state=None,
    prefer_horizontal=0.9,
    mask=None,
    scale=1,
    color_func=None,
    mode='RGB',
    relative_scaling='auto',
    stopwords=None,
)

# Build the cloud from the per-strain weights.
wordcloud_unique_strains = WordCloud(**unique_wordcloud_settings)
wordcloud_unique_strains.generate_from_frequencies(unique_strain_resistance_dict)

# Render the cloud as an image with the axes hidden.
fig, ax = plt.subplots(figsize=(10, 5))
ax.imshow(wordcloud_unique_strains, interpolation='bilinear')
ax.axis('off')

plt.show()