# T2 bacteria study

## Reload function

In [None]:
%load_ext autoreload

In [None]:
%autoreload 2

## Imports

In [None]:
import pandas as pd
from typing import List, Dict, Set
from datetime import datetime
from models import Isolate, Sample, BC_Sample, T2_Sample, Episode, Patient
import loader as load
import episodeloader as epi
from Displayer import Displayer
from Plotter import Plotter
import re
import utils as utils

import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt 
import scipy.stats as stats 
from matplotlib.ticker import PercentFormatter




## Import data from files

In [None]:
df_CC = load.from_excel('T2bakt_CC.xlsx')

In [None]:
df_AP = load.from_excel('T2bakt_AP.xlsx')

In [None]:
df_other = load.from_excel('T2bakt_other.xlsx')

In [None]:
df_AP_times = load.from_excel('T2_bakt_AP_times.xlsx')

In [None]:
df_CC_times = load.from_excel('T2_bakt_CC_times.xlsx')


In [None]:
df_id_times = load.from_excel('positive_bc_timestamps.xlsx')

### Main dataframes

In [None]:
dfs_toload = df_AP, df_CC, df_other

### Dataframes with times

In [None]:
dfs_times = df_AP_times, df_CC_times

### Load dataframe into patient objects

In [None]:
patients = load.patients_list(dfs_toload, dfs_times, df_id_times)

In [None]:
### config time frames: 

# Passed to epi.create_episodes as bc_window; also used further down as the
# +/- bound of the time-bin axis, where times are expressed in hours.
bc_window = 72
# Passed to epi.create_episodes as other_sample_window.
# presumably hours, like bc_window — TODO confirm
other_sample_window = 72
# Passed to epi.exclude_recurrent_episodes as its second argument.
cooldown_days = 7

In [None]:
epi.create_episodes(patients, bc_window = bc_window, other_sample_window = other_sample_window, direction="both")

## Exclusions (returns patients or episodes that are excluded)

In [None]:
excluded_recurrent_episodes = epi.exclude_recurrent_episodes(patients,cooldown_days)

In [None]:
excluded_invalidT2_episodes = epi.exclude_invalid_episodes(patients)

In [None]:
empty_excluded = epi.exclude_empty_episodes(patients)

In [None]:
patients_without_valid_episodes = epi.exclude_patients(patients)

## Create Displayer for tabular data

In [None]:
displayer = Displayer(patients)

In [None]:
plotter = displayer.get_plotter()

In [None]:
displayer.tat_df()

## Results

# Sweep over window sizes: for i*24 hours (i = 0..14) reload the patients,
# rebuild and filter the episodes, and show the "BC_IN_PANEL" classification.
# NOTE(review): this rebinds the notebook-level `patients`, `displayer` and the
# excluded_* variables, so cells run afterwards see the last iteration's state.
i = 0
cool_down = 7
for i in range(0,15):
    print(i*24)
    # NOTE(review): df_id_times is not passed here, unlike the main
    # load.patients_list call earlier in the notebook — confirm this is intended.
    patients = load.patients_list(dfs_toload, dfs_times)
    epi.create_episodes(patients, bc_window = (i * 24), other_sample_window = (i*24), direction="both")
    excluded_recurrent_episodes = epi.exclude_recurrent_episodes(patients, cool_down)
    excluded_invalidT2_episodes = epi.exclude_invalid_episodes(patients)
    empty_excluded = epi.exclude_empty_episodes(patients)
    patients_without_valid_episodes = epi.exclude_patients(patients)
    displayer = Displayer(patients)
    displayer.episode_classification("BC_IN_PANEL") 
    # This increment does not change which values the for-loop visits (the
    # loop rebinds i on every pass); it only shifts the value tested below.
    i += 1
    # From loop index 3 onwards cool_down becomes 2 * (index + 1); the new
    # value is first used by the following iteration.
    if i>3:
        cool_down = i*2
    



In [None]:
displayer.display_most_common_bacteria_in_episodes(is_t2included = False)

In [None]:
displayer.get_mean_time_between_samples() #between t2 and bc

In [None]:
displayer.display_aggregate_data() ### Demographic data

In [None]:
displayer.get_other_sample_locales(group = "all")

In [None]:
displayer.get_other_sample_locales(group = "t2neg_bcpos")

In [None]:
displayer.get_other_sample_locales(group = "t2pos_bcneg")

In [None]:
displayer.display_all_patients()

In [None]:
displayer.display_specific_isolates("Pseudomonas aeruginosa")

In [None]:
displayer.count_polymicrobials("bc")

In [None]:
displayer.calculate_sensitivity_specificity()

In [None]:
displayer.positivity_count() 

In [None]:
displayer.calculate_sensitivity_specificity_bacteria()


In [None]:
# NOTE(review): ad-hoc ratio built from hard-coded counts; where the numbers
# come from is not visible in this notebook — presumably copied from an
# earlier output. Confirm, or derive them from `displayer`.
52/(3297+41+29+23)

In [None]:
displayer.display_specific_isolates(isolate_name = "Staphylococcus aureus")

In [None]:
displayer.episode_classification("ALL_BC") 

In [None]:
displayer.episode_classification("BC_IN_PANEL") 

In [None]:
df_poly_out_of_panel= displayer.polymicrobial_not_only_panel()

In [None]:
df_poly_out_of_panel.to_excel("polymicrobial_episodes.xlsx", index=False)

In [None]:
displayer.discordant_results("BC_IN_PANEL", "T2_POS_COMPARISON_NEG")

In [None]:
displayer.tat() ## Get turnaround times

In [None]:
displayer.discordant_check_other_samples() ## Shows cases where T2 is positive, BC is negative, and the other samples share at least one isolate with the T2 sample.

In [None]:
# Display every episode. The episodes are read straight from the displayer so
# this cell does not depend on the `episodes` variable, which is only assigned
# in a later cell (a top-to-bottom run would otherwise raise NameError here).
for episode in displayer.episodes.values():
    episode.display()

## Difference in concordance/discordance depending on time

In [None]:
samples= displayer.samples

In [None]:
episodes = displayer.episodes.values()


In [None]:
# For every blood culture (BC) in every episode record
#   * whether its in-panel isolates equal the episode's T2 isolates, and
#   * the absolute time gap between the BC and the T2 sample, in hours.
concordance, time_diff = [], []

for episode in episodes:
    t2_of_episode = episode.t2_sample
    for bc_sample in episode.bc_samples:
        gap = bc_sample.sample_date - t2_of_episode.sample_date
        matches_t2 = bc_sample.get_t2panel_isolates() == t2_of_episode.isolates
        concordance.append(matches_t2)
        time_diff.append(abs(gap).total_seconds() / 3600)

In [None]:
import pandas as pd

# Build one summary row per episode: its discordance type relative to the
# in-panel blood culture (BC) result, the distinct isolate names found in the
# T2 sample, in the BC samples and in the other samples, and the number of
# BC / other samples in the episode.

# Display label for the two discordant classifications; any other
# classification is reported as "other".
_episode_type_labels = {
    "T2_POS_COMPARISON_NEG": "T2 pos, BC neg",
    "T2_NEG_COMPARISON_POS": "T2 neg, BC pos",
}


def _joined_isolate_names(samples):
    """Return the distinct isolate names found in *samples* as one string.

    Names are de-duplicated and sorted before joining with ", ". Sorting
    keeps the exported tables identical between runs; joining a bare set
    would give an arbitrary order that can change from run to run.
    """
    names = {isolate.name for sample in samples for isolate in sample.isolates}
    return ', '.join(sorted(names))


episode_types = []
t2_isolates = []
bc_isolates = []
bc_numbers = []
other_numbers = []
other_isolates = []

for episode in episodes:
    # Classify once per episode instead of once per comparison.
    classification = episode.classify(compare_with="BC_IN_PANEL")
    episode_types.append(_episode_type_labels.get(classification, "other"))

    t2_isolates.append(_joined_isolate_names([episode.t2_sample]))
    bc_isolates.append(_joined_isolate_names(episode.bc_samples))
    other_isolates.append(_joined_isolate_names(episode.other_samples))
    bc_numbers.append(len(episode.bc_samples))
    other_numbers.append(len(episode.other_samples))

# Create DataFrame
df_table = pd.DataFrame({
    "Episode type": episode_types,
    "T2 isolates": t2_isolates,
    "BC samples": bc_numbers,
    "BC isolates": bc_isolates,
    "Other samples": other_numbers,
    "Other isolates": other_isolates,
})


In [None]:
# Swedish laboratory names mapped to the English names used in the exported
# tables. Extend the mapping as further names turn up.
replacements = {
    "Jästsvamp": "Yeast, unspecified",
    "Betahemolyserande streptokock grupp G": "Group G streptococci",
    "Koagulas-negativ stafylokock": "Coagulase negative staphylococci",
    "Gramnegativ stav": "Gram-negative rod",
    "Betahemolyserande streptokock grupp A": "Group A streptococci",
    "Blandflora": "Mixed flora",
}

# Columns holding comma-separated isolate names.
columns_to_update = ['T2 isolates', 'BC isolates', 'Other isolates']

# regex=True makes each key match inside a longer string, so a name is
# translated even when it is one of several names in the same cell.
df_table[columns_to_update] = df_table[columns_to_update].apply(
    lambda isolate_column: isolate_column.replace(replacements, regex=True)
)


In [None]:
df_table[df_table["Episode type"] == "T2 pos, BC neg"].to_excel("T2_pos_BC_neg.xlsx", index=False)


In [None]:
df_table[df_table["Episode type"] == "T2 neg, BC pos"].to_excel("T2_neg_BC_pos.xlsx", index=False)  

In [None]:
# For every blood culture (BC) record its signed offset from the episode's T2
# sample in hours (BC time minus T2 time) and how its in-panel isolates compare
# with the T2 isolates. Cases where both are positive but differ are labelled
# 'discordant_unclassified' and printed for manual inspection.
sample_time, category = [], []

for episode in episodes:
    t2_of_episode = episode.t2_sample
    for bc_sample in episode.bc_samples:
        offset = bc_sample.sample_date - t2_of_episode.sample_date
        sample_time.append(offset.total_seconds() / 3600)

        bc_panel = bc_sample.get_t2panel_isolates()
        t2_found = t2_of_episode.isolates

        if bc_panel == t2_found:
            category.append('concordant')
            continue
        if bc_panel and not t2_found:
            category.append('t2_neg_bc_pos')
            continue
        if t2_found and not bc_panel:
            category.append('t2_pos_bc_neg')
            continue

        category.append('discordant_unclassified')
        episode.display()
        print("bc_samples---------------------------------------")
        bc_sample.display()

In [None]:
data = pd.DataFrame({'time':sample_time, 'category':category})

In [None]:



# Bin the BC-to-T2 offsets (hours) into steps of `bin_step`, covering
# -bc_window .. +bc_window. The edges start half a step below the lower bound,
# then the first and last edge are clipped back to the bounds, which makes the
# two outermost bins narrower than the rest.
bin_step = 12
min_time, max_time = -bc_window, bc_window
bin_start = min_time
print(max_time)

half_step = bin_step / 2
bin_edges = np.arange(bin_start - half_step, max_time + bin_step, bin_step)

# Edges before clipping, for inspection
print(bin_edges[-1], bin_edges[0])
bin_edges[[0, -1]] = (min_time, max_time)

# right=False: every bin includes its left edge and excludes its right edge
data['time_bin'] = pd.cut(data['time'], bins=bin_edges, right=False)

# Check the resulting edges and the bins that actually occur
print(bin_edges)
print(data['time_bin'].unique())

## Main fig: sampling in relationship to T2

In [None]:

# Panel B: per time bin, the percentage of all blood cultures (BCs) sampled in
# that bin that were discordant with the T2 result, stacked by discordance type.

# Discordant categories in plotting order; the order must line up with the
# colours and legend labels used below.
discordant_categories = ['t2_neg_bc_pos', 't2_pos_bc_neg', 'discordant_unclassified']

# Count of each category within each bin
data['formatted_time_bin'] = data['time_bin'].apply(Displayer.format_interval)
count_data = data.groupby(['formatted_time_bin', 'category']).size().unstack(fill_value=0)

# Total counts in each bin, including 'concordant'
total_counts_per_bin = data.groupby('formatted_time_bin').size()

# Convert counts to percentages of all BCs in the bin
percentage_data = count_data.divide(total_counts_per_bin, axis=0) * 100

# Keep only the discordant categories. reindex() is used instead of plain
# column selection because a category that never occurs has no column after
# unstack(); selecting it directly would raise KeyError, whereas reindex adds
# it as an all-zero column.
percentage_data = percentage_data.reindex(columns=discordant_categories, fill_value=0)
summed_percentage = percentage_data.sum(axis=1)
discordant_totals = count_data.reindex(columns=discordant_categories, fill_value=0).sum(axis=1)


# Plotting the data
ax = percentage_data.plot(kind='bar', stacked=True, figsize=(10, 6), color=['#fc8d62', '#66c2a5', '#8da0cb'], edgecolor='black')

# Setting labels and title with styling
ax.set_xlabel("Time interval relative to T2 sampling (hours)", fontsize=12, fontweight='bold',fontname='Arial')
ax.set_ylabel("Proportion of all BCs sampled during interval", fontsize=12, fontweight='bold',fontname='Arial')
ax.set_title("Discordant samples in relationship to time from T2 sampling", fontsize=14, fontweight='bold', fontname='Arial')

# Customizing the tick labels
plt.xticks(rotation=0, fontsize=9, ha='center',fontweight='bold',fontname='Arial')
plt.yticks(fontsize=10, fontname='Arial')

# Styling the legend
legend = plt.legend(title='Discordant outcome', 
                    fontsize=10, title_fontsize='13', 
                    labels=['T2 Negative, BC Positive', 'T2 Positive, BC Negative', 'Other classified discordant result'] )

# Grid below the bars; y-axis values are already percentages (0-100)
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='gray', linestyle='dashed', linewidth=0.5)
ax.yaxis.set_major_formatter(PercentFormatter(100))

# Annotate each bar stack with the number of discordant BCs in that bin
for idx, (label, discordant_total) in enumerate(zip(percentage_data.index, discordant_totals)):
    # Top of the stack = summed percentage of the discordant categories
    total_percentage = percentage_data.loc[label].sum()
    ax.annotate(f'{discordant_total}', xy=(idx, total_percentage), xytext=(0, 5), textcoords="offset points",
                ha='center', va='bottom', fontsize=9, color='black')

# Show the plot; the figure is kept in panel_b so it can be saved later
plt.grid(False)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()  # Adjust subplots to give some padding
panel_b = plt.gcf()
plt.show()



In [None]:
## Fig with counts instead of percentage

# data['simplified_category'] = data['category'].apply(lambda x: 'Discordant' if x in ['t2_neg_bc_pos', 't2_pos_bc_neg', 'discordant_unclassified'] else x)
# # Now group by this new category along with the time_bin
# data['formatted_time_bin'] = data['time_bin'].apply(Displayer.format_interval)
# grouped_data = data.groupby(['formatted_time_bin', 'simplified_category']).size().unstack(fill_value=0)


# # Plotting the data
# fig, ax = plt.subplots(figsize=(10, 6))
# grouped_data.plot(kind='bar', stacked=True, color=['black','white'], ax=ax, edgecolor='black')

# # Setting labels and title with styling
# ax.set_xlabel("Time period relative to T2 sampling (interval of hours)", fontsize=12, fontweight='bold',fontname='Arial')
# ax.set_ylabel("BCs sampled", fontsize=12, fontweight='bold',fontname='Arial')
# ax.set_title("Blood cultures sampled in relationship to T2 sampling", fontsize=14, fontweight='bold', color='black', fontname='Arial')

# # Customizing the tick labels
# plt.xticks(rotation=0, fontsize=8, ha='center')
# plt.yticks(fontsize=10)
# plt.legend(title='BC outcome', fontsize=10, title_fontsize='13', loc='upper right', labels=['Discordant','Concordant'], reverse = True)

# # Customizing the grid
# ax.set_axisbelow(True)  # Ensure grid is below bar layers
# ax.yaxis.grid(True, color='gray', linestyle='dashed', linewidth=0.5)

# # Annotating the total counts above each bar
# for idx, value in enumerate(total_counts_per_bin):
#     ax.annotate(f'{value}', xy=(idx, value), xytext=(0,3), textcoords="offset points",
#                 ha='center', va='bottom', fontsize=9, color='black')

# # Show the plot
# plt.tight_layout()  # Adjust subplots to give some padding

# plt.show()


In [None]:
# Panel A: share of all blood cultures (BCs) that fall in each time bin, split
# into concordant and discordant. Requires `data` with the 'category' and
# 'time_bin' columns.

# Collapse the three discordant categories into a single 'Discordant' label
discordant_labels = ['t2_neg_bc_pos', 't2_pos_bc_neg', 'discordant_unclassified']
is_discordant = data['category'].isin(discordant_labels)
data['simplified_category'] = data['category'].where(~is_discordant, 'Discordant')

# Human-readable bin labels
data['formatted_time_bin'] = data['time_bin'].apply(Displayer.format_interval)

# Counts per bin and simplified category, then as a fraction of ALL samples
grouped_data = (
    data.groupby(['formatted_time_bin', 'simplified_category'])
    .size()
    .unstack(fill_value=0)
)
normalized_grouped_data = grouped_data.divide(len(data))

# Stacked bars of the normalised counts
fig, ax = plt.subplots(figsize=(10, 6))
normalized_grouped_data.plot(kind='bar', stacked=True, color=['black','white'], ax=ax, edgecolor='black')

# Axis labels and title
ax.set_xlabel('Time interval relative to T2 sampling (hours)', fontsize=12, fontweight='bold', fontname='Arial')
ax.set_ylabel("Proportion of total samples", fontsize=12, fontweight='bold', fontname='Arial')
ax.set_title("Blood cultures sampled in relation to time of T2 sampling", fontsize=14, fontweight='bold', color='black', fontname='Arial')

# Tick styling; values are fractions (0-1), shown as percentages
plt.xticks(rotation=0, fontsize=9, ha='center', fontweight='bold', fontname='Arial')
plt.yticks(fontsize=10, fontname='Arial')
ax.yaxis.set_major_formatter(PercentFormatter(1))

plt.legend(title='BC outcome', fontsize=10, title_fontsize='13', loc='upper right', labels=['Discordant', 'Concordant'], reverse=True)
ax.set_axisbelow(True)
ax.yaxis.grid(True, color='gray', linestyle='dashed', linewidth=0.5)

# Write the absolute number of BCs above each bar
total_counts_per_bin = grouped_data.sum(axis=1)
bar_tops = normalized_grouped_data.sum(axis=1)
for idx, (count, bar_top) in enumerate(zip(total_counts_per_bin, bar_tops)):
    ax.annotate(f'{int(count)}', xy=(idx, bar_top), xytext=(0, 3),
                textcoords="offset points", ha='center', va='bottom', fontsize=9, color='black')

plt.grid(False)
plt.grid(axis='y', linestyle='--', linewidth=0.5)
plt.tight_layout()
panel_a = plt.gcf()
plt.show()


In [None]:
#Plotter.saveplot_tofile(panel_a, name = "panel_a", format = "svg")
Plotter.saveplot_tofile(panel_b, name = "panel_b", format = "svg")

In [None]:
# Histogram of the absolute BC-to-T2 time gap (hours), one step outline per
# concordance status. Each row of `data` is one blood culture (BC) sample.
# Note that this rebinds the notebook-level name `data`.
data = pd.DataFrame({
    'Concordance': concordance,
    'TimeDiff': time_diff
})
# stat='count' plots raw counts. common_norm only matters for normalised
# statistics, so it has no effect here; it is kept so that switching to e.g.
# stat='percent' would normalise each group separately.
sns.histplot(data=data, x='TimeDiff', hue='Concordance', element='step', stat='count', common_norm=False)
plt.title('Distribution of time difference between BC and T2 samples depending on concordance')
plt.xlabel('Time Difference (hours)')
# The bars are counts of BC samples, so the axis is labelled accordingly
# (it previously read 'Percentage of episodes').
plt.ylabel('Number of BC samples')
plt.legend(title='Classification', labels=['Concordant', 'Discordant'])
plt.show()

In [None]:
# Boxplots of the absolute BC-to-T2 time gap, concordant vs discordant BCs.
gap_hours = np.array(time_diff)
concordant_mask = np.array(concordance)
plt.boxplot(
    [gap_hours[concordant_mask], gap_hours[~concordant_mask]],
    labels=['Concordant', 'Discordant'],
)
plt.title('Time Differences by Concordance Status')
plt.xlabel('Concordance')
plt.ylabel('Time Difference (hours)')
plt.show()


In [None]:
# Two-sided Mann-Whitney U test: do concordant and discordant BCs differ in
# their time gap to the T2 sample?
concordant_rows = data['Concordance'].eq(True)
discordant_rows = data['Concordance'].eq(False)
group1 = data.loc[concordant_rows, 'TimeDiff']
group2 = data.loc[discordant_rows, 'TimeDiff']

u_stat, p_value = stats.mannwhitneyu(group1, group2, alternative='two-sided')

print(f"Mann-Whitney U test result: U-statistic = {u_stat}, P-value = {p_value}")


## Main fig: Difference in total turnaround time for positive and negative T2

In [None]:
# The T2 sample of every episode, in episode order.
t2_samples = [episode.t2_sample for episode in episodes]


In [None]:

# Per T2 sample: whether it reported any isolate, and its total turnaround
# (final report time minus sampling time), both raw and in hours.
contains_isolates = np.array([bool(t2.isolates) for t2 in t2_samples])
times = np.array([t2.final_report_date - t2.sample_date for t2 in t2_samples])
time_hours = np.array([elapsed.total_seconds() / 3600 for elapsed in times])


In [None]:
# Logistic regression of "T2 sample contains isolates" on turnaround time (h).
# statsmodels is imported here because the notebook's own import of it sits in
# a later cell; without this a top-to-bottom run fails with NameError on `sm`.
import statsmodels.api as sm

# Add intercept
time_hours_with_intercept = sm.add_constant(time_hours)

# Create logistic model and fit it
model = sm.Logit(contains_isolates, time_hours_with_intercept)
result = model.fit()

# Generate a sequence of time values over a slightly wider range for plotting
time_values = np.linspace(time_hours.min() - 1, time_hours.max() + 1, 300)
time_values_with_intercept = sm.add_constant(time_values)

# Predict probabilities for the generated time values
predicted_probabilities = result.predict(time_values_with_intercept)
# Print the summary of the model
print(result.summary())

# Fitted probabilities for the observed samples
predictions = result.predict(time_hours_with_intercept)
print("Predicted probabilities:", predictions)

# Plot the observations (0/1) together with the fitted curve
plt.figure(figsize=(8, 5))
plt.scatter(time_hours, contains_isolates, color='blue', label='Data Points', zorder=2)
plt.plot(time_values, predicted_probabilities, color='red', label='Logistic Regression', linewidth=2)
plt.title('Logistic Regression Fit')
plt.xlabel('Time in Hours')
plt.ylabel('Probability of Containing Isolates')
plt.ylim(-0.1, 1.1)  # Extend y-axis to better see the bounds
plt.legend()
plt.grid(True)
plt.show()

In [None]:
# Scratch cell: neither test_transform nor testxx is read by any other cell
# visible in this notebook.
time_hours = np.array(time_hours)
test_transform = np.log(time_hours)  # natural log of the turnaround times

testxx=np.random.normal(0, 1, 1000)  # 1000 draws from a standard normal

In [None]:
plt.hist(time_hours, bins=20)   

In [None]:
plt.hist(time_hours, bins=20)   

In [None]:
import matplotlib.pyplot as plt
import scipy.stats as stats 
import statsmodels.api as sm

In [None]:
# Empirical CDFs of the T2 turnaround time (hours), positive vs negative.

# Split the turnaround times by T2 result. They are derived here from
# time_hours / contains_isolates because the notebook only assigns
# positive_samples_time / negative_samples_time in a later cell, so a
# top-to-bottom run would otherwise raise NameError.
_is_positive = np.asarray(contains_isolates, dtype=bool)
positive_samples_time = np.asarray(time_hours)[_is_positive]
negative_samples_time = np.asarray(time_hours)[~_is_positive]

# Sorted values form the x-axis of each empirical CDF
cdf_positive = np.sort(positive_samples_time)
cdf_negative = np.sort(negative_samples_time)

# Cumulative probability of the k-th smallest value is k / n
prob_positive = np.arange(1, len(cdf_positive) + 1) / len(cdf_positive)
prob_negative = np.arange(1, len(cdf_negative) + 1) / len(cdf_negative)

# Plot the CDFs side by side
plt.figure(figsize=(14, 6))
plt.subplot(1, 2, 1)
plt.plot(cdf_positive, prob_positive, marker='.', linestyle='none', color='green')
plt.title('CDF for Positive Samples')
plt.xlabel('Time in Hours')
plt.ylabel('CDF')

plt.subplot(1, 2, 2)
plt.plot(cdf_negative, prob_negative, marker='.', linestyle='none', color='red')
plt.title('CDF for Negative Samples')
plt.xlabel('Time in Hours')
plt.ylabel('CDF')

plt.tight_layout()
plt.show()

In [None]:
plt.scatter(time_hours, contains_isolates)

In [None]:

plt.hist(time_hours, bins = 20)

## Plots for processing time

In [None]:
# Histograms of total T2 turnaround time, one facet per T2 result.
# Requires time_hours and contains_isolates from the cells above.
data = pd.DataFrame({
    'TimeHours': time_hours,
    'ContainsIsolates': contains_isolates
})

# One column per value of ContainsIsolates; each facet is normalised on its own
g = sns.FacetGrid(data, col="ContainsIsolates", height=6, aspect=1, sharey=True, sharex=True)
g.map_dataframe(sns.histplot, x='TimeHours', bins=60, stat='percent', common_norm=False, color='m', alpha=0.7)

# Dictionary for custom titles
title_dict = {True: 'T2 positive', False: 'T2 negative'}

# Title each facet from the grid's own value -> axes mapping. Pairing the axes
# with data['ContainsIsolates'].unique() is unsafe: unique() returns values in
# order of first appearance, which need not be the order the grid lays out its
# columns, so the two titles could end up swapped.
for col_val, ax in g.axes_dict.items():
    ax.set_title(title_dict[col_val], fontname = 'Arial')
g.tight_layout()
# Setting axis labels and adjusting layout
g.set_axis_labels("Time in Hours", "Percentage",fontname = 'Arial')
g.figure.subplots_adjust(top=0.85)  # Adjust the Figure to make room for the main title
g.figure.suptitle('Total Turnaround Time for T2 Samples', fontsize=16, fontname = 'Arial')  # Main title

# Show the plots
plt.show()

## Main fig: Distributions of times T2 pos/neg

In [None]:
# Clustered bar chart of total T2 turnaround time for positive and negative
# samples, each expressed as a percentage of its own group.
# Requires time_hours and contains_isolates from the cells above.
data = pd.DataFrame({
    'TimeHours': time_hours,
    'ContainsIsolates': contains_isolates
})

# Bin the data into 2-hour intervals covering [0, 98) hours; values outside
# that range get no bin and therefore do not appear in any bar.
data['TimeBin'] = pd.cut(data['TimeHours'], bins=np.arange(0, 100, 2), right=False)

# Counts per bin and T2 result. observed=False is stated explicitly so empty
# bins are kept (giving a continuous x-axis) regardless of the pandas
# version's default for categorical groupers.
summary_data = (
    data.groupby(['TimeBin', 'ContainsIsolates'], observed=False)
    .size()
    .reset_index(name='Counts')
)
summary_data = summary_data.pivot(index='TimeBin', columns='ContainsIsolates', values='Counts')
# Guarantee both result columns exist even if one group is absent from the data
summary_data = summary_data.reindex(columns=[False, True], fill_value=0).fillna(0)

# Group sizes are taken from the data rather than hard-coded, so the
# percentages stay correct when the windows or exclusions change the cohort.
total_positive_samples = int(data['ContainsIsolates'].sum())
total_negative_samples = len(data) - total_positive_samples

# Normalize the counts to percentages of their total sample sizes
summary_data[True] = (summary_data[True] / total_positive_samples) * 100
summary_data[False] = (summary_data[False] / total_negative_samples) * 100

# Plotting the normalized clustered bar chart
fig, ax = plt.subplots(figsize=(12, 6))
bar_width = 0.35  # width of bars

# Create index for the grouped bar chart
index = np.arange(len(summary_data))
bar1 = ax.bar(index - bar_width/2, summary_data[True], bar_width, label='T2 positive', color='#fc8d62', edgecolor='black')
bar2 = ax.bar(index + bar_width/2, summary_data[False], bar_width, label='T2 negative', color='w', edgecolor='black')

# Customize the plot
ax.set_xlabel('Time in hours', fontsize=14, fontname='Arial',fontweight='bold')
ax.set_ylabel('Percentage of total in each category', fontsize=14, fontname='Arial',fontweight='bold')
ax.set_title('Distribution of turn-around time for T2', fontsize=16, fontname='Arial', fontweight='bold')
ax.set_xticks(index)
ax.set_xticklabels([str(x.left) + "-" + str(x.right) for x in summary_data.index], rotation=45, fontname='Arial')
ax.legend()
ax.yaxis.set_major_formatter(PercentFormatter(100))  # Format y-axis as percentages

plt.grid(False)
plt.grid(axis='y', linestyle='--', alpha=0.7)

plt.tight_layout()
tat_pos_neg = plt.gcf()
plt.show()

In [None]:
Plotter.saveplot_tofile(tat_pos_neg, name = "tat_pos_neg", format = "svg")

In [None]:
positive_samples = data[data['ContainsIsolates'] == True]
negative_samples = data[data['ContainsIsolates'] == False]

In [None]:
# Summary statistics and group comparison of T2 turnaround time (hours) for
# positive vs negative samples.
positive_samples_time = positive_samples['TimeHours']
negative_samples_time = negative_samples['TimeHours']

# Quartiles and interquartile range per group
q1_positive, q3_positive = np.percentile(positive_samples_time, [25, 75])
q1_negative, q3_negative = np.percentile(negative_samples_time, [25, 75])
iqr_positive = q3_positive - q1_positive
iqr_negative = q3_negative - q1_negative

print("median positive:",np.median(positive_samples_time),"negative:", np.median(negative_samples_time))
print("IQR positive:",iqr_positive,"negative:", iqr_negative,"q1 positive:",q1_positive,"q3 positive:",q3_positive,"q1 negative:",q1_negative,"q3 negative:",q3_negative)
print("mean positive:",np.mean(positive_samples_time),"mean negative:", np.mean(negative_samples_time))

# Two-sided Mann-Whitney U test, plus Welch's t-test (unequal variances)
u_statistic, p_value_mann = stats.mannwhitneyu(
    positive_samples_time, negative_samples_time, alternative='two-sided'
)
t_statistic, p_value_t = stats.ttest_ind(
    positive_samples_time, negative_samples_time, equal_var=False
)
print(f"Mann-Whitney U test result: U-statistic = {u_statistic}, P-value = {p_value_mann}")
print(f"t-test result: t-statistic = {t_statistic}, P-value = {p_value_t}")

In [None]:
# Boxplots of T2 turnaround time (hours): positive vs negative samples.
plt.figure(figsize=(8, 6))
tat_groups = [positive_samples_time, negative_samples_time]
plt.boxplot(tat_groups, labels=['Positive', 'Negative'])
plt.ylabel('Time in Hours')
plt.title('Boxplot of Processing Times')
plt.show()


In [None]:
# Turnaround time of every T2 sample in list order, coloured by whether the
# sample contained isolates. The x position is just the sample's index.
sample_ids = range(len(time_hours))

plt.figure(figsize=(10, 6))
plt.scatter(
    sample_ids,
    time_hours,
    c=contains_isolates,
    cmap='viridis',
)
plt.colorbar(ticks=[0, 1], label='Isolates Present (0 = No, 1 = Yes)')
plt.xlabel('Sample ID')
plt.ylabel('Time in Hours')
plt.title('Scatter Plot of Processing Times by Sample')
plt.show()


In [None]:
# Violin plots of T2 turnaround time (hours): positive vs negative samples.
plt.figure(figsize=(8, 6))
tat_groups = [positive_samples_time, negative_samples_time]
plt.violinplot(tat_groups)
# Violins are drawn at x = 1, 2 in the order of the list above
plt.xticks([1, 2], ['Positive', 'Negative'])
plt.ylabel('Time in Hours')
plt.title('Violin Plot of Processing Times')
plt.show()


In [None]:
plotter.check_distribution("NORMAL")

## Main fig : TAT BC - T2

In [None]:
plot3 = plotter.tat_boxplot()

In [None]:
for i, fig in enumerate(plot3):
    Plotter.saveplot_tofile(fig, f'Fig2_subplot{i+1}',"svg")

In [None]:
plotter.tat_distributions()

In [None]:
venn = plotter.plot_venn("BC_IN_PANEL")

In [None]:
Plotter.saveplot_tofile(venn, name = "venn_all_bc", format = "svg")

In [None]:
venn_legend = plotter.plot_bacteria_occurrences()

In [None]:
for i, fig in enumerate(venn_legend):
    Plotter.saveplot_tofile(fig, f'Venn_legend{i+1}',"svg")

In [None]:
plotter.show_time_between_BC_T2()

## Main Fig: weekdays etc
 


In [None]:
tat_duty = plotter.plot_tat_by_time(sample_type="t2", time_type="arrival", result_category="all")

In [None]:
tat_duty.savefig("tat_duty1.svg", format = "svg", bbox_inches = 'tight')

In [None]:
Plotter.saveplot_tofile(tat_duty, name="tat_duty", format = "svg")

In [None]:
type(tat_duty)

In [None]:
plot_2 = plotter.plot_longitudinal(positive=True) 

In [None]:
print(type(plot_2))

In [None]:
Plotter.saveplot_tofile(plot_2, "Fig2")

In [None]:
plotter.plot_longitudinal(positive=False)

In [None]:
displayer.tat_differences(4, "ARRIVAL")

In [None]:
displayer.tat_df()