## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Locations of the two input CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Outer-join the two tables on "Mouse ID" so rows from either side are kept
combined_data_df = mouse_metadata.merge(study_results, how="outer", on="Mouse ID")

# Preview the first rows of the merged table
combined_data_df.head()

In [None]:
# Print a concise summary of the merged DataFrame (columns, non-null counts, dtypes)
combined_data_df.info()

In [None]:
# Count the distinct mouse IDs present in the merged (pre-cleaning) data.
all_mouse_ids = combined_data_df["Mouse ID"]
number_of_mice = all_mouse_ids.nunique()
number_of_mice

In [None]:
# Find the mouse IDs that have more than one row for the same (Mouse ID, Timepoint) pair.
# duplicated() flags every repeat after the first occurrence of a pair.
is_repeated_reading = combined_data_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mice = combined_data_df.loc[is_repeated_reading, "Mouse ID"].unique()
duplicate_mice

In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# Filter with the IDs detected above instead of a hard-coded ID, so this cell
# stays correct if the input data (and therefore the duplicated mice) changes.
duplicate_mice_df = combined_data_df.loc[combined_data_df["Mouse ID"].isin(duplicate_mice)]
duplicate_mice_df

In [None]:
# Create a clean DataFrame by dropping every row that belongs to a duplicated mouse ID.
# `~` negates the boolean mask (idiomatic replacement for `== False`).
cleaned_combined_data_df = combined_data_df[~combined_data_df["Mouse ID"].isin(duplicate_mice)]
cleaned_combined_data_df

In [None]:
# Count the distinct mouse IDs that remain after the duplicated mice were removed.
remaining_mouse_ids = cleaned_combined_data_df["Mouse ID"]
mice_count = remaining_mouse_ids.nunique()
mice_count

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straightforward: build one Series per statistic and combine them at the end.

# Select the tumor-volume column BEFORE aggregating. Aggregating the whole
# frame first would also try to reduce the non-numeric columns (such as
# "Mouse ID"), which raises a TypeError on pandas >= 2.0.
tumor_volume_by_regimen = cleaned_combined_data_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

# Mean tumor volume for each regimen
tumor_volume_mean = tumor_volume_by_regimen.mean()

# Median tumor volume for each regimen
tumor_volume_med = tumor_volume_by_regimen.median()

# Variance of the tumor volume for each regimen
tumor_volume_var = tumor_volume_by_regimen.var()

# Standard deviation of the tumor volume for each regimen
tumor_volume_std = tumor_volume_by_regimen.std()

# Standard error of the mean (SEM) of the tumor volume for each regimen
tumor_volume_sem = tumor_volume_by_regimen.sem()

# Combine the per-statistic Series into one summary table (indexed by regimen)
statistic_summary1 = pd.DataFrame({"Mean of Tumor Volume": tumor_volume_mean,
                                   "Median of Tumor Volume": tumor_volume_med,
                                   "Variance of Tumor Volume": tumor_volume_var,
                                   "STD of Tumor Volume": tumor_volume_std,
                                   "SEM of Tumor Volume": tumor_volume_sem})

# Display summary table, rounded for readability only
statistic_summary1.round(2)

In [None]:
# Same summary table (mean, median, variance, standard deviation, SEM of the
# tumor volume per regimen), this time produced by one groupby/agg call.

# Statistics to compute, in the column order they should appear
summary_statistics = ["mean", "median", "var", "std", "sem"]

# Group by regimen, keep only the tumor-volume column, and aggregate in one pass
tumor_volume_groups = cleaned_combined_data_df.groupby(["Drug Regimen"])[["Tumor Volume (mm3)"]]
statistic_summary2 = tumor_volume_groups.agg(summary_statistics)

# Show the table rounded to two decimals
statistic_summary2.round(2)

## Bar and Pie Charts

In [None]:
# Bar chart, drawn through the pandas plotting API, of the number of rows
# recorded for each drug regimen over the course of the study.

# Non-null "Mouse ID" entries per regimen (one per recorded row)
mice_per_treat = cleaned_combined_data_df.groupby("Drug Regimen")["Mouse ID"].count()

# Let pandas draw the bars, then decorate the current axes with pyplot
mice_per_treat.plot(kind="bar", color="black")
plt.xlabel("Drug Regimen")
plt.ylabel("Mice Per Time Point")
plt.title("Total Number of Mice For Each Treatment")
plt.tight_layout()
plt.show()

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

# Take the x labels and the bar heights from the SAME grouped result so that
# each bar is guaranteed to sit under its own regimen name. (Taking labels
# from value_counts(), which is ordered by frequency, and heights from a
# groupby, which is ordered alphabetically, mislabels the bars.)
regimen_counts = cleaned_combined_data_df.groupby("Drug Regimen")["Mouse ID"].count()

# Prepare x-axis
drug_regimen = regimen_counts.index

# Prepare y-axis
mice_per_treat2 = regimen_counts.tolist()

# Plot using pyplot
plt.bar(drug_regimen, mice_per_treat2, width = 0.5, color = "purple")
plt.xticks(drug_regimen, rotation = 90)

plt.title("Total Number of Mice For Each Treatment")
plt.ylabel("Mice Per Time Point")
plt.xlabel("Drug Regimen")
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# Count the rows for each value of "Sex" (most frequent first).
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
grouped_gender = cleaned_combined_data_df["Sex"].value_counts()

# Plot using pandas; colors and explode are applied by position, i.e. in the
# same order as the rows of grouped_gender.
colors = ["skyblue", "pink"]
explode = (0.1, 0)
pandas_plot = grouped_gender.plot.pie(startangle = 45,
                                      colors = colors,
                                      explode = explode,
                                      shadow = True,
                                      autopct = "%1.2f%%")
plt.title("Male vs Female Mice Count")
plt.ylabel("Gender")
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Preparation for plotting
# Wedge sizes and wedge labels both come from grouped_gender, so a label can
# never be attached to the wrong count (hard-coding ["Male", "Female"] silently
# assumes which sex is the more frequent one).
grouped_gender2 = grouped_gender.tolist()
labels = grouped_gender.index.tolist()
colors = ["purple", "orange"]
# Offset the first wedge; defined here so this cell does not rely on the
# pandas pie cell having been run first.
explode = (0.1, 0)
plt.pie(grouped_gender2,
        labels = labels,
        startangle = 45,
        colors = colors,
        explode = explode,
        shadow = True,
        autopct = "%1.2f%%")
plt.title("Male vs Female Mice Count")
plt.ylabel("Gender")
plt.tight_layout()
plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin


def _final_timepoint_frames(regimen):
    """Build the per-regimen frames used to find each mouse's final reading.

    Everything is derived from ``cleaned_combined_data_df`` only, so the
    filter mask always matches the frame it is applied to and the duplicated
    mice removed earlier cannot re-enter through the merge.

    Args:
        regimen: Value of the "Drug Regimen" column to select.

    Returns:
        A 4-tuple ``(regimen_df, last_timepoint, last_timepoint_df, merged)``:
        the rows of the cleaned data for ``regimen``; a Series of the greatest
        "Timepoint" per "Mouse ID"; that Series wrapped in a DataFrame; and
        the cleaned data inner-merged with it on "Mouse ID" and "Timepoint",
        i.e. the rows recorded at each mouse's last timepoint.
    """
    regimen_df = cleaned_combined_data_df.loc[cleaned_combined_data_df["Drug Regimen"] == regimen]
    # Select "Timepoint" before max() so only that column is reduced
    last_timepoint = regimen_df.groupby("Mouse ID")["Timepoint"].max()
    last_timepoint_df = pd.DataFrame(last_timepoint)
    # "Mouse ID" is the index level of last_timepoint_df and a column of the
    # cleaned data; merge accepts both when the name is listed in `on`.
    merged = pd.merge(cleaned_combined_data_df, last_timepoint_df, on = ["Mouse ID", "Timepoint"])
    return regimen_df, last_timepoint, last_timepoint_df, merged


# Start by getting the last (greatest) timepoint for each mouse, then merge it
# back onto the data to get the tumor volume at that last timepoint
capomulin_df, capomulin_last, capomulin_vol, capomulin_merge = _final_timepoint_frames("Capomulin")

ramicane_df, ramicane_last, ramicane_vol, ramicane_merge = _final_timepoint_frames("Ramicane")

iInfubinol_df, iInfubinol_last, iInfubinol_vol, iInfubinol_merge = _final_timepoint_frames("Infubinol")

ceftamin_df, ceftamin_last, ceftamin_vol, ceftamin_merge = _final_timepoint_frames("Ceftamin")

capomulin_df.head()

In [None]:
# Regimens of interest, in the order they are looped over and shown on the box plot
treatment_list = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Empty list that a later cell fills with one list of final tumor volumes per regimen (for plotting)
tumor_vol = []   

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers

# Locate the rows which contain mice on each drug of interest
selected_regimen = cleaned_combined_data_df[cleaned_combined_data_df["Drug Regimen"].isin(treatment_list)]

# Pick, for every mouse, the row with its greatest Timepoint. Using idxmax on
# "Timepoint" (instead of "whatever row happens to come last") keeps the
# result correct regardless of how the rows are ordered.
final_rows = selected_regimen.loc[selected_regimen.groupby("Mouse ID")["Timepoint"].idxmax()]

# One column of final tumor volumes per regimen, indexed by mouse; a mouse has
# NaN in the columns of the regimens it did not receive.
last_timepoint = final_rows.pivot(index = "Mouse ID", columns = "Drug Regimen", values = "Tumor Volume (mm3)")

separator = "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
print(separator)
for drug in treatment_list:
    final_volumes = last_timepoint[drug].dropna()

    # Keep full precision through the arithmetic and round only when printing,
    # so rounding error does not accumulate in the IQR and the bounds.
    lowerq, upperq = final_volumes.quantile([0.25, 0.75])
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Determine outliers using upper and lower bounds
    outliers = final_volumes[(final_volumes < lower_bound) | (final_volumes > upper_bound)]

    print(f"The lower quartile of {drug} treatments is: {round(lowerq, 2)}")
    print(f"The upper quartile of {drug} treatments is: {round(upperq, 2)}")
    print(f"The interquartile range of {drug} treatments is: {round(iqr, 2)}")
    print(f"Values below {round(lower_bound, 2)} could be {drug} outliers.")
    print(f"Values above {round(upper_bound, 2)} could be {drug} outliers.")
    print(f"Number of potential {drug} outliers: {len(outliers)}")
    print(separator)

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# Rebuild the list from scratch on every run. Appending to a list created in
# another cell would add a second copy of the data each time this cell is
# re-executed, leaving more boxes than labels.
tumor_vol = [last_timepoint[drug].dropna().tolist() for drug in treatment_list]

fig = plt.figure()
plt.title("Tumor Volume at Mouse on Selected Treatment")
plt.xlabel("Drug Regimen")
plt.ylabel("Final Tumor Volume (mm3)")
plt.boxplot(tumor_vol, widths = 0.5)
# Boxes are drawn at positions 1..N by default; label them via xticks rather
# than boxplot's `labels` argument, which newer Matplotlib releases deprecate.
plt.xticks(range(1, len(treatment_list) + 1), treatment_list)
plt.tight_layout()
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

# One row per Capomulin mouse, so every mouse is equally likely to be picked
capomulin_df_drop_dup = capomulin_df.drop_duplicates(subset = "Mouse ID", keep = "last")

# Pick one mouse at random (a different mouse may be drawn on each run)
mouse = capomulin_df_drop_dup.sample()
picked_mouse = mouse.iloc[0].at["Mouse ID"]

# All readings for that mouse, ordered in time so the line is drawn left to right
line_plot_df = capomulin_df.loc[capomulin_df["Mouse ID"] == picked_mouse].sort_values("Timepoint")

x_axis = line_plot_df["Timepoint"]
y_axis = line_plot_df["Tumor Volume (mm3)"]

fig1, ax1 = plt.subplots()
plt.title(f"Capomulin treatment of mouse {picked_mouse}")
# Observed tumor volume over time
plt.plot(x_axis, y_axis, linewidth = 2, color = "purple")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm3)")

# Degree-1 least-squares fit, drawn as a dashed black trend line. Only the
# fitted line is plotted here: re-plotting the observed data in the same call
# would paint a default-colored line over the purple one.
coef = np.polyfit(x_axis, y_axis, 1)
poly1d_fn = np.poly1d(coef)
plt.plot(x_axis, poly1d_fn(x_axis), "--k")
plt.tight_layout()
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Per-mouse mean of every numeric column. numeric_only=True skips the string
# columns (e.g. "Drug Regimen"), which would otherwise make mean() raise a
# TypeError on pandas >= 2.0.
capomulin_df_mean = capomulin_df.groupby(["Mouse ID"]).mean(numeric_only = True)
plt.scatter(capomulin_df_mean["Weight (g)"], capomulin_df_mean["Tumor Volume (mm3)"], color = "purple")
plt.title("Mouse Weight Versus Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.tight_layout()
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# Color shared by the regression line and its equation label
color = "orange"

mouse_weight = capomulin_df_mean["Weight (g)"]
average_tumor_volume = capomulin_df_mean["Tumor Volume (mm3)"]

# Pearson correlation: element 0 is the coefficient r, element 1 the p-value
corr = st.pearsonr(mouse_weight, average_tumor_volume)
print(f"The correlation between mouse weight and average tumor volume is {round(corr[0], 2)}")

# Least-squares line: average tumor volume = slope * weight + intercept
(slope, intercept, rvalue, pvalue, stderr) = linregress(mouse_weight, average_tumor_volume)
regress_values = mouse_weight * slope + intercept
line_eq = f"y = {round(slope, 2)} x + {round(intercept, 2)}"

# Scatter of the per-mouse averages with the fitted line drawn on top
plt.scatter(mouse_weight, average_tumor_volume, color = "purple")
plt.plot(mouse_weight, regress_values, color = color)
# Equation label placed at fixed data coordinates (weight = 21, volume = 36)
plt.annotate(line_eq, (21, 36), fontsize = 12, color = color)
plt.title("Mouse Weight Versus Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.tight_layout()
plt.show()