The study below compares the performance of the Capomulin treatment with that of the other treatment regimens. Throughout the analysis, two treatments stood out: the Capomulin and Ramicane regimens performed significantly better than the others. The summary statistics table shows that the average and median tumour volumes for those two treatments were considerably smaller than for the others. Both treatments also had the lowest variance and standard deviation, which indicates that their values were the least spread out around their means and therefore the most precise. 

Similarly, the box plot presenting the final tumour volume for four regimens (Capomulin, Ramicane, Infubinol, Ceftamin) shows that Capomulin and Ramicane finished the study with the smallest final tumour volumes. The only outlier was in the Infubinol treatment group.

The bar plot displaying the total number of timepoints for each treatment regimen shows that Capomulin and Ramicane had the most measurements recorded. Both treatments gathered around 250 timepoints each. The Propriva treatment had only around 150 measurements, while the rest of the regimens averaged around 175.

The linear regression model for Capomulin shows a positive correlation between the average tumour volume and the weight of the mice.

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two input CSV files
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Load each file into its own DataFrame
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Join the two tables on their shared "Mouse ID" column
# (default inner join, mouse metadata on the left)
merge_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined table
merge_df.head()

In [None]:
# Checking the number of mice.
# merge_df holds one row per measurement, so len(merge_df) would count
# measurements; count the distinct "Mouse ID" values instead.
merge_df["Mouse ID"].nunique()

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
# keep=False flags every row that shares its (Mouse ID, Timepoint) pair with another row.
duplicate_rows = merge_df.duplicated(subset=["Mouse ID", "Timepoint"], keep=False)
duplicate_mouse_ids = merge_df.loc[duplicate_rows, "Mouse ID"].unique()
print(f"Mouse IDs with duplicated timepoints: {list(duplicate_mouse_ids)}")

# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# Every row belonging to a flagged mouse is removed, not only the clashing rows,
# because the remaining measurements of such a mouse cannot be trusted either.
cleaned = merge_df[~merge_df["Mouse ID"].isin(duplicate_mouse_ids)]
cleaned.head()

In [None]:
# Checking the number of mice in the clean DataFrame.
# Count distinct "Mouse ID" values; len(cleaned) would count measurement rows.
cleaned["Mouse ID"].nunique()

In [None]:
# Preview cell: evaluates to the grouped-by-regimen object only;
# nothing is assigned or aggregated here.
cleaned.groupby(by="Drug Regimen")

In [None]:
# Per-regimen descriptive statistics of the tumor volume:
# mean, median, variance, standard deviation and standard error of the mean.
# The grouped column selection is built once and reused for every statistic.
tumor_by_regimen = cleaned.groupby("Drug Regimen")["Tumor Volume (mm3)"]
mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
std_dev = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

In [None]:
# Assemble the per-regimen statistics into a single summary table,
# one column per statistic, indexed by drug regimen.
summary_columns = {
    "Mean Tumor Volume": mean,
    "Median Tumor Volume": median,
    "Tumor Volume Variance": variance,
    "Tumor Volume Standard Dev": std_dev,
    "Tumor Volume Std. Err.": sem,
}
summary = pd.DataFrame(summary_columns)
summary

In [None]:
# Finding values for the x and y axes of the bar charts:
# timepoint_count is the y axis (number of recorded timepoints per regimen),
# drug is the x axis (the regimen names).
timepoint_count = cleaned.groupby("Drug Regimen")["Timepoint"].count()
# Take the names from the grouped index so drug[i] always labels timepoint_count[i].
# Using cleaned["Drug Regimen"].unique() returns names in order of first appearance,
# which does not match the sorted order produced by groupby.
drug = timepoint_count.index.tolist()
df = pd.DataFrame(timepoint_count)

In [None]:
# Generating a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
mice_chart = df.plot(kind="bar", title="Total number of measurments for each treatment regimen", color="r", alpha=0.5)
mice_chart.set_xlabel("Drug Regimen")
mice_chart.set_ylabel("Total number of timepoints for all mice")
# Adjust the layout before showing: once plt.show() has rendered the figure,
# a later tight_layout() call no longer affects what was displayed.
plt.tight_layout()
plt.show()

In [None]:
# Generating a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
# The x labels come from timepoint_count's own index, so every bar is
# guaranteed to carry the name of the regimen whose count it shows.
regimen_labels = timepoint_count.index.tolist()
plt.bar(regimen_labels, timepoint_count, color="r", alpha=0.5, align="center")
# Labeling the chart
plt.title("Total number of measurments for each treatment regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Total number of timepoints for all mice")
# One tick per regimen, with the names rotated so they do not overlap
tick_locations = regimen_labels
plt.xticks(tick_locations, regimen_labels, rotation="vertical")

In [None]:
# Pie chart (pandas plotting) of the female/male split.
# NOTE: this counts rows of `cleaned` per sex, so a mouse with several
# recorded rows contributes once per row.
sex_counts = cleaned.groupby("Sex")["Mouse ID"].count()
gender_pie = sex_counts.plot(
    kind="pie",
    y="Sex",
    title="The distribution of female versus male mice",
    autopct="%1.1f%%",
)
gender_pie.set_ylabel("Gender")

In [None]:
# Generating a pie plot showing the distribution of female versus male mice using pyplot
# NOTE: counts rows of `cleaned` per sex (one row per recorded measurement).
sex_counts = cleaned.groupby("Sex")["Mouse ID"].count()
# Read the labels from the grouped index instead of hard-coding them, so each
# wedge is labelled with the category it actually represents.
labels = sex_counts.index.tolist()
plt.pie(sex_counts, labels=labels, autopct="%1.1f%%")
plt.ylabel("Gender")
plt.title("The distribution of female versus male mice")

In [None]:
# Calculating the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
treatment = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# All rows for the four regimens, indexed by regimen name
selected = cleaned.set_index("Drug Regimen").loc[treatment, :]

# Pick each mouse's row at its greatest Timepoint explicitly, rather than
# assuming the rows happen to be stored in chronological order.
# Plain arrays are compared because the regimen index is not unique.
last_timepoint = selected.groupby("Mouse ID")["Timepoint"].transform("max")
is_final = selected["Timepoint"].to_numpy() == last_timepoint.to_numpy()
# drop_duplicates is a guard that keeps exactly one row per mouse even if a
# final timepoint were recorded more than once.
drug_regimen = selected[is_final].drop_duplicates(subset=["Mouse ID"], keep="last")

# Final tumor volumes per regimen (one value per mouse)
Capomulin = drug_regimen.loc["Capomulin", "Tumor Volume (mm3)"]
Ramicane = drug_regimen.loc["Ramicane", "Tumor Volume (mm3)"]
Infubinol = drug_regimen.loc["Infubinol", "Tumor Volume (mm3)"]
Ceftamin = drug_regimen.loc["Ceftamin", "Tumor Volume (mm3)"]

In [None]:
# Calculating the quartiles and IQR and looking for outliers for Capomulin
quartiles = Capomulin.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
# Values more than 1.5 * IQR beyond the quartiles are treated as potential outliers
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
# Select the final tumor volumes that fall outside the bounds
outliers = Capomulin[(Capomulin < lower_bound) | (Capomulin > upper_bound)]
print(f"The quartiles are: {quartiles}")
print(f"The interquartile range is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")
print(f"Potential outliers for Capomulin: {outliers.tolist()}")

In [None]:
# Calculating the quartiles and IQR and looking for outliers for Ramicane
quartiles = Ramicane.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
# Values more than 1.5 * IQR beyond the quartiles are treated as potential outliers
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
# Select the final tumor volumes that fall outside the bounds
outliers = Ramicane[(Ramicane < lower_bound) | (Ramicane > upper_bound)]
print(f"The quartiles are: {quartiles}")
print(f"The interquartile range is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")
print(f"Potential outliers for Ramicane: {outliers.tolist()}")

In [None]:
# Calculating the quartiles and IQR and looking for outliers for Infubinol
quartiles = Infubinol.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
# Values more than 1.5 * IQR beyond the quartiles are treated as potential outliers
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
# Select the final tumor volumes that fall outside the bounds
outliers = Infubinol[(Infubinol < lower_bound) | (Infubinol > upper_bound)]
print(f"The quartiles are: {quartiles}")
print(f"The interquartile range is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")
print(f"Potential outliers for Infubinol: {outliers.tolist()}")

In [None]:
# Calculating the quartiles and IQR and looking for outliers for Ceftamin
quartiles = Ceftamin.quantile([.25, .5, .75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq
# Values more than 1.5 * IQR beyond the quartiles are treated as potential outliers
lower_bound = lowerq - (1.5 * iqr)
upper_bound = upperq + (1.5 * iqr)
# Select the final tumor volumes that fall outside the bounds
outliers = Ceftamin[(Ceftamin < lower_bound) | (Ceftamin > upper_bound)]
print(f"The quartiles are: {quartiles}")
print(f"The interquartile range is: {iqr}")
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")
print(f"Potential outliers for Ceftamin: {outliers.tolist()}")

In [None]:
# Box plot of the final tumor volumes, one box per regimen.
# Flier points (values beyond the whiskers) are drawn in red.
final_volumes = [Capomulin, Ramicane, Infubinol, Ceftamin]
regimen_names = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

fig1, ax1 = plt.subplots()
ax1.boxplot(final_volumes, sym="r")
ax1.set_ylabel("Final Tumor Volume (mm3)")
# Boxes sit at positions 1..4; label each position with its regimen name
plt.xticks(list(range(1, len(regimen_names) + 1)), regimen_names)
plt.title("Final Tumor Volume for each regimen")
plt.show()

In [None]:
# Display the rows of drug_regimen (the final-measurement table) labelled "Capomulin"
drug_regimen.loc["Capomulin", :]

In [None]:
# Selecting the mouse to plot; the plot title below follows this value
mouse_id = "u364"
# Timepoint / tumor volume rows for the selected mouse, in chronological order
# so the line is drawn left to right regardless of how the rows are stored.
mouse_data = (
    cleaned.set_index("Mouse ID")
    .loc[mouse_id][["Timepoint", "Tumor Volume (mm3)"]]
    .sort_values("Timepoint")
)
# Generating a line plot of tumor volume vs. time point for the selected mouse
x_axis = mouse_data["Timepoint"]
y_axis = mouse_data["Tumor Volume (mm3)"]
plt.plot(x_axis, y_axis)
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
# NOTE(review): the title assumes the selected mouse is on Capomulin; this is not checked here.
plt.title(f"Capomulin treatment of mouse {mouse_id}")

In [None]:
# Scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen.

# Every Capomulin row of the cleaned data
weight_data = cleaned.set_index("Drug Regimen").loc["Capomulin"]

# Weight of each Capomulin mouse, taken from the drug_regimen table and keyed by Mouse ID
capomulin_rows = drug_regimen.loc["Capomulin"]
mice_size = capomulin_rows.set_index("Mouse ID")["Weight (g)"]

# Mean tumor volume of each Capomulin mouse over all of its rows
avg_tumor = weight_data.groupby("Mouse ID")["Tumor Volume (mm3)"].mean()

# Pair each mouse's weight with its average tumor volume
merge_mice = pd.merge(mice_size, avg_tumor, on="Mouse ID")
weight_axis = merge_mice["Weight (g)"]
tumor_data = merge_mice["Tumor Volume (mm3)"]

# Marker area is driven by the weight values as well (s=weight_axis)
plt.scatter(
    weight_axis,
    tumor_data,
    marker="o",
    facecolors="red",
    edgecolors="black",
    s=weight_axis,
)
plt.xlabel("Weight (g)")
plt.ylabel("Average size of tumor (mm3)")
plt.title("Avg tumor volume versus mouse weight for the Capomulin treatment regimen")

In [None]:
# Calculating the correlation coefficient and linear regression model
# between mouse weight and average tumor volume.
correlation = st.pearsonr(weight_axis, tumor_data)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# Least-squares fit: tumor volume = slope * weight + intercept
(slope, intercept, rvalue, pvalue, stderr) = st.linregress(weight_axis, tumor_data)
regress_values = weight_axis * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

plt.scatter(weight_axis, tumor_data)
plt.plot(weight_axis, regress_values, "r-")
# Anchor the equation relative to the axes (5% from the left, 90% up).
# A fixed data coordinate such as (6, 10) lies outside the plotted data,
# and annotations anchored outside the axes are not drawn.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.xlabel("Weight (g)")
plt.ylabel("Average size of tumor (mm3)")
plt.show()