## Observations and Insights 

•	Based on the analysis, Capomulin appears to be the most effective regiment. Out of 25 mice, 84% of them survived by the end of the experiment.
•	Ramicane also seems to be an effective drug, the survival rate was 80%, and the median tumor volume at the end of the experiment was 36.56 (mm3), compared to  Capomulin’s median tumor volume of 38.13 (mm3).
•	The drug regimens that seem to be less effective than Placebo are Infubinol, and Propriva. Out of 25 mice, 11 survived with Placebo, 9 survived with Infubinol, and only 7 survived with Propriva.


In [31]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

In [32]:
# Data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata_df = pd.read_csv(mouse_metadata_path)
study_results_df = pd.read_csv(study_results_path)

# Combine the data into a single dataset
merged_data_df = pd.merge(mouse_metadata_df, study_results_df, on="Mouse ID")

merged_data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [33]:
# Check the number of mice in the DataFrame.
mice_count=len(merged_data_df["Mouse ID"].unique())

#Set total mice data frame
total_mice_df=pd.DataFrame({"Total Mice": mice_count}, index=[0])

total_mice_df

Unnamed: 0,Total Mice
0,249


In [34]:
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mice_timepoint=merged_data_df[merged_data_df.duplicated(["Mouse ID", "Timepoint"])]

duplicate_mice_timepoint[["Mouse ID", "Timepoint"]]

Unnamed: 0,Mouse ID,Timepoint
909,g989,0
911,g989,5
913,g989,10
915,g989,15
917,g989,20


In [35]:
# Get all the data for the duplicate mouse ID. 
duplicate_mice_data=merged_data_df[merged_data_df.duplicated()]

duplicate_mice_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0


In [36]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

#Drop duplicate mice
unique_mouseid_df=merged_data_df["Mouse ID"].drop_duplicates()
unique_mouseid_df

0       k403
10      s185
20      x401
30      m601
40      g791
        ... 
1858    z314
1860    z435
1863    z581
1873    z795
1883    z969
Name: Mouse ID, Length: 249, dtype: object

In [37]:
#Check the number of mice in the clean DataFrame.
total_mice=unique_mouseid_df.count()

total_mice_df=pd.DataFrame({"Total Mice": total_mice}, index=[0])

total_mice_df

Unnamed: 0,Total Mice
0,249


## Summary Statistics

In [38]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

#Calculate the mena, median, variance, standard deviation, and SEM
average_tumor_vol=merged_data_df.groupby("Drug Regimen").mean()["Tumor Volume (mm3)"]
variance_tumor_vol=merged_data_df.groupby("Drug Regimen").var()["Tumor Volume (mm3)"]
std_tumor_vol=merged_data_df.groupby("Drug Regimen").std()["Tumor Volume (mm3)"]
sem_tumor_vol=merged_data_df.groupby("Drug Regimen").sem()["Tumor Volume (mm3)"]

#Create DataFrame for calculations
tumor_summary_df=pd.DataFrame({"Average Tumor Volume": average_tumor_vol,
                            "Variance of Tumor Volume": variance_tumor_vol, 
                            "Std Dev of Tumor Volume": std_tumor_vol, "Sem of Tumor Volume": sem_tumor_vol})

tumor_summary_df

Unnamed: 0_level_0,Average Tumor Volume,Variance of Tumor Volume,Std Dev of Tumor Volume,Sem of Tumor Volume
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,24.947764,4.994774,0.329346
Ceftamin,52.591172,39.290177,6.268188,0.469821
Infubinol,52.884795,43.128684,6.567243,0.492236
Ketapril,55.235638,68.553577,8.279709,0.60386
Naftisol,54.331565,66.173479,8.134708,0.596466
Placebo,54.033581,61.168083,7.821003,0.581331
Propriva,52.322552,42.35107,6.50777,0.512884
Ramicane,40.216745,23.486704,4.846308,0.320955
Stelasyn,54.233149,59.450562,7.710419,0.573111
Zoniferol,53.236507,48.533355,6.966589,0.516398


## Bar Plots

In [39]:
import numpy as np

In [40]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

#Calculate the number of mice at each point in time.

mice_per_regimen=merged_data_df.groupby(["Drug Regimen", "Timepoint"])

regimen_count=mice_per_regimen["Drug Regimen"].count()

regimen_count_unstacked=regimen_count.unstack(level=0)

regimen_count_unstacked

Drug Regimen,Capomulin,Ceftamin,Infubinol,Ketapril,Naftisol,Placebo,Propriva,Ramicane,Stelasyn,Zoniferol
Timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,25,25,25,25,25,25,26,25,24,25
5,25,21,25,23,23,24,25,25,23,24
10,25,20,21,22,21,24,23,24,21,22
15,24,19,21,19,21,20,17,24,21,21
20,23,18,20,19,20,19,17,23,19,17
25,22,18,18,19,18,17,14,23,18,16
30,22,16,17,18,15,15,13,23,17,15
35,22,14,12,17,15,14,10,21,15,14
40,21,14,10,15,15,12,9,20,12,14
45,21,13,9,11,13,11,7,20,11,14


In [41]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

#Create bar plot
mice_bar_chart=regimen_count_unstacked.plot(kind="bar",title="Count of Mice per Regimen", figsize=(7,4))

# Add the descriptive title, x labels and y labels
mice_bar_chart.set_xlabel("Days")
mice_bar_chart.set_ylabel("Number of Mice")
plt.savefig("../Pymaceuticals/mice_bar_chart.png")
plt.show()

<IPython.core.display.Javascript object>

In [42]:
%matplotlib notebook

In [43]:
import numpy as np

In [44]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

regimen_group = merged_data_df.groupby("Drug Regimen")

# Create a dataframe counting the number of data points for each Drug Regimen
drug_count = pd.DataFrame(regimen_group['Drug Regimen'].count())

#Set the x_axis to be the mice count per regimen
x_axis = np.arange(len(drug_count))

#Create bar plots
plt.bar(x_axis, drug_count["Drug Regimen"], color='green', align="center", width = 0.52)

#Set tick locations
tick_locations = [value for value in x_axis]
plt.xticks(tick_locations,list(drug_count.index), rotation='vertical')

plt.xlim(-0.7, len(x_axis)-0.3)
plt.ylim(0, max(drug_count["Drug Regimen"])*1.05)


# Add the descriptive title, x labels and y labels
plt.title("Count of Mice per Regimen")
plt.xlabel("Regimen")
plt.ylabel("Number of Mice")


plt.savefig("../Pymaceuticals/plyplot_mice_bar_chart.png")
plt.show()


<IPython.core.display.Javascript object>

## Pie Plots

In [45]:
#Group mice by gender
gender_breakdown=merged_data_df.groupby("Sex")

#Count the mice by gender
gender_count=gender_breakdown["Sex"].count()

gender_count

Sex
Female    935
Male      958
Name: Sex, dtype: int64

In [46]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
gender_pie_plots=gender_count.plot(kind="pie", y="Sex", title= "Distribution of Mice by Gender",startangle=150,  figsize=(4,4), autopct='%1.1f%%')

plt.axis("equal")

plt.savefig("../Pymaceuticals/mice_pie_chart.png")
plt.show

<IPython.core.display.Javascript object>

<function matplotlib.pyplot.show(*args, **kw)>

In [47]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gender_breakdown=merged_data_df.groupby("Sex")

#Count the mice per gender
gender_count=gender_breakdown["Sex"].count()

gender_index=list(gender_count.index.values)

plt.pie(gender_count, labels=gender_index, startangle=150, autopct='%1.1f%%', )

# Add the descriptive title, and set parameters
plt.title("Distribution of Mice by Gender")
plt.rcParams["font.size"]=5
plt.rcParams["figure.figsize"]=[4,4]
plt.axis("equal")
plt.savefig("../Pymaceuticals/mice_pie_chart.png")
plt.show

<IPython.core.display.Javascript object>

<function matplotlib.pyplot.show(*args, **kw)>

## Quartiles, Outliers and Boxplots

In [48]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

#Set variable list
regimen_list=["Capomulin", "Ramicane", "Zoniferol", "Naftisol"]

#Identify Drug Regimen in regimen list
effective_regimens=merged_data_df[merged_data_df["Drug Regimen"].isin(regimen_list)]

eff_regimens_list=effective_regimens[["Mouse ID", "Drug Regimen","Timepoint", "Tumor Volume (mm3)"]]

#Identify the final tumor volume
tumorvol_by_regimen=eff_regimens_list.groupby(["Drug Regimen", "Mouse ID"]).last()["Tumor Volume (mm3)"]

#Unstack the data
regimen_unstacked=tumorvol_by_regimen.unstack(level=0)

regimen_unstacked

Drug Regimen,Capomulin,Naftisol,Ramicane,Zoniferol
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
a401,,,,66.794156
a411,,,38.407618,
a444,,,43.047543,
a520,,,38.810366,
a644,,,32.978522,
...,...,...,...,...
y793,31.896238,,,
z234,,46.129357,,
z578,,,30.638696,
z795,,65.741070,,


In [49]:
counter = 0

# Calculate quartile for each drug regmin
for regimen in regimen_list:
    quartiles = regimen_unstacked[regimen].quantile([.25,.5,.75]).round(2)
    lowerq = quartiles[0.25].round(2)
    upperq = quartiles[0.75].round(2)
    iqr = round(upperq-lowerq,2)
    lower_bound = round(lowerq - (1.5*iqr),2)
    upper_bound = round(upperq + (1.5*iqr),2)
    
# Print calculations for each regimen
    if counter == 0:
        print(f"--------------------------------------------------------")
    print(f"{regimen} IQR and outliers calculation:")
    print(f"Lower quartile of {regimen}: {lowerq}")
    print(f"Upper quartile of {regimen}: {upperq}")
    print(f"Interquartile range: {iqr}")
    print(f"The the median Tumor Volume (mm3) is: {quartiles[0.5]} ")
    print(f"Values below {lower_bound} are outliers.")
    print(f"Values above {upper_bound} are outliers.")
    print(f"--------------------------------------------------------")
    counter += 1

--------------------------------------------------------
Capomulin IQR and outliers calculation:
Lower quartile of Capomulin: 32.38
Upper quartile of Capomulin: 40.16
Interquartile range: 7.78
The the median Tumor Volume (mm3) is: 38.13 
Values below 20.71 are outliers.
Values above 51.83 are outliers.
--------------------------------------------------------
Ramicane IQR and outliers calculation:
Lower quartile of Ramicane: 31.56
Upper quartile of Ramicane: 40.66
Interquartile range: 9.1
The the median Tumor Volume (mm3) is: 36.56 
Values below 17.91 are outliers.
Values above 54.31 are outliers.
--------------------------------------------------------
Zoniferol IQR and outliers calculation:
Lower quartile of Zoniferol: 49.99
Upper quartile of Zoniferol: 66.79
Interquartile range: 16.8
The the median Tumor Volume (mm3) is: 61.84 
Values below 24.79 are outliers.
Values above 91.99 are outliers.
--------------------------------------------------------
Naftisol IQR and outliers calculati

In [50]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

boxplot_list=[]

#Identify drug regimen in list
for regimen in regimen_list:
    boxplot_list.append(list(regimen_unstacked[regimen].dropna()))

#Create box plot
fig1, ax = plt.subplots(figsize=(3,3))

# Add the descriptive title, x labels and y labels
ax.set_title("Tumor Volume by Drug Regimen")
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Tumor Vol (mm3)")
ax.boxplot(boxplot_list,notch=0,sym='gD')
plt.xticks([1,2,3,4],regimen_list)

# Save and print the box plot
plt.savefig("../Pymaceuticals/boxplot.png")

plt.show()

<IPython.core.display.Javascript object>

## Line and Scatter Plots

In [51]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

#Select data for mouse with ID 'y793'
mouse_capomulin = merged_data_df[merged_data_df["Mouse ID"].isin(["y793"])]

tumor_over_time = mouse_capomulin[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

#Reset Index
tumor_reset= tumor_over_time.reset_index()

tumor_plot = tumor_reset[["Mouse ID", "Timepoint", "Tumor Volume (mm3)"]]

#Plot the line
tumor_line_plot = tumor_plot.plot.line()

# Add the descriptive title, x labels and y labels
plt.title("Tumor Volume over Time for Capomulin")
plt.xlabel("Days")
plt.ylabel("Tumor Volume (mm3)")

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Tumor Volume (mm3)')

In [52]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

#Identify drug regimen Capomulin
mice_capomulin = merged_data_df[merged_data_df["Drug Regimen"].isin(["Capomulin"])]

#Set data by Mouse ID, Weight and Tumor Volume
weight_tumor= mice_capomulin[["Mouse ID","Weight (g)", "Tumor Volume (mm3)"]]

#Group data by weight and calculate the average size of tumor
avg_tumor_weight = weight_tumor.groupby("Weight (g)"). mean()["Tumor Volume (mm3)"]

#Reset Index
tumor_weight_df = pd.DataFrame(avg_tumor_weight).reset_index()

#Create scatter plot
capomulin_scatter = tumor_weight_df.plot(kind="scatter", x="Weight (g)", y="Tumor Volume (mm3)", title="Weight vs Avg Tumor Volumen", figsize= (4,4))


<IPython.core.display.Javascript object>

## Correlation and Regression

In [53]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

correlation = st.pearsonr(tumor_weight_df["Weight (g)"],tumor_weight_df["Tumor Volume (mm3)"])

print(f"The correlation between both factors is {round(correlation[0],2)}")

The correlation between both factors is 0.95


In [54]:
from scipy import stats

In [55]:
#Create x and y values 
x_values = tumor_weight_df["Weight (g)"]
y_values = tumor_weight_df["Tumor Volume (mm3)"]

#Create linear regression stats
tv_slope, tv_int, tv_r, tv_p, tv_std_err = stats.linregress(x_values, y_values)
line_eq = f'y = {str(round(tv_slope,2))}x + {str(round(tv_int,2))}'

In [56]:
# Set linear regression formula
tv_fit = tv_slope * x_values + tv_int

In [57]:
#Create scatter plot
plt.scatter(x_values,y_values)
plt.plot(x_values,tv_fit,"r-")

# Add the descriptive title, x labels and y labels
plt.title("Average Tumor Volume vs Weight")
plt.xlabel("Mouse weight (g)")
plt.ylabel("Tumor Volume (mm3)")

#Print linear regression line
print(f"The equation of the regression line is: {line_eq}")
plt.show()
#Note: line printed in the scatter plot above

<IPython.core.display.Javascript object>

The equation of the regression line is: y = 0.89x + 22.76
