## Observations and Insights 

In [26]:
# Dependencies and Setup
%matplotlib notebook
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "C:/Users/chadi/Documents/GitHub/PyPlot_Challenge/Resources/Mouse_metadata.csv"
study_results_path = "C:/Users/chadi/Documents/GitHub/PyPlot_Challenge/Resources/Study_results.csv"

# Read the mouse data and the study results
mm = pd.read_csv(mouse_metadata_path)
sr = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mr = pd.merge(mm, sr, on = "Mouse ID")

# Display the data table for preview
mr



Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
# Checking the number of mice.
len(mr['Mouse ID'].unique())

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
dmr = mr[mr.duplicated(["Mouse ID", "Timepoint"], keep=False)]
idmr = dmr["Mouse ID"].unique()
idmr

array(['g989'], dtype=object)

In [5]:
# Optional: Get all the data for the duplicate mouse ID. 
dmrd = mr[mr["Mouse ID"].isin(idmr)]
dmrd

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [16]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cmr = mr[~ mr["Mouse ID"].isin(idmr)]
cmr.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [17]:
# Checking the number of mice in the clean DataFrame.
len(cmr["Mouse ID"].unique())

248

## Summary Statistics

In [18]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
dg = cmr.groupby("Drug Regimen")
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
mean = dg["Tumor Volume (mm3)"].mean()
median = dg["Tumor Volume (mm3)"].median()
var = dg["Tumor Volume (mm3)"].var()
std_var = dg["Tumor Volume (mm3)"].std()
sem = dg["Tumor Volume (mm3)"].sem()
# Assemble the resulting series into a single summary dataframe.
dgdf = pd.DataFrame({"Mean": mean, "Median": median, "Variance": var, "Standard Deviation": std_var, "SEM": sem})
dgdf

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Using the aggregation method, produce the same summary statistics in a single line
aggdg = cmr.groupby("Drug Regimen").agg({"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]})
aggdg

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [27]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
tpd = cmr.groupby("Drug Regimen")["Timepoint"].count()
tpd.plot(kind="bar", figsize=(12,4))
plt.title("Total number of timepoints per drug regimen for all mice")
plt.show()
plt.tight_layout()

In [25]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
tpdp = cmr["Drug Regimen"].sort_values().unique()
cot = cmr.groupby("Drug Regimen")["Timepoint"].count()

x_axis = np.arange(len(tpdp))
tks = [x for x in x_axis]

plt.figure(figsize=(12,4))
plt.title("Total number of timepoints per drug regimen for all mice")
plt.xlabel("Drug Regimen")

plt.bar(x_axis, cot, align="center")
plt.xticks(tks, tpdp, rotation="vertical")

plt.show()
plt.tight_layout()

In [36]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
cmm = mm.loc[~mm["Mouse ID"].isin(idmr)]
dmm = cmm["Sex"].value_counts()

dmm.plot(kind="pie", figsize=(5,5), startangle=60, autopct='%.2f%%')
plt.title("Distribution of female versus male mice")
plt.show()

In [37]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
labels = cmm["Sex"].unique()
plt.pie(dmm, labels=labels ,autopct="%.2f%%", startangle=60)
plt.axis("equal")
plt.title("Distribution of female versus male mice")
plt.ylabel("Sex")
plt.show()

## Quartiles, Outliers and Boxplots

In [38]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
mtp = cmr.groupby("Mouse ID")["Timepoint"].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
tvlt = pd.merge(mtp, cmr, how="left", on=["Mouse ID", "Timepoint"])
tvlt.head()

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,a203,45,Infubinol,Female,20,23,67.973419,2
1,a251,45,Infubinol,Female,21,25,65.525743,1
2,a262,45,Placebo,Female,17,29,70.717621,4
3,a275,45,Ceftamin,Female,20,28,62.999356,3
4,a366,30,Stelasyn,Female,16,29,63.440686,1


In [42]:
# Put treatments into a list for for loop (and later for plot labels)
tlt = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Create empty list to fill with tumor vol data (for plotting)
tvd = []
    
# Calculate the IQR and quantitatively determine if there are any potential outliers. 
for x in tlt: 
    ttd = tvlt.loc[tvlt["Drug Regimen"] == x] 
    tvd.append(ttd["Tumor Volume (mm3)"])
    
    # Determine outliers using upper and lower bounds
for index, x in enumerate(tvd):
    trn = tlt[index]
    # init info needed for whisker box
    quartiles = x.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq
    # print statements to display required information
    print(f"{trn} lower quartile: {round(lowerq,2)}.")
    print(f"{trn} upper quartile: {round(upperq,2)}.")
    print(f"{trn} interquartile range: {round(iqr,2)}.")
    print(f"{trn} median: {round(quartiles[0.5],2)}.")
    #calculating lower and upper bound
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    # check values against bounds for outliers.
    print(f"{trn} analysis: values < {round(lower_bound,2)} --> outliers.")
    print(f"{trn} analysis: values > {round(upper_bound,2)} --> outliers.")
    ot = (tvlt.loc[(tvlt["Drug Regimen"] == trn) & ((tvlt["Tumor Volume (mm3)"] < lower_bound) | (tvlt["Tumor Volume (mm3)"] > upper_bound))])
    otn = len(ot)
    print(f"{trn} outliers count: {otn}.")
    if otn > 0:
        otl = ot["Mouse ID"].to_numpy()
        print(f"Outliers are as follows: {otl}.")
    print("")
    

Capomulin lower quartile: 32.38.
Capomulin upper quartile: 40.16.
Capomulin interquartile range: 7.78.
Capomulin median: 38.13.
Capomulin analysis: values < 20.7 --> outliers.
Capomulin analysis: values > 51.83 --> outliers.
Capomulin outliers count: 0.

Ramicane lower quartile: 31.56.
Ramicane upper quartile: 40.66.
Ramicane interquartile range: 9.1.
Ramicane median: 36.56.
Ramicane analysis: values < 17.91 --> outliers.
Ramicane analysis: values > 54.31 --> outliers.
Ramicane outliers count: 0.

Infubinol lower quartile: 54.05.
Infubinol upper quartile: 65.53.
Infubinol interquartile range: 11.48.
Infubinol median: 60.17.
Infubinol analysis: values < 36.83 --> outliers.
Infubinol analysis: values > 82.74 --> outliers.
Infubinol outliers count: 1.
Outliers are as follows: ['c326'].

Ceftamin lower quartile: 48.72.
Ceftamin upper quartile: 64.3.
Ceftamin interquartile range: 15.58.
Ceftamin median: 59.85.
Ceftamin analysis: values < 25.36 --> outliers.
Ceftamin analysis: values > 87.67

In [47]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume per Regimen")
ax1.set_xticklabels(tlt)
mp = ax1.boxplot(tvd)
for x in mp["fliers"]:
    x.set(marker="o", markeredgecolor="blue", alpha=0.8)
plt.show()

  ax1.set_xticklabels(tlt)


## Line and Scatter Plots

In [49]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
mtc = "m957"
mdf = cmr.loc[cmr["Mouse ID"] == mtc, ["Timepoint", "Tumor Volume (mm3)"]]
plt.plot(mdf["Timepoint"], mdf["Tumor Volume (mm3)"], color="blue", label=mtc)
plt.title(f"Tumor volume against time point for mouse id: {mtc}")
plt.xlabel("Time Point")
plt.ylabel("Tumor Volume (mm3)")
plt.grid()
plt.xlim(0,max(mdf["Timepoint"]))
plt.ylim(int(min(mdf["Tumor Volume (mm3)"]))-2,int(max(mdf["Tumor Volume (mm3)"]))+2)

(30.0, 48.0)

In [50]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen
cdf = cmr.loc[cmr["Drug Regimen"] == "Capomulin", ["Mouse ID", "Tumor Volume (mm3)", "Weight (g)"]]
atm = cdf.groupby("Mouse ID")["Tumor Volume (mm3)"].mean()
weight = cdf.groupby("Mouse ID")["Weight (g)"].mean()
plt.scatter(weight,atm)
plt.title("Average tumor Volume against mouse weight for Capomulin")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [54]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
slope, intercept, rvalue, pvalue, stderr = st.linregress(weight, atm)
fit = slope * weight + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(weight, atm)
plt.plot(weight,fit,"r-")
plt.title("Average tumor Volume against mouse weight for Capomulin")
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.annotate(line_eq,(22,36), fontsize=12, color="red")
print(f"{round(rvalue**2,2)} shows a positive correlation between mouse weight and average tumor volume.")

0.71 shows a positive correlation between mouse weight and average tumor volume.
