## Observations and Insights 

In [274]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
combined_data= pd.merge(mouse_metadata,study_results, how='outer', on='Mouse ID')

# Display the data table for preview
combined_data.head(15)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [275]:
# Checking the number of mice.
mice_count = len(combined_data['Mouse ID'].unique())
print(f"Total Mice Count = {mice_count}")

Total Mice Count = 249


In [276]:
#check the data for any mouse ID with duplicate time points and remove any data associated with that mouse ID.
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
dup_mice= combined_data.loc[combined_data.duplicated(subset=['Mouse ID', 'Timepoint']),'Mouse ID'].unique()

#show which Mice ID is associated with duplicate Time points. Use this ID to select data that will be removed from dataset
print(f"Mice ID(s) {dup_mice} has duplicated time points and its data should be removed from data set")

Mice ID(s) ['g989'] has duplicated time points and its data should be removed from data set


In [277]:
# Optional: Get all the data for the duplicate mouse ID. 
dup_mice_df= combined_data[combined_data.duplicated(['Mouse ID', 'Timepoint'])]
dup_mice_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [278]:
# Create a clean DataFrame by dropping all the records associated with the duplicate mouse by its ID.
cleaned_data=combined_data[combined_data['Mouse ID'].isin(dup_mice)!=True]

#show resulting data set
cleaned_data

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [279]:
# Checking the number of mice in the clean DataFrame.
new_mice_count = len(cleaned_data['Mouse ID'].unique())
print(f"Total Mice Count in cleaned set = {new_mice_count}")

Total Mice Count in cleaned set = 248


## Summary Statistics

In [280]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
sum_stat_df=cleaned_data.groupby('Drug Regimen')

# mean, median, variance, standard deviation, and SEM of the tumor volume. 
mean= sum_stat_df['Tumor Volume (mm3)'].mean()
median=sum_stat_df['Tumor Volume (mm3)'].median()
var=sum_stat_df['Tumor Volume (mm3)'].var()
stdev=sum_stat_df['Tumor Volume (mm3)'].std()
sem=sum_stat_df['Tumor Volume (mm3)'].sem()

#print(mean, median,var, stdev, sem)
# Assemble the resulting series into a single summary dataframe.
tumor_stats_df = pd.DataFrame({"Mean": mean, 
                           "Median": median, 
                           "Variance": var, 
                           "Standard Deviation": stdev, 
                           "SEM": sem})
#show new table with values
tumor_stats_df 

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [281]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Using the aggregation method, produce the same summary statistics in a single line
tumor_stats_again = cleaned_data.groupby('Drug Regimen').agg(['mean','median','var','std','sem'])['Tumor Volume (mm3)']
tumor_stats_again

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


## Bar and Pie Charts

In [303]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.
mice_per_drug=cleaned_data.groupby('Drug Regimen')['Mouse ID'].nunique()
fig1, ax1 = plt.subplots(figsize=(5,4))
plot_mice=mice_per_drug.plot.bar(color='b')

plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")
plt.title("Count of Mice per Treatment")
plt.tight_layout()


<IPython.core.display.Javascript object>

In [304]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.
#get list for x and Y axis
drug_names= tumor_stats_df.index.tolist()
x_axis=drug_names

In [305]:
mice_totals = (cleaned_data.groupby('Drug Regimen')['Mouse ID'].nunique()).tolist()
mice_totals

[25, 25, 25, 25, 25, 25, 24, 25, 24, 25]

In [321]:
#plot results
fig2, ax1 = plt.subplots(figsize=(8, 3))
plt.bar(x_axis, mice_totals, color='r')
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice")
plt.title("Count of Mice per Treatment")
plt.tight_layout()
plt.show()

<IPython.core.display.Javascript object>

In [307]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
gender_data = pd.DataFrame(cleaned_data.groupby('Sex')['Mouse ID'].nunique()).reset_index()

gender_data.head()

Unnamed: 0,Sex,Mouse ID
0,Female,123
1,Male,125


In [313]:
plt.figure(figsize=(10,4))
ax1 = plt.subplot(121, aspect="equal")
gender_data.plot(kind="pie", y = "Mouse ID", ax=ax1, autopct='%1.1f%%',startangle=140, shadow=True, labels=gender_data["Sex"], legend = False)

plt.title('Distribution of Female vs Male')
plt.ylabel('')

<IPython.core.display.Javascript object>

Text(0, 0.5, '')

In [320]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
#create lists

#total counts
gen_totals = (cleaned_data.groupby('Sex')['Mouse ID'].nunique()).tolist()
#to use as labels
gen_names= gender_data['Sex'].tolist()

explode = (0.1, 0)

#start plotting
fig4, ax1 = plt.subplots(figsize=(4, 4))
plt.pie(gen_totals, explode=explode,labels=gen_names, colors=['r','b'], autopct="%1.1f%%", shadow=True, startangle=140,)
plt.title('Distribution of Female vs Male')
plt.ylabel(' ')

plt.show()


<IPython.core.display.Javascript object>

## Quartiles, Outliers and Boxplots

In [341]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin


# Start by getting the last (greatest) timepoint for each mouse
max_timepoint= cleaned_data.groupby('Mouse ID')['Timepoint'].max()

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint
merged_study_data = pd.merge(max_timepoint, cleaned_data, on =("Mouse ID","Timepoint"))

merged_study_data.head()

Unnamed: 0,Mouse ID,Timepoint,Drug Regimen,Sex,Age_months,Weight (g),Tumor Volume (mm3),Metastatic Sites
0,a203,45,Infubinol,Female,20,23,67.973419,2
1,a251,45,Infubinol,Female,21,25,65.525743,1
2,a262,45,Placebo,Female,17,29,70.717621,4
3,a275,45,Ceftamin,Female,20,28,62.999356,3
4,a366,30,Stelasyn,Female,16,29,63.440686,1


In [350]:
# Put treatments into a list for for loop (and later for plot labels)
treatment_list = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Create empty list to fill with tumor vol data (for plotting)
volume_data=[]    

# Calculate the IQR and quantitatively determine if there are any potential outliers. 
    
# Locate the rows which contain mice on each drug and get the tumor volumes
for treatment in treatment_list:    
    temp_df= merged_study_data.loc[merged_study_data['Drug Regimen']== treatment]    
    
    # add subset 
    volumes=temp_df['Tumor Volume (mm3)']
    volume_data.append(volumes)
    
    # Calculate and print the interquartile range (IQR) for each treatment
    quartiles = volumes.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq
    
    print(f"--------------------------------------")
    print(f"Results for for {treatment} are: ")
    print(f"The lower quartile for {treatment} is: {lowerq}")
    print(f"The upper quartile for {treatment} is: {upperq}")
    print(f"The interquartile range (IQR) for {treatment} is: {iqr}")
    print(f"The the median for {treatment} is: {quartiles[0.5]} ")
    
    # Determine outliers using upper and lower bounds
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

--------------------------------------
Results for for Capomulin are: 
The lower quartile for Capomulin is: 32.37735684
The upper quartile for Capomulin is: 40.1592203
The interquartile range (IQR) for Capomulin is: 7.781863460000004
The the median for Capomulin is: 38.125164399999996 
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.
--------------------------------------
Results for for Ramicane are: 
The lower quartile for Ramicane is: 31.56046955
The upper quartile for Ramicane is: 40.65900627
The interquartile range (IQR) for Ramicane is: 9.098536719999998
The the median for Ramicane is: 36.56165229 
Values below 17.912664470000003 could be outliers.
Values above 54.30681135 could be outliers.
--------------------------------------
Results for for Infubinol are: 
The lower quartile for Infubinol is: 54.04860769
The upper quartile for Infubinol is: 65.52574285
The interquartile range (IQR) for Infubinol is: 11.477135160000003
The the medi

In [361]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
fig5, ax5 = plt.subplots(figsize=(8, 5))
plt.title('Tumor Volume by Drug Regimen')
plt.ylabel('Final Tumor Volume (mm3)')
plt.xlabel('Drug Regimen')

#highlight outliers
flierprops = dict(marker='o', markerfacecolor='r', markersize=8, markeredgecolor='black')
plt.boxplot(volume_data, labels=treatment_list, widths = 0.4, flierprops=flierprops, patch_artist=True,vert=True)

plt.ylim(5, 90)

<IPython.core.display.Javascript object>

(5.0, 90.0)

## Line and Scatter Plots

In [368]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
#find mouse to test
cap_df= cleaned_data[cleaned_data['Drug Regimen']== 'Capomulin']
cap_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
10,s185,Capomulin,Female,3,17,0,45.000000,0
11,s185,Capomulin,Female,3,17,5,43.878496,0
12,s185,Capomulin,Female,3,17,10,37.614948,0
13,s185,Capomulin,Female,3,17,15,38.177232,0
14,s185,Capomulin,Female,3,17,20,36.866876,0
...,...,...,...,...,...,...,...,...
440,i557,Capomulin,Female,1,24,45,47.685963,1
1452,r157,Capomulin,Male,22,25,0,45.000000,0
1453,r157,Capomulin,Male,22,25,5,45.597064,0
1454,r157,Capomulin,Male,22,25,10,46.059608,0


In [369]:
#selected Mouse ID m957 from cleaned data to have enough points to graph
cap_line_df= cap_df.loc[cap_df['Mouse ID']== 'm957']
cap_line_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
135,m957,Capomulin,Female,3,19,0,45.0,0
136,m957,Capomulin,Female,3,19,5,45.622381,1
137,m957,Capomulin,Female,3,19,10,46.414518,1
138,m957,Capomulin,Female,3,19,15,39.804453,1
139,m957,Capomulin,Female,3,19,20,38.909349,1
140,m957,Capomulin,Female,3,19,25,37.695432,1
141,m957,Capomulin,Female,3,19,30,38.212479,1
142,m957,Capomulin,Female,3,19,35,32.562839,1
143,m957,Capomulin,Female,3,19,40,32.947615,1
144,m957,Capomulin,Female,3,19,45,33.329098,1


In [374]:
fig6, ax6 = plt.subplots(figsize=(7, 5))
plt.plot(cap_line_df['Timepoint'], cap_line_df['Tumor Volume (mm3)'], marker = 'o')

plt.title('Capomulin Treatmeant of Mouse m957')
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')


<IPython.core.display.Javascript object>

Text(0, 0.5, 'Tumor Volume (mm3)')

In [385]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

#create data set that has average volume by finding the mean total volume
avg_vol= pd.DataFrame(cap_df.groupby(["Mouse ID", 'Weight (g)'])["Tumor Volume (mm3)"].mean()).reset_index()
avg_vol=avg_vol.rename(columns={"Tumor Volume (mm3)": "Average Volume"})
avg_vol

Unnamed: 0,Mouse ID,Weight (g),Average Volume
0,b128,22,41.963636
1,b742,21,40.083699
2,f966,17,36.505973
3,g288,19,41.990097
4,g316,22,44.613344
5,i557,24,44.80581
6,i738,20,39.141053
7,j119,23,44.465236
8,j246,21,43.216925
9,l509,21,44.434474


In [388]:
fig7, ax7 = plt.subplots(figsize=(7, 5))
plt.scatter(avg_vol['Weight (g)'], avg_vol['Average Volume'])


plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title('Weight vs Average Tumor Volume for Capomulint')

plt.show()


<IPython.core.display.Javascript object>

## Correlation and Regression

In [412]:
#additional imports
from scipy.stats import linregress
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen

mouse_wght= avg_vol.iloc[:,1]
avg_tumor_volume = avg_vol.iloc[:,2]

correlation = st.pearsonr(mouse_wght, avg_tumor_volume)
print(f"The correlation between both factors is {round(correlation[0],2)}")

The correlation between both factors is 0.84


In [413]:
fig8, ax8 = plt.subplots(figsize=(7, 5))

x_values = avg_vol['Weight (g)']
y_values = avg_vol['Average Volume']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values, y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(6,10),color="red")
plt.title('Linear Regression: Mouse Weight vs Average Tumor Volume')
plt.xlabel("Mouse Weight")
plt.ylabel("Average Tumor Volume")
plt.show()

<IPython.core.display.Javascript object>