In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
mouse_study_data = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Display the data table for preview
mouse_study_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [2]:
# Checking the number of mice.
unique_mice = (mouse_study_data["Mouse ID"].unique())

num_of_mice = len(unique_mice)

print(f"There are {num_of_mice} unique mice in this study.")

There are 249 unique mice in this study.


In [3]:
 # Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.
duplicate_mice = mouse_study_data[mouse_study_data.duplicated(['Mouse ID', 'Timepoint'])]
    

In [4]:
# Optional: Get all the data for the duplicate mouse ID.
duplicate_mice

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
cleaned_mouse_study_data = mouse_study_data.drop_duplicates(subset=['Mouse ID', 'Timepoint'])

cleaned_mouse_study_data.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [7]:
 # Checking the number of mice in the clean DataFrame.
cleaned_unique_mice = (cleaned_mouse_study_data["Mouse ID"].unique())

cleaned_num_of_mice = len(cleaned_unique_mice)

print(f"There are {cleaned_num_of_mice} unique mice in this study after removing duplicate data.")

There are 249 unique mice in this study after removing duplicate data.


In [17]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
drug_list = (cleaned_mouse_study_data["Drug Regimen"].unique())

drug_list

array(['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol'],
      dtype=object)

In [20]:
# Creating dataframes for each drug regimen in order to find the summary statistics
ramicane_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Ramicane")]
capomulin_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Capomulin")]
infubinol_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Infubinol")]
placedo_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Placebo")]
ceftamin_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Ceftamin")]
stelasyn_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Stelasyn")]
zoniferol_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Zoniferol")]
ketapril_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Ketapril")]
propriva_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Propriva")]
naftisol_data = cleaned_mouse_study_data[(cleaned_mouse_study_data["Drug Regimen"] == "Naftisol")]


In [21]:
ramicane_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,228.0,228.0,228.0,228.0,228.0
mean,10.684211,19.679825,21.425439,40.216745,0.548246
std,5.946629,3.235014,14.27572,4.846308,0.691259
min,1.0,16.0,0.0,22.050126,0.0
25%,7.0,17.0,10.0,36.674635,0.0
50%,9.0,19.0,20.0,40.673236,0.0
75%,18.0,22.0,35.0,45.0,1.0
max,23.0,25.0,45.0,47.622816,3.0


In [22]:
capomulin_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,230.0,230.0,230.0,230.0,230.0
mean,13.456522,19.965217,21.565217,40.675741,0.713043
std,7.721423,2.732404,14.384994,4.994774,0.848993
min,1.0,15.0,0.0,23.343598,0.0
25%,7.0,17.0,10.0,37.685933,0.0
50%,16.5,20.5,20.0,41.557809,0.0
75%,20.0,22.0,35.0,45.0,1.0
max,24.0,25.0,45.0,48.158209,3.0


In [23]:
infubinol_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,178.0,178.0,178.0,178.0,178.0
mean,16.230337,27.196629,18.174157,52.884795,0.960674
std,7.510278,2.18381,13.473473,6.567243,1.027104
min,1.0,23.0,0.0,36.321346,0.0
25%,8.0,25.0,5.0,47.312353,0.0
50%,20.0,27.0,15.0,51.820584,1.0
75%,23.0,29.0,30.0,57.314444,2.0
max,24.0,30.0,45.0,72.226731,4.0


In [24]:
placedo_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,181.0,181.0,181.0,181.0,181.0
mean,10.734807,27.928177,18.674033,54.033581,1.441989
std,6.354907,1.837973,13.890798,7.821003,1.338824
min,1.0,25.0,0.0,45.0,0.0
25%,5.0,27.0,5.0,47.459053,0.0
50%,10.0,28.0,15.0,52.288934,1.0
75%,17.0,30.0,30.0,59.916934,2.0
max,21.0,30.0,45.0,73.212939,4.0


In [25]:
ceftamin_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,178.0,178.0,178.0,178.0,178.0
mean,13.247191,27.398876,19.747191,52.591172,1.179775
std,8.071406,1.58146,14.283969,6.268188,1.184283
min,2.0,25.0,0.0,45.0,0.0
25%,6.0,26.0,5.0,47.208427,0.0
50%,12.0,28.0,20.0,51.776157,1.0
75%,20.0,28.0,30.0,56.801438,2.0
max,24.0,30.0,45.0,68.923185,4.0


In [26]:
stelasyn_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,181.0,181.0,181.0,181.0,181.0
mean,12.78453,27.856354,19.226519,54.233149,0.872928
std,7.939562,1.643616,13.84271,7.710419,0.972046
min,1.0,25.0,0.0,45.0,0.0
25%,4.0,27.0,5.0,48.047139,0.0
50%,14.0,28.0,20.0,52.431737,1.0
75%,21.0,29.0,30.0,58.719297,1.0
max,23.0,30.0,45.0,75.12369,4.0


In [27]:
zoniferol_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,182.0,182.0,182.0,182.0,182.0
mean,12.598901,27.692308,19.368132,53.236507,1.230769
std,5.786114,1.419612,14.384679,6.966589,1.248884
min,2.0,25.0,0.0,45.0,0.0
25%,8.0,27.0,5.0,47.337876,0.0
50%,12.5,28.0,15.0,51.818479,1.0
75%,16.0,29.0,30.0,57.954259,2.0
max,24.0,30.0,45.0,73.324432,4.0


In [28]:
ketapril_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,188.0,188.0,188.0,188.0,188.0
mean,15.659574,27.861702,19.707447,55.235638,1.297872
std,6.01967,1.841884,14.029935,8.279709,1.393873
min,1.0,25.0,0.0,45.0,0.0
25%,11.75,26.0,5.0,48.232987,0.0
50%,18.0,28.0,20.0,53.698743,1.0
75%,19.0,30.0,30.0,60.870951,2.0
max,24.0,30.0,45.0,78.567014,4.0


In [29]:
propriva_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,156.0,156.0,156.0,156.0,156.0
mean,10.570513,27.076923,17.083333,52.393463,1.0
std,7.188801,1.686908,13.571297,6.568014,1.08954
min,1.0,25.0,0.0,45.0,0.0
25%,5.0,26.0,5.0,47.046068,0.0
50%,8.0,26.0,15.0,50.909965,1.0
75%,16.0,29.0,26.25,56.491585,1.0
max,24.0,30.0,45.0,72.455421,4.0


In [30]:
naftisol_data.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,186.0,186.0,186.0,186.0,186.0
mean,12.0,27.166667,19.623656,54.331565,1.182796
std,6.715855,1.499249,14.184814,8.134708,1.216519
min,2.0,25.0,0.0,45.0,0.0
25%,8.0,26.0,5.0,47.285874,0.0
50%,9.0,27.0,20.0,52.509285,1.0
75%,19.0,28.0,30.0,59.963034,2.0
max,23.0,30.0,45.0,76.668817,4.0


In [None]:


# This method produces everything in a single groupby function

In [None]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 

In [None]:
 # Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

In [None]:
 # Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
 # Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen