## Observations and Insights 

In [92]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Locations of the two study CSV files (relative paths)
mouse_metadata_path = "Resources/Mouse_metadata.csv"
study_results_path = "Resources/Study_results.csv"

# Load each CSV into its own DataFrame (metadata first, then results)
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

# Outer-join the two tables on their shared "Mouse ID" column
comb_df = mouse_metadata.merge(study_results, on="Mouse ID", how="outer")

# Preview the combined table
comb_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [112]:
# Number of distinct "Mouse ID" values in the combined table.
# dropna=False keeps the count equal to len(Series.unique()), which would
# count a missing ID as one value.
comb_df["Mouse ID"].nunique(dropna=False)

249

In [94]:
# Rows of comb_df whose (Mouse ID, Timepoint) pair repeats an earlier row.
# The columns are selected directly from comb_df so that a misspelled column
# label raises a KeyError; re-wrapping the frame with
# pd.DataFrame(comb_df, columns=[...]) would instead return all-NaN columns.
# Only the second and later occurrences of each pair are kept (keep="first"
# is the default of DataFrame.duplicated).
dup_mt_df = comb_df.loc[
    comb_df.duplicated(subset=["Mouse ID", "Timepoint"]),
    ["Mouse ID", "Timepoint"],
]
dup_mt_df


Unnamed: 0,Mouse ID,Timepoint
909,g989,0
911,g989,5
913,g989,10
915,g989,15
917,g989,20


In [93]:
# Same duplicate check on the raw study results: rows whose
# (Mouse ID, Timepoint) pair repeats an earlier row of study_results.
# Direct column selection raises a KeyError on a misspelled label, where
# pd.DataFrame(study_results, columns=[...]) would return all-NaN columns.
dup_study = study_results.loc[
    study_results.duplicated(subset=["Mouse ID", "Timepoint"]),
    ["Mouse ID", "Timepoint"],
]
dup_study

Unnamed: 0,Mouse ID,Timepoint
137,g989,0
360,g989,5
681,g989,10
869,g989,15
1111,g989,20


In [98]:
# Pull every row for mouse g989 out of each of the two source tables so the
# duplicated records can be inspected; only the study rows are displayed.
g989_meta = mouse_metadata[mouse_metadata["Mouse ID"].eq("g989")]
g989_study = study_results[study_results["Mouse ID"].eq("g989")]
g989_study

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
107,g989,0,45.0,0
137,g989,0,45.0,0
329,g989,5,48.786801,0
360,g989,5,47.570392,0
620,g989,10,51.745156,0
681,g989,10,49.880528,0
815,g989,15,51.325852,1
869,g989,15,53.44202,0
950,g989,20,55.326122,1
1111,g989,20,54.65765,1


In [96]:
# Optional: every combined-table row that belongs to the duplicated mouse ID.
g989_all = comb_df[comb_df["Mouse ID"].eq("g989")]
g989_all

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
908,g989,Propriva,Female,21,26,0,45.0,0
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [71]:
# Build the cleaned table: keep every combined row except those of mouse g989.
clean_df = comb_df[comb_df["Mouse ID"].ne("g989")]
clean_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [100]:
# Distinct mice left in the cleaned table; with g989 removed the expected
# count is 248. dropna=False matches the behaviour of len(Series.unique()).
clean_df["Mouse ID"].nunique(dropna=False)


248

In [80]:
# Apply the same g989 exclusion to the raw study results table.
clean_study_results = study_results[study_results["Mouse ID"].ne("g989")]
clean_study_results

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.000000,0
1,f932,0,45.000000,0
2,g107,0,45.000000,0
3,a457,0,45.000000,0
4,c819,0,45.000000,0
...,...,...,...,...
1888,r944,45,41.581521,2
1889,u364,45,31.023923,3
1890,p438,45,61.433892,1
1891,x773,45,58.634971,4


## Summary Statistics

In [110]:
# Summary statistics table of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.
# A single groupby/agg call produces all five statistics at once; the dict
# form keeps "Tumor Volume (mm3)" as the top level of the column header.
clean_df.groupby("Drug Regimen").agg(
    {"Tumor Volume (mm3)": ["mean", "median", "var", "std", "sem"]}
)


Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [88]:
# Descriptive statistics for every (Drug Regimen, Timepoint) group.
# describe() reports count, mean, std, min, 25%/50%/75% and max per column.
# NOTE(review): the stated goal is mean/median/variance/std/SEM of tumor volume
# per regimen; this table has no variance or SEM and is also split by
# timepoint -- confirm whether that is intended.
drug_data = clean_df.groupby(by=["Drug Regimen", "Timepoint"])
drug_stats = drug_data.describe()

# Display the first 15 rows of the statistics table
drug_stats.iloc[:15]


Unnamed: 0_level_0,Unnamed: 1_level_0,Age_months,Age_months,Age_months,Age_months,Age_months,Age_months,Age_months,Age_months,Weight (g),Weight (g),...,Tumor Volume (mm3),Tumor Volume (mm3),Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites,Metastatic Sites
Unnamed: 0_level_1,Unnamed: 1_level_1,count,mean,std,min,25%,50%,75%,max,count,mean,...,75%,max,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Timepoint,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2,Unnamed: 13_level_2,Unnamed: 14_level_2,Unnamed: 15_level_2,Unnamed: 16_level_2,Unnamed: 17_level_2,Unnamed: 18_level_2,Unnamed: 19_level_2,Unnamed: 20_level_2,Unnamed: 21_level_2,Unnamed: 22_level_2
Capomulin,0,25.0,14.04,7.850053,1.0,7.0,17.0,21.0,24.0,25.0,20.12,...,45.0,45.0,25.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
Capomulin,5,25.0,14.04,7.850053,1.0,7.0,17.0,21.0,24.0,25.0,20.12,...,45.760886,45.895647,25.0,0.16,0.374166,0.0,0.0,0.0,0.0,1.0
Capomulin,10,25.0,14.04,7.850053,1.0,7.0,17.0,21.0,24.0,25.0,20.12,...,46.408361,46.759074,25.0,0.32,0.627163,0.0,0.0,0.0,0.0,2.0
Capomulin,15,24.0,13.666667,7.788881,1.0,7.0,16.5,20.25,24.0,24.0,20.0,...,44.634897,47.488599,24.0,0.375,0.646899,0.0,0.0,0.0,1.0,2.0
Capomulin,20,23.0,13.304348,7.754382,1.0,7.0,16.0,19.5,24.0,23.0,19.782609,...,43.596783,48.158209,23.0,0.652174,0.775107,0.0,0.0,0.0,1.0,2.0
Capomulin,25,22.0,13.181818,7.91404,1.0,7.0,16.5,19.75,24.0,22.0,19.909091,...,43.211711,46.809225,22.0,0.818182,0.852803,0.0,0.0,1.0,1.0,3.0
Capomulin,30,22.0,13.181818,7.91404,1.0,7.0,16.5,19.75,24.0,22.0,19.909091,...,42.576043,45.261384,22.0,1.090909,0.811177,0.0,1.0,1.0,1.75,3.0
Capomulin,35,22.0,13.181818,7.91404,1.0,7.0,16.5,19.75,24.0,22.0,19.909091,...,41.179873,45.941949,22.0,1.181818,0.795006,0.0,1.0,1.0,2.0,3.0
Capomulin,40,21.0,12.809524,7.909608,1.0,7.0,16.0,19.0,24.0,21.0,19.857143,...,40.770813,46.82107,21.0,1.380952,0.804748,0.0,1.0,1.0,2.0,3.0
Capomulin,45,21.0,12.809524,7.909608,1.0,7.0,16.0,19.0,24.0,21.0,19.857143,...,40.15922,47.685963,21.0,1.47619,0.928388,0.0,1.0,1.0,2.0,3.0


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [10]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
