## Observations and Insights 

In [173]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "Resource/Mouse_metadata.csv"
study_results_path = "Resource/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset: one row per study measurement,
# annotated with that mouse's metadata. The join key is listed once; the
# original passed "Mouse ID" twice, which is redundant.
Trial_data_complete = pd.merge(mouse_metadata, study_results, how="left", on="Mouse ID")

# Display the data table for preview.
# Trial_data_complete_df is a second name for the same object, not a copy.
Trial_data_complete_df = Trial_data_complete
Trial_data_complete_df.head()



Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [174]:
# Number of distinct mouse IDs in the merged data.
# dropna=False keeps the count identical to len(<column>.unique()).
No_of_mouse = Trial_data_complete_df["Mouse ID"].nunique(dropna=False)
No_of_mouse



249

In [175]:
# Rename "Mouse ID" to the space-free "MouseID"; rename() returns a new frame,
# so Trial_data_complete_df keeps its original column name.
Trial_data_complete_renamed_df = Trial_data_complete_df.rename(
    columns={"Mouse ID": "MouseID"}
)
Trial_data_complete_renamed_df

Unnamed: 0,MouseID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [176]:
# Build a combined "<MouseID>-<Timepoint>" key so that a mouse measured more
# than once at the same timepoint shows up as a repeated key value.
# Note: this adds the column to Trial_data_complete_renamed_df in place.
timepoint_as_text = Trial_data_complete_renamed_df["Timepoint"].astype(str)
Trial_data_complete_renamed_df["MouseID_Timepoint"] = (
    Trial_data_complete_renamed_df["MouseID"] + "-" + timepoint_as_text
)
Trial_data_complete_renamed_df

Unnamed: 0,MouseID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,MouseID_Timepoint
0,k403,Ramicane,Male,21,16,0,45.000000,0,k403-0
1,k403,Ramicane,Male,21,16,5,38.825898,0,k403-5
2,k403,Ramicane,Male,21,16,10,35.014271,1,k403-10
3,k403,Ramicane,Male,21,16,15,34.223992,1,k403-15
4,k403,Ramicane,Male,21,16,20,32.997729,1,k403-20
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,z969-25
1889,z969,Naftisol,Male,9,30,30,65.841013,3,z969-30
1890,z969,Naftisol,Male,9,30,35,69.176246,4,z969-35
1891,z969,Naftisol,Male,9,30,40,70.314904,4,z969-40


In [177]:
# Second name for the renamed frame (an alias to the same object, not a copy).
Trial_data_complete_rename = Trial_data_complete_renamed_df

In [178]:
# Count rows whose MouseID/Timepoint key repeats an earlier row
# (the first occurrence of each key is not counted).
Trial_data_complete_renamed_df.duplicated(subset="MouseID_Timepoint").sum()

5

In [179]:
# Show the rows flagged as repeats of an earlier MouseID/Timepoint key.
# In this dataset they all belong to a single mouse.
repeated_key_mask = Trial_data_complete_renamed_df["MouseID_Timepoint"].duplicated()
Trial_data_complete_rename[repeated_key_mask]

Unnamed: 0,MouseID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,MouseID_Timepoint
909,g989,Propriva,Female,21,26,0,45.0,0,g989-0
911,g989,Propriva,Female,21,26,5,47.570392,0,g989-5
913,g989,Propriva,Female,21,26,10,49.880528,0,g989-10
915,g989,Propriva,Female,21,26,15,53.44202,0,g989-15
917,g989,Propriva,Female,21,26,20,54.65765,1,g989-20


In [180]:
# Create a clean DataFrame by dropping every row of any mouse that has a
# repeated (MouseID, Timepoint) measurement. The IDs are derived from the data
# instead of hard-coding "g989", so the cell stays correct if the input changes.
duplicate_mouse_ids = Trial_data_complete_rename.loc[
    Trial_data_complete_rename.duplicated(subset=["MouseID", "Timepoint"]),
    "MouseID",
].unique()

# Remove all measurements of the affected mice, not just the repeated rows.
Trial_data_complete_cleaned_df = Trial_data_complete_rename[
    ~Trial_data_complete_rename["MouseID"].isin(duplicate_mouse_ids)
]
Trial_data_complete_cleaned_df

Unnamed: 0,MouseID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,MouseID_Timepoint
0,k403,Ramicane,Male,21,16,0,45.000000,0,k403-0
1,k403,Ramicane,Male,21,16,5,38.825898,0,k403-5
2,k403,Ramicane,Male,21,16,10,35.014271,1,k403-10
3,k403,Ramicane,Male,21,16,15,34.223992,1,k403-15
4,k403,Ramicane,Male,21,16,20,32.997729,1,k403-20
...,...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2,z969-25
1889,z969,Naftisol,Male,9,30,30,65.841013,3,z969-30
1890,z969,Naftisol,Male,9,30,35,69.176246,4,z969-35
1891,z969,Naftisol,Male,9,30,40,70.314904,4,z969-40


In [181]:
# Sanity check: distinct mouse IDs remaining after the duplicated mouse was removed.
# dropna=False keeps the count identical to len(<column>.unique()).
No_of_mouse_checker = Trial_data_complete_cleaned_df["MouseID"].nunique(dropna=False)
No_of_mouse_checker

248

## Summary Statistics

In [182]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# This method is the most straightforward, creating multiple series and putting them all together at the end.


In [183]:
# List the column names of the cleaned dataset (used to pick the columns
# summarized in the cells below).
Trial_data_complete_cleaned_df.columns

Index(['MouseID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites',
       'MouseID_Timepoint'],
      dtype='object')

In [184]:
# Starting point for the summary statistics: which regimens are present and
# how many measurement rows each one has (value_counts sorts by count, descending).
No_of_regimen = Trial_data_complete_cleaned_df["Drug Regimen"].value_counts()
No_of_regimen

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Placebo      181
Stelasyn     181
Infubinol    178
Ceftamin     178
Propriva     148
Name: Drug Regimen, dtype: int64

In [185]:
# Keep only the rows recorded under the "Capomulin" regimen.
is_capomulin = Trial_data_complete_cleaned_df["Drug Regimen"] == "Capomulin"
Capomulin_df = Trial_data_complete_cleaned_df.loc[is_capomulin]
Capomulin_df

Unnamed: 0,MouseID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites,MouseID_Timepoint
10,s185,Capomulin,Female,3,17,0,45.000000,0,s185-0
11,s185,Capomulin,Female,3,17,5,43.878496,0,s185-5
12,s185,Capomulin,Female,3,17,10,37.614948,0,s185-10
13,s185,Capomulin,Female,3,17,15,38.177232,0,s185-15
14,s185,Capomulin,Female,3,17,20,36.866876,0,s185-20
...,...,...,...,...,...,...,...,...,...
440,i557,Capomulin,Female,1,24,45,47.685963,1,i557-45
1452,r157,Capomulin,Male,22,25,0,45.000000,0,r157-0
1453,r157,Capomulin,Male,22,25,5,45.597064,0,r157-5
1454,r157,Capomulin,Male,22,25,10,46.059608,0,r157-10


In [186]:
# Reference values: mean of every numeric column over all Capomulin rows.
# numeric_only=True skips the text columns explicitly; without it pandas emits
# a warning on older versions and raises a TypeError on pandas >= 2.0.
Capomulin_mean_Statistic = Capomulin_df.mean(numeric_only=True)
Capomulin_mean_Statistic


  Capomulin_mean_Statistic = Capomulin_df.mean()


Age_months            13.456522
Weight (g)            19.965217
Timepoint             21.565217
Tumor Volume (mm3)    40.675741
Metastatic Sites       0.713043
dtype: float64

In [187]:
# Mean age (months) over all Capomulin rows.
Capomulin_Age_months_average = Capomulin_df.loc[:, "Age_months"].mean()
Capomulin_Age_months_average

13.456521739130435

In [188]:
# Mean weight (g) over all Capomulin rows.
Capomulin_weight_average = Capomulin_df.loc[:, "Weight (g)"].mean()
Capomulin_weight_average

19.965217391304346

In [189]:
# Mean tumor volume (mm3) over all Capomulin rows.
Capomulin_Tumor_Volume_average = Capomulin_df.loc[:, "Tumor Volume (mm3)"].mean()
Capomulin_Tumor_Volume_average

40.67574114100001

In [190]:
# Mean number of metastatic sites over all Capomulin rows.
Capomulin_Metastatic_sites_average = Capomulin_df.loc[:, "Metastatic Sites"].mean()
Capomulin_Metastatic_sites_average


0.7130434782608696

In [209]:
# Reference values: median of every numeric column over all Capomulin rows.
# numeric_only=True skips the text columns explicitly; without it pandas emits
# a warning on older versions and raises a TypeError on pandas >= 2.0.
Capomulin_median_statistic = Capomulin_df.median(numeric_only=True)
Capomulin_median_statistic

  Capomulin_median_statistic = Capomulin_df.median()


Age_months            16.500000
Weight (g)            20.500000
Timepoint             20.000000
Tumor Volume (mm3)    41.557809
Metastatic Sites       0.000000
dtype: float64

In [192]:
# Median age (months) over all Capomulin rows.
Capomulin_Age_months_median = Capomulin_df.loc[:, "Age_months"].median()
Capomulin_Age_months_median


16.5

In [193]:
# Median weight (g) over all Capomulin rows.
Capomulin_weight_median = Capomulin_df.loc[:, "Weight (g)"].median()
Capomulin_weight_median

20.5

In [194]:
# Median tumor volume (mm3) over all Capomulin rows.
Capomulin_Tumor_Volume_median = Capomulin_df.loc[:, "Tumor Volume (mm3)"].median()
Capomulin_Tumor_Volume_median

41.557808879999996

In [195]:
# Median number of metastatic sites over all Capomulin rows.
Capomulin_Metastatic_sites_median = Capomulin_df.loc[:, "Metastatic Sites"].median()
Capomulin_Metastatic_sites_median

0.0

In [221]:
# Sample variance of age (months); ddof=1 is the pandas default, written out explicitly.
Capomulin_Age_months_variance = Capomulin_df["Age_months"].var(ddof=1)
Capomulin_Age_months_variance

59.62037212834644

In [222]:
# Sample variance of weight (g); ddof=1 is the pandas default, written out explicitly.
Capomulin_weight_variance = Capomulin_df["Weight (g)"].var(ddof=1)
Capomulin_weight_variance

7.466033795329428

In [196]:
# Sample variance of tumor volume (mm3); ddof=1 is the pandas default, written out explicitly.
Capomulin_Tumor_Volume_variance = Capomulin_df["Tumor Volume (mm3)"].var(ddof=1)
Capomulin_Tumor_Volume_variance


24.947764120254856

In [223]:
# Sample variance of metastatic-site counts; ddof=1 is the pandas default, written out explicitly.
Capomulin_Metastatic_sites_variance = Capomulin_df["Metastatic Sites"].var(ddof=1)
Capomulin_Metastatic_sites_variance

0.7207898234288951

In [224]:
# Sample standard deviation of age (months); ddof=1 is the pandas default, written out explicitly.
Capomulin_Age_months_STD_D = Capomulin_df["Age_months"].std(ddof=1)
Capomulin_Age_months_STD_D

7.7214229341712945

In [226]:
# Sample standard deviation of weight (g); ddof=1 is the pandas default, written out explicitly.
Capomulin_weight_STD_D = Capomulin_df["Weight (g)"].std(ddof=1)
Capomulin_weight_STD_D

2.7324043982048902

In [213]:
# Sample standard deviation of tumor volume (mm3); ddof=1 is the pandas default, written out explicitly.
Capomulin_Tumor_Volume_STD_D = Capomulin_df["Tumor Volume (mm3)"].std(ddof=1)
Capomulin_Tumor_Volume_STD_D

4.9947736805840215

In [228]:
# Sample standard deviation of metastatic-site counts; ddof=1 is the pandas default, written out explicitly.
Capomulin_Metastatic_sites_STD_D = Capomulin_df["Metastatic Sites"].std(ddof=1)
Capomulin_Metastatic_sites_STD_D

0.8489934177771316

In [232]:
# Standard error of the mean for age (months); ddof=1 is the pandas default, written out explicitly.
Capomulin_Age_months_SEM = Capomulin_df["Age_months"].sem(ddof=1)
Capomulin_Age_months_SEM

0.5091355509622955

In [231]:
# Standard error of the mean for weight (g); ddof=1 is the pandas default, written out explicitly.
Capomulin_weight_SEM = Capomulin_df["Weight (g)"].sem(ddof=1)
Capomulin_weight_SEM

0.1801694105597071

In [230]:
# Standard error of the mean for tumor volume (mm3); ddof=1 is the pandas default, written out explicitly.
Capomulin_Tumor_Volume_SEM = Capomulin_df["Tumor Volume (mm3)"].sem(ddof=1)
Capomulin_Tumor_Volume_SEM

0.32934562340083096

In [229]:
# Standard error of the mean for metastatic-site counts; ddof=1 is the pandas default, written out explicitly.
Capomulin_Metastatic_sites_SEM = Capomulin_df["Metastatic Sites"].sem(ddof=1)
Capomulin_Metastatic_sites_SEM

0.055980968172379214

In [235]:
# Capomulin summary table: one row per numeric measurement, one column per statistic.
# It is computed directly from Capomulin_df in a single aggregation, so this cell
# no longer relies on twenty separately computed scalars having been run beforehand
# (their execution counts are out of order, which is fragile under Restart & Run All).

# Source column -> label shown in the "Statistic Type" column.
capomulin_measurement_labels = {
    "Age_months": "Age_Month",
    "Weight (g)": "Weight",
    "Tumor Volume (mm3)": "Tumor Volume",
    "Metastatic Sites": "Metastatic Sites",
}
# pandas aggregation name -> output column name.
# var/std/sem use the pandas default ddof=1 (sample statistics).
capomulin_statistic_labels = {
    "mean": "Capomulin_Mean",
    "median": "Capomulin_Median",
    "var": "Capomulin_Variance",
    "std": "Capomulin_STD_D",
    "sem": "Capomulin_SEM",
}

Capomulin_Summary_Statistic_df = (
    Capomulin_df[list(capomulin_measurement_labels)]
    .agg(list(capomulin_statistic_labels))  # rows: statistics, columns: measurements
    .T                                      # flip to rows: measurements, columns: statistics
    .rename(index=capomulin_measurement_labels, columns=capomulin_statistic_labels)
    .rename_axis("Statistic Type")
    .reset_index()                          # "Statistic Type" becomes the first column, index 0..3
)
Capomulin_Summary_Statistic_df





Unnamed: 0,Statistic Type,Capomulin_Mean,Capomulin_Median,Capomulin_Variance,Capomulin_STD_D,Capomulin_SEM
0,Age_Month,13.456522,16.5,59.620372,7.721423,0.509136
1,Weight,19.965217,20.5,7.466034,2.732404,0.180169
2,Tumor Volume,40.675741,41.557809,24.947764,4.994774,0.329346
3,Metastatic Sites,0.713043,0.0,0.72079,0.848993,0.055981


In [198]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function


## Bar and Pie Charts

In [199]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pandas. 



In [200]:
# Generate a bar plot showing the total number of mice for each treatment throughout the course of the study using pyplot.



In [201]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [202]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [203]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [204]:
# Put treatments into a list for the for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [205]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [206]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [207]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen


## Correlation and Regression

In [208]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
