## Observations and Insights 

In [18]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [19]:
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [20]:
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [21]:
# Combine the data into a single dataset
# Display the data table for preview
mouse_data_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")
print(mouse_data_df)

     Mouse ID Drug Regimen   Sex  Age_months  Weight (g)  Timepoint  \
0        k403     Ramicane  Male          21          16          0   
1        k403     Ramicane  Male          21          16          5   
2        k403     Ramicane  Male          21          16         10   
3        k403     Ramicane  Male          21          16         15   
4        k403     Ramicane  Male          21          16         20   
...       ...          ...   ...         ...         ...        ...   
1888     z969     Naftisol  Male           9          30         25   
1889     z969     Naftisol  Male           9          30         30   
1890     z969     Naftisol  Male           9          30         35   
1891     z969     Naftisol  Male           9          30         40   
1892     z969     Naftisol  Male           9          30         45   

      Tumor Volume (mm3)  Metastatic Sites  
0              45.000000                 0  
1              38.825898                 0  
2           

In [22]:
# Checking the number of mice
count = mouse_data_df["Mouse ID"].value_counts()
count

g989    13
w422    10
x581    10
t718    10
y163    10
        ..
u153     1
l872     1
h428     1
x336     1
t573     1
Name: Mouse ID, Length: 249, dtype: int64

In [23]:
# Counting the different drug regimens
count = mouse_data_df["Drug Regimen"].value_counts()
count

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Placebo      181
Stelasyn     181
Ceftamin     178
Infubinol    178
Propriva     161
Name: Drug Regimen, dtype: int64

In [24]:
# Getting a list of all the columns
mouse_data_df.columns

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'],
      dtype='object')

In [49]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
mouse_data_df = mouse_data_df.loc[(mouse_data_df["Mouse ID"].duplicated{subset="Timepoint", keep='first', inplace=False})]

SyntaxError: invalid syntax (<ipython-input-49-40de24f8e5ee>, line 2)

In [26]:
# Dropping duplicate timepoints
# For True in "Time Point" delete row
mouse_data_df = mouse_data_df.drop_duplicates(subset="Timepoint"
                                              if value in"Timepoint"=True
                                                  then 

mouse_data_df = mouse_data_df.loc[(
    mouse_data_df["Timepoint"]=True)]
mouse_data_df.head()

In [42]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_mouse_df = mouse_data_df
clean_mouse_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [28]:
# Checking the number of mice in the clean DataFrame.
count = clean_mouse_df["Mouse ID"].value_counts()
count

k403    10
Name: Mouse ID, dtype: int64

## Summary Statistics

In [29]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
clean_mouse_df.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,10.0,10.0,10.0,10.0,10.0
mean,21.0,16.0,22.5,32.358859,0.8
std,0.0,0.0,15.138252,6.82568,0.421637
min,21.0,16.0,0.0,22.050126,0.0
25%,21.0,16.0,11.25,27.685119,1.0
50%,21.0,16.0,22.5,33.231153,1.0
75%,21.0,16.0,33.75,34.816702,1.0
max,21.0,16.0,45.0,45.0,1.0


In [None]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume.                        

In [30]:
#Capomulin
#mean
capomulin_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Capomulin is {round(capomulin_mean_numpy),2} mm3.")

#median
capomulin_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Capomulin is {round(capomulin_median_numpy),2}")

#variance
capomulin_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Capomulin using the NumPy module is {round(capomulin_var_numpy),2}")

#standard deviation
capomulin_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Capomulin using the NumPy module is {round(capomulin_sd_numpy),2}")

#SEM
sample = capomulin_tumor_volume.sample(30)
print(f"The SEM value for the Capomulin sample of tumor volume is {sem(sample.capomulin.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-30-dcb45fc1ea3a>, line 3)

In [31]:
#Ramicane
#mean
ramicane_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Ramicane is {round(ramicane_mean_numpy),2} mm3.")

#median
ramicane_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Ramicane is {round(ramicane_median_numpy),2}")

#variance
ramicane_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Ramicane using the NumPy module is {round(ramicane_var_numpy),2}")

#standard deviation
ramicane_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Ramicane using the NumPy module is {round(ramicane_sd_numpy),2}")

#SEM
sample = ramicane_tumor_volume.sample(30)
print(f"The SEM value for the Ramicane sample of tumor volume is {sem(sample.ramicane.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-31-8abcd6eac2a5>, line 3)

In [32]:
#Ketapril
#mean
ketapril_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Ketapril is {round(ketapril_mean_numpy),2} mm3.")

#median
ketapril_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Ketapril is {round(ketapril_median_numpy),2}")

#variance
ketapril_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Ketapril using the NumPy module is {round(ketapril_var_numpy),2}")

#standard deviation
ketapril_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Ketapril using the NumPy module is {round(ketapril_sd_numpy),2}")

#SEM
sample = ketapril_tumor_volume.sample(30)
print(f"The SEM value for the Ketapril sample of tumor volume is {sem(sample.ketapril.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-32-3aeb65c4c572>, line 3)

In [33]:
Naftisol
#mean
naftisol_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Naftisol is {round(naftisol_mean_numpy),2} mm3.")

#median
naftisol_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Naftisol is {round(naftisol_median_numpy),2}")

#variance
naftisol_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Naftisol using the NumPy module is {round(naftisol_var_numpy),2}")

#standard deviationNaftisol
naftisol_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Naftisol using the NumPy module is {round(naftisol_sd_numpy),2}")

#SEM
sample = naftisol_tumor_volume.sample(30)
print(f"The SEM value for the Naftisol sample of tumor volume is {sem(sample.naftisol.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-33-efc37d3a7529>, line 3)

In [34]:
Zoniferol
#mean
zoniferol_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Zoniferol is {round(zoniferol_zoniferolmean_numpy),2} mm3.")

#medianzoniferol
zoniferol_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Zoniferol is {round(zoniferol_median_numpy),2}")

#variance
zoniferol_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Zoniferol using the NumPy module is {round(zoniferol_var_numpy),2}")

#standard deviation
zoniferol_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Zoniferol using the NumPy module is {round(zoniferol_sd_numpy),2}")

#SEM
sample = zoniferol_tumor_volume.sample(30)
print(f"The SEM value for the Zoniferol sample of tumor volume is {sem(sample.zoniferol.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-34-27b5037ac785>, line 3)

In [35]:
Placebo
#mean
placebo_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Placebo is {round(placebo_mean_numpy),2} mm3.")

#median
placebo_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Placebo is {round(placebo_median_numpy),2}")

#variance
placebo_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Placebo using the NumPy module is {round(placebo_var_numpy),2}")

#standard deviation
placebo_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Placebo using the NumPy module is {round(placebo_sd_numpy),2}")

#SEM
sample = placebo_tumor_volume.sample(30)
print(f"The SEM value for the Placebo sample of tumor volume is {sem(sample.placebo.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-35-4765dec18231>, line 3)

In [36]:
Stelasyn
#mean
stelasyn_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Stelasyn is {round(stelasyn_mean_numpy),2} mm3.")

#median
stelasyn_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Stelasyn is {round(stelasyn_median_numpy),2}")

#variance
stelasyn_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Stelasyn using the NumPy module is {round(stelasyn_var_numpy),2}")

#standard deviation
stelasyn_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Stelasyn using the NumPy module is {round(stelasyn_sd_numpy),2}")

#SEM
sample = stelasyn_tumor_volume.sample(30)
print(f"The SEM value for the Stelasyn sample of tumor volume is {sem(sample.stelasyn.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-36-b035d99655c2>, line 3)

In [37]:
Ceftamin
#mean
ceftamin_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Ceftamin is {round(ceftamin_mean_numpy),2} mm3.")

#median
ceftamin_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Ceftamin is {round(ceftamin_median_numpy),2}")

#variance
ceftamin_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Ceftamin using the NumPy module is {round(ceftamin_var_numpy),2}")

#standard deviation
ceftamin_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Ceftamin using the NumPy module is {round(ceftamin_sd_numpy),2}")

#SEM
sample = ceftamin_tumor_volume.sample(30)
print(f"The SEM value for the Ceftamin sample of tumor volume is {sem(sample.ceftamin.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-37-37d07d76af03>, line 3)

In [38]:
Infubinol
#mean
infubinol_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Infubinol is {round(infubinol_mean_numpy),2} mm3.")

#median
infubinol_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Infubinol is {round(infubinol_median_numpy),2}")

#variance
infubinol_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Infubinol using the NumPy module is {round(infubinol_var_numpy),2}")

#standard deviation
infubinol_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Infubinol using the NumPy module is {round(infubinol_sd_numpy),2}")

#SEM
sample = infubinol_tumor_volume.sample(30)
print(f"The SEM value for the Infubinol sample of tumor volume is {sem(sample.infubinol.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-38-795e6bb3b3a9>, line 3)

In [39]:
Propriva
#mean
propiva_mean_numpy = np.mean(Tumor Volume (mm3))
print(f"The mean tumor volume for Propriva is {round(propiva_mean_numpy),2} mm3.")

#median
propiva_median_numpy = np.median(Tumor Volume (mm3))
print(f"The median tumor volume for Propriva is {round(propiva_median_numpy),2}")

#variance
propiva_var_numpy = np.var(Tumor Volume (mm3),ddof = 0)
print(f"The population variance for Propriva using the NumPy module is {round(propiva_var_numpy),2}")

#standard deviation
propiva_sd_numpy = np.std((Tumor Volume (mm3),ddof = 0)
print(f"The population standard deviation for Propriva using the NumPy module is {round(propiva_sd_numpy),2}")

#SEM
sample = propiva_tumor_volume.sample(30)
print(f"The SEM value for the Propriva sample of tumor volume is {sem(sample.propiva.Tumor Volume (mm3)}")     

SyntaxError: invalid syntax (<ipython-input-39-e55c67839aae>, line 3)

In [40]:
# Assemble the resulting series into a single summary dataframe.

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
plt.bar


In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
clean_mouse_df.loc("Timepoint")

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)
tumor volume = [ ]


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
