## Observations and Insights 

In [3]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [63]:
# Combine the data into a single dataset
data_df=pd.merge(mouse_metadata,study_results, on="Mouse ID")

# Display the data table for preview
data_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Checking the number of mice.
data_df.nunique()

Mouse ID               249
Drug Regimen            10
Sex                      2
Age_months              24
Weight (g)              16
Timepoint               10
Tumor Volume (mm3)    1644
Metastatic Sites         5
dtype: int64

In [6]:
timepoints = list(data_df['Timepoint'].unique())
timepoints

[0, 5, 10, 15, 20, 25, 30, 35, 40, 45]

In [7]:
mice = list(data_df['Mouse ID'].unique())
mice[0]

'k403'

In [8]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

dup_find = data_df[['Mouse ID','Timepoint']].value_counts()>1

dup_m = list()
dup_t = list()

for mouse in mice:
    for time in timepoints:
        try:
            if dup_find[mouse,time]==True:
                dup_m.append(mouse)
                dup_t.append(time)
        except:
            pass
        
print(dup_m)
print(dup_t)

['g989', 'g989', 'g989', 'g989', 'g989']
[0, 5, 10, 15, 20]


In [9]:
# Optional: Get all the data for the duplicate mouse ID. 
dup_mouse = data_df.loc[data_df['Mouse ID']==dup_m[0],:].sort_values('Timepoint')
dup_mouse

In [41]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_df = data_df.loc[data_df['Mouse ID']!=dup_m[0],:]


In [42]:
# Checking the number of mice in the clean DataFrame.
clean_df.nunique()

Mouse ID               248
Drug Regimen            10
Sex                      2
Age_months              24
Weight (g)              16
Timepoint               10
Tumor Volume (mm3)    1633
Metastatic Sites         5
dtype: int64

## Summary Statistics

In [103]:
# Generate a summary statistics table of 
# mean, median, variance, standard deviation and SEM of the tumor volume for each regimen

regimen = list(clean_df['Drug Regimen'].unique())
tumor_vols = dict()
tumor_stats = dict()
stat = ['mean', 'median', 'var', 'std', 'sem']

for drug in regimen:
    
    # Build a dictionary of tumor volumes for each drug
    tumor_vols[drug] = clean_df[['Tumor Volume (mm3)']].loc[clean_df['Drug Regimen'] == drug, :]
    
    # Determine summary statistics of tumors for drug
    mean = tumor_vols[drug]['Tumor Volume (mm3)'].mean()
    med = tumor_vols[drug]['Tumor Volume (mm3)'].median()
    var = tumor_vols[drug]['Tumor Volume (mm3)'].var()
    std = tumor_vols[drug]['Tumor Volume (mm3)'].std()
    sem = tumor_vols[drug]['Tumor Volume (mm3)'].sem()
    
    # Build a drug-key dictionary of summary statistic tumor volume
    # tumor_stats.update({drug: {mean,med,var,std,sem}})
    tumor_stats[drug] = {mean}
    
    tumor_stats[drug].append(med)
    
    # ? append multiple dictionaries to each drug?
    mean_d = {'mean':mean}
    med_d = {'med':med}
    v_d = {'var':var}
    
tumor_stats[regimen[0]]

# reg1 = pd.DataFrame(tumor_stats[regimen[0]])
# reg1.index = stat
# reg1
    

#### THIS WAS WORKING partly... the stats were out of order in the 
## list somehow, and BUT I also WANTED TO SET THE INDEX TO THE 
# STAT NAMES, AND MAYBE GOT SIDETRACKED WORMHOLED

AttributeError: 'set' object has no attribute 'append'

In [86]:
tumor_vols[regimen[2]]['Tumor Volume (mm3)'].mean()

#only_billys =      df.loc[      df["fir  st_name"] == "Billy"   , :]                    
# tumor_vols['Ramicane'] = tumor_vols.pop('Tumor Volume (mm3)')
# tumor_vols.rename(columns={'Tumor Volume (mm3)': regimen[0]})
# dictionary['ONE'] = dictionary.pop(1)



# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

52.88479510859551

In [73]:
# Generate a summary statistics table of 
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line

clean_group = clean_df.groupby('Drug Regimen')

stat = ['mean', 'median', 'var', 'std', 'sem']

a = clean_group['Tumor Volume (mm3)'].agg([stat[0]])

b = clean_group['Tumor Volume (mm3)'].agg([stat[1]])

c = clean_group['Tumor Volume (mm3)'].agg([stat[2]])

d = clean_group['Tumor Volume (mm3)'].agg([stat[3]])

e = clean_group['Tumor Volume (mm3)'].agg([stat[4]])

a[stat[0]] = a[stat[0]].map("{:.2f}".format)
b[stat[1]] = b[stat[1]].map("{:.2f}".format)
c[stat[2]] = c[stat[2]].map("{:.2f}".format)
d[stat[3]] = d[stat[3]].map("{:.2f}".format)
e[stat[4]] = e[stat[4]].map("{:.2f}".format)

f = a.join(b)
f = f.join(c)
f = f.join(d)
f = f.join(e)

summary_stat = f.copy()

summary_stat

Unnamed: 0_level_0,mean,median,var,std,sem
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.33
Ceftamin,52.59,51.78,39.29,6.27,0.47
Infubinol,52.88,51.82,43.13,6.57,0.49
Ketapril,55.24,53.7,68.55,8.28,0.6
Naftisol,54.33,52.51,66.17,8.13,0.6
Placebo,54.03,52.29,61.17,7.82,0.58
Propriva,52.32,50.45,43.85,6.62,0.54
Ramicane,40.22,40.67,23.49,4.85,0.32
Stelasyn,54.23,52.43,59.45,7.71,0.57
Zoniferol,53.24,51.82,48.53,6.97,0.52


## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.



In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
