## Observations and Insights 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (relative paths, resolved against the current working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results into DataFrames
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Preview the first rows of the mouse metadata
mouse_metadata.head()

In [None]:
# Merge the mouse metadata with the study results on the shared "Mouse ID"
# column (the default join type is inner).
data_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the merged table
data_df.head()

In [None]:
# Checking the number of mice: count of distinct "Mouse ID" values in the merged data.
data_df['Mouse ID'].nunique()

In [None]:
# Distinct timepoints, in order of first appearance in the merged data
timepoints = list(pd.unique(data_df['Timepoint']))
timepoints

In [None]:
# Distinct mouse IDs, in order of first appearance in the merged data
mice = list(pd.unique(data_df['Mouse ID']))

# Show the first ID as a quick sanity check
mice[0]

In [None]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint.

key_cols = ['Mouse ID', 'Timepoint']

# Boolean Series indexed by (Mouse ID, Timepoint): True where the pair occurs
# more than once. Retained so the flags can still be inspected by name.
dup_find = data_df[key_cols].value_counts() > 1

# Find the repeated (Mouse ID, Timepoint) pairs directly instead of probing
# every mouse/timepoint combination inside a bare `except: pass`, which hid
# any error, not just the expected missing-key lookups.
# duplicated() marks the second and later occurrences; drop_duplicates()
# then leaves one row per repeated pair, in data order.
dup_pairs = data_df.loc[data_df.duplicated(subset=key_cols), key_cols].drop_duplicates()

# Parallel lists: dup_m[k] is the mouse and dup_t[k] the timepoint of the k-th
# repeated pair. A mouse appears once per repeated timepoint.
dup_m = dup_pairs['Mouse ID'].tolist()
dup_t = dup_pairs['Timepoint'].tolist()

print(dup_m)
print(dup_t)

In [None]:
# Optional: Get all the data for the duplicate mouse ID.

# Select by membership in dup_m rather than dup_m[0]: every flagged mouse is
# shown, and an empty dup_m gives an empty table instead of an IndexError.
dup_mouse = data_df.loc[data_df['Mouse ID'].isin(dup_m), :].sort_values(['Mouse ID', 'Timepoint'])
dup_mouse

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.

# Drop every mouse listed in dup_m (not only dup_m[0]). If no duplicates were
# found, isin([]) is all False and the full data set is kept instead of
# raising an IndexError.
clean_df = data_df.loc[~data_df['Mouse ID'].isin(dup_m), :]

In [None]:
# Checking the number of mice in the clean DataFrame.

# Report the count of distinct mouse IDs (the same check applied to the raw
# merged data) rather than displaying the whole frame.
clean_df['Mouse ID'].nunique()

## Summary Statistics

In [None]:
# Regimens in order of first appearance, and the statistics computed for each
regimen = list(clean_df['Drug Regimen'].unique())
stat_list = ['mean', 'median', 'var', 'std', 'sem']
tumor_vols = {}
drug_stats = {}

for drug in regimen:

    # One-column frame holding the tumor volumes recorded under this regimen
    on_drug = clean_df['Drug Regimen'] == drug
    tumor_vols[drug] = clean_df.loc[on_drug, ['Tumor Volume (mm3)']]

    # Call each statistic named in stat_list on the volume column, in order
    volumes = tumor_vols[drug]['Tumor Volume (mm3)']
    stats = [getattr(volumes, stat_name)() for stat_name in stat_list]

    # Single-column frame of the five statistics; index=[stat_list] (the list
    # wrapped in another list) is passed exactly as before.
    drug_stats[drug] = pd.DataFrame(stats, index=[stat_list])


In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation and SEM of the tumor volume for each regimen

# One name per regimen; this unpacking requires exactly ten regimens.
a,b,c,d,e,f,g,h,i,j = regimen

x = a

# Statistics frame for the first regimen, with its single column labelled by
# the regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Start the growing summary frame that later regimens are merged onto
x_df = a_df
a_df


In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = b

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = c

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = d

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = e

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = f

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = g

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = h

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = i

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

x = j

# Statistics frame for this regimen, with its single column labelled by the
# regimen name. set_axis already returns a new frame, so `inplace` is not
# passed: the keyword was removed in pandas 2.0 and raises TypeError there.
a_df = pd.DataFrame(drug_stats[x])
a_df = a_df.set_axis([x], axis=1)

# Outer-merge on the index onto the growing summary frame
x_df = x_df.merge(a_df,how='outer',left_index=True,right_index=True)
x_df

In [None]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
# mean, median, variance, standard deviation, and SEM of the tumor volume.
# Assemble the resulting series into a single summary dataframe.

clean_group = clean_df.groupby('Drug Regimen')
tumor_by_regimen = clean_group['Tumor Volume (mm3)']

# One Series per statistic, each indexed by drug regimen
s1, s2, s3, s4, s5 = (
    tumor_by_regimen.mean(),
    tumor_by_regimen.median(),
    tumor_by_regimen.var(),
    tumor_by_regimen.std(),
    tumor_by_regimen.sem(),
)

# Column headings, in the same order as the Series above
summary_labels = [
    'Tumor Vol (mm3) - Mean',
    'Tumor Vol (mm3) - Median',
    'Tumor Volume (mm3) - Variance',
    'Tumor Volume (mm3) - STD',
    'Tumor Volume (mm3) - SEM',
]

# Place the Series side by side; `keys` supplies the column names
summary_df = pd.concat([s1, s2, s3, s4, s5], axis=1, keys=summary_labels)
summary_df

In [None]:
# Generate a summary statistics table of
# mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line

clean_group = clean_df.groupby('Drug Regimen')

stat = ['mean', 'median', 'var', 'std', 'sem']

# Display heading for each aggregate, in the same order as `stat`
stat_headings = [
    'Tumor Vol (mm3) - Mean',
    'Tumor Vol (mm3) - Median',
    'Tumor Volume (mm3) - Variance',
    'Tumor Volume (mm3) - STD',
    'Tumor Volume (mm3) - SEM',
]

# agg() names its output columns after the entries of `stat`; map each of
# those names to its display heading.
summary_cleantable = (
    clean_group['Tumor Volume (mm3)']
    .agg(stat)
    .rename(columns=dict(zip(stat, stat_headings)))
)
summary_cleantable

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.

# Column labels of the cleaned DataFrame
cols = clean_df.columns

In [None]:
# Group the cleaned rows by drug regimen
mouse_g = clean_df.groupby('Drug Regimen')

# Count timepoints per drug regimen
count = mouse_g['Timepoint'].count()

# Frame the counts and order them from largest to smallest
count_df = count.to_frame().sort_values(by='Timepoint', ascending=False)

# Move the regimen out of the index into an ordinary column
count_df2 = count_df.reset_index()
count_df2

In [None]:
# Bar chart of the per-regimen timepoint counts, drawn through the pandas plotting API
count_df2.plot(
    kind='bar',
    x='Drug Regimen',
    y='Timepoint',
    rot=0,
    width=.8,
    color='g',
    figsize=(10, 3),
    title="Number of Timepoints Tested for Each Drug Regimen",
    legend=False,
)

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
import numpy as np

# One bar position per row of the counts table; ticks sit on the same positions
x_axis = np.arange(len(count_df2))
tick_locations = list(x_axis)

plt.figure(figsize=(10, 3))
plt.title("Number of Timepoints Tested for Each Drug Regimen")
plt.bar(x_axis, count_df2['Timepoint'], color='g', alpha=0.5, align="center")

# Label each bar with its regimen and leave half a bar of margin on both sides
plt.xticks(tick_locations, count_df2['Drug Regimen'])
plt.xlim(-.5, len(x_axis) - .5)
plt.xlabel("Drug Regimen")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas

# Non-null value counts of every column, grouped by sex
sex_g = clean_df.groupby('Sex').count()

# Expose the Timepoint counts under the column name 'Sex' so the wedge sizes
# can be selected by that name.
sex_df = sex_g.rename(columns={'Timepoint': 'Sex'})

# NOTE(review): these counts are taken over all rows of clean_df, which
# presumably holds several rows per mouse, so the wedges reflect rows rather
# than distinct mice -- confirm this is intended.
sex_df.plot.pie(
    y='Sex',
    legend=False,
    title="Female vs. Male Representation",
    ylabel=' ',
    autopct='%1.1f%%',
)

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Wedge labels; they are paired positionally with the first two rows of sex_df
labels = ['Females','Males']

# Wedge sizes: the first and second entries of the 'Sex' count column
sizes = [sex_df['Sex'].iloc[position] for position in (0, 1)]

# Wedge colors, in the same order as the labels
colors = ["blue","orange"]

plt.figure()
plt.pie(sizes, labels=labels, colors=colors, autopct='%1.1f%%')
plt.title("Female vs. Male Representation")

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin

# First step: find the last (greatest) timepoint recorded for each mouse.
timepoint_group = clean_df.groupby('Mouse ID')

# Greatest timepoint per mouse, indexed by Mouse ID
timepoint_max = timepoint_group['Timepoint'].max()

# Two-column frame (Mouse ID, Timepoint) with a fresh integer index
timepoint_df = timepoint_max.to_frame().reset_index()


In [None]:
# Left-join the cleaned data onto the (Mouse ID, last Timepoint) pairs so each
# mouse keeps only the row recorded at its final timepoint.
max_df = pd.merge(timepoint_df, clean_df, how='left', on=['Mouse ID', 'Timepoint'])
max_df

In [None]:
# Put treatments into a list for the loop below
# (and later for plot labels)
core = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes per regimen (for plotting)
vol_dict = dict()
vol_list = list()

for drug in core:
    # Tumor volumes of the rows in max_df that belong to this regimen
    vol_list = max_df.loc[max_df['Drug Regimen'] == drug, 'Tumor Volume (mm3)'].tolist()
    vol_dict[drug] = vol_list

# Wrap each list in a Series before building the frame. A dict of plain lists
# makes the DataFrame constructor raise ValueError as soon as two regimens
# have different numbers of mice; Series are aligned on their index and the
# shorter columns are padded with NaN. With equal-length lists the result is
# the same table as before.
# assumes tumor volumes are floating-point values -- TODO confirm
vol_df = pd.DataFrame(
    {drug: pd.Series(volumes, dtype=float) for drug, volumes in vol_dict.items()}
)

In [None]:
# Calculate the IQR and quantitatively determine if there are any potential outliers.
# Bounds are collected per regimen so later cells can look them up by drug name.
lower_bound_dict = {}
upper_bound_dict = {}

for drug in core:
    final_vols = vol_df[drug]

    # Quartiles and interquartile range of this regimen's volumes
    quartiles = final_vols.quantile([0.25, 0.50, 0.75])
    lowerq, upperq = quartiles[0.25], quartiles[0.75]
    iqr = upperq - lowerq
    print('*******************************************')
    print(f'The IQR Value for {drug} is {iqr:.2f}.')

    # Fences sit 1.5 * IQR outside the quartiles
    fence = 1.5 * iqr
    lower_bound = lowerq - fence
    upper_bound = upperq + fence

    # Record the fences under the regimen name
    lower_bound_dict[drug] = lower_bound
    upper_bound_dict[drug] = upper_bound

    # Rows whose volume for this regimen falls outside the fences
    outliers_lo = vol_df.loc[final_vols < lower_bound, :]
    outliers_hi = vol_df.loc[final_vols > upper_bound, :]
    outliers = pd.concat([outliers_lo[drug], outliers_hi[drug]])
    print(f'There are {len(outliers)} outliers in {drug} dataset.')
    print('*******************************************')

In [None]:
# Build dataframe for potential outliers
# PART 1: first regimen in `core`

target = core[0]
filter_df = max_df[max_df['Drug Regimen'] == target]
final_vol = filter_df['Tumor Volume (mm3)']

# Rows above the upper bound / below the lower bound recorded for this regimen
outliers_hi0 = filter_df.loc[final_vol > upper_bound_dict[target]]
outliers_lo0 = filter_df.loc[final_vol < lower_bound_dict[target]]

In [None]:
# Build dataframe for potential outliers
# PART 2: second regimen in `core`

target = core[1]
filter_df = max_df[max_df['Drug Regimen'] == target]
final_vol = filter_df['Tumor Volume (mm3)']

# Rows above the upper bound / below the lower bound recorded for this regimen
outliers_hi1 = filter_df.loc[final_vol > upper_bound_dict[target]]
outliers_lo1 = filter_df.loc[final_vol < lower_bound_dict[target]]

In [None]:
# Build dataframe for potential outliers
# PART 3: third regimen in `core`

target = core[2]
filter_df = max_df[max_df['Drug Regimen'] == target]
final_vol = filter_df['Tumor Volume (mm3)']

# Rows above the upper bound / below the lower bound recorded for this regimen
outliers_hi2 = filter_df.loc[final_vol > upper_bound_dict[target]]
outliers_lo2 = filter_df.loc[final_vol < lower_bound_dict[target]]

In [None]:
# Build dataframe for potential outliers
# PART 4: fourth regimen in `core`

target = core[3]
filter_df = max_df[max_df['Drug Regimen'] == target]
final_vol = filter_df['Tumor Volume (mm3)']

# Rows above the upper bound / below the lower bound recorded for this regimen
outliers_hi3 = filter_df.loc[final_vol > upper_bound_dict[target]]
outliers_lo3 = filter_df.loc[final_vol < lower_bound_dict[target]]

In [None]:
# Assemble the four per-regimen pieces into one high-side and one low-side table

hi_parts = [outliers_hi0, outliers_hi1, outliers_hi2, outliers_hi3]
lo_parts = [outliers_lo0, outliers_lo1, outliers_lo2, outliers_lo3]

outliers_hi = pd.concat(hi_parts)
outliers_lo = pd.concat(lo_parts)

In [None]:
# High end outliers: rows whose final tumor volume exceeds the upper bound of their regimen
outliers_hi

In [None]:
# Low end outliers: rows whose final tumor volume is below the lower bound of their regimen

outliers_lo

In [None]:
    # Locate the rows which contain mice on each drug and get the tumor volumes

# Columns kept for each regimen table, and the column the rows are filtered on
subset_cols = ['Drug Regimen', 'Tumor Volume (mm3)']
regimen_col = max_df['Drug Regimen']

# One table per entry of `core`, in list order
Capomulin_vol_df = max_df.loc[regimen_col == core[0], subset_cols]
Ramicane_vol_df = max_df.loc[regimen_col == core[1], subset_cols]
Infubinol_vol_df = max_df.loc[regimen_col == core[2], subset_cols]
Ceftamin_vol_df = max_df.loc[regimen_col == core[3], subset_cols]

# Stack the four tables into a single subset
vol_subset = pd.concat([Capomulin_vol_df, Ramicane_vol_df, Infubinol_vol_df, Ceftamin_vol_df])
vol_subset

In [None]:
    # Determine outliers using lower bounds

# Rows of each regimen table whose tumor volume is below that regimen's lower bound
vol_col = 'Tumor Volume (mm3)'

Capomulin_lb = Capomulin_vol_df[Capomulin_vol_df[vol_col] < lower_bound_dict[core[0]]]
Ramicane_lb = Ramicane_vol_df[Ramicane_vol_df[vol_col] < lower_bound_dict[core[1]]]
Infubinol_lb = Infubinol_vol_df[Infubinol_vol_df[vol_col] < lower_bound_dict[core[2]]]
Ceftamin_lb = Ceftamin_vol_df[Ceftamin_vol_df[vol_col] < lower_bound_dict[core[3]]]

# Stack the low-side rows of all four regimens
lower_bound_subset = pd.concat([Capomulin_lb, Ramicane_lb, Infubinol_lb, Ceftamin_lb])
lower_bound_subset

In [None]:
    # Determine outliers using upper bounds

# Rows of each regimen table whose tumor volume is above that regimen's upper bound
vol_col = 'Tumor Volume (mm3)'

Capomulin_ub = Capomulin_vol_df[Capomulin_vol_df[vol_col] > upper_bound_dict[core[0]]]
Ramicane_ub = Ramicane_vol_df[Ramicane_vol_df[vol_col] > upper_bound_dict[core[1]]]
Infubinol_ub = Infubinol_vol_df[Infubinol_vol_df[vol_col] > upper_bound_dict[core[2]]]
Ceftamin_ub = Ceftamin_vol_df[Ceftamin_vol_df[vol_col] > upper_bound_dict[core[3]]]

# Stack the high-side rows of all four regimens
upper_bound_subset = pd.concat([Capomulin_ub, Ramicane_ub, Infubinol_ub, Ceftamin_ub])
upper_bound_subset

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

fig, ax = plt.subplots()

# Title, fixed y-range and y-label applied in one call
ax.set(
    title='Tumor Vol of Core Regimen',
    ylim=[20, 75],
    ylabel='Tumor Volume (mm3)',
)

# Draw onto the axes created above (the current axes at this point)
vol_df.boxplot(ax=ax)

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

# Keep only the rows recorded under the Capomulin regimen
on_capomulin = clean_df['Drug Regimen'] == 'Capomulin'
Capomulin_df = clean_df.loc[on_capomulin]

# Value in the first row, first column
# (assumes the first column is 'Mouse ID' -- TODO confirm)
mouse = Capomulin_df.iat[0, 0]
mouse

In [None]:
# Rows of the Capomulin data whose Mouse ID equals the selected mouse
is_selected = Capomulin_df['Mouse ID'] == mouse
mouse_df = Capomulin_df.loc[is_selected]
mouse_df.head()

In [None]:
# Series plotted on each axis for the selected mouse
timepoint = mouse_df['Timepoint']
y_axis = mouse_df['Tumor Volume (mm3)']

# Label the axes and title the chart, then draw the line
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.title(f'Tumor Vol vs. Time for Mouse {mouse} Treated with Capomulin')
plt.plot(timepoint, y_axis)

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Gather mean data: per-mouse average of every numeric column.
# numeric_only=True is required on pandas >= 2.0, where mean() no longer
# drops non-numeric columns silently and raises TypeError on string columns
# such as 'Drug Regimen'.

Capomulin_mean = Capomulin_df.groupby('Mouse ID').mean(numeric_only=True)

Capomulin_mean.head()

In [None]:
# Per-mouse averages plotted against each other
weight = Capomulin_mean['Weight (g)']
avg_tumorvol = Capomulin_mean['Tumor Volume (mm3)']

# Title-cased heading, kept in a name so it can be reused by other plots
Title2 = 'average tumor volume vs. mouse weight for the Capomulin regimen'.title()

plt.scatter(weight, avg_tumorvol, marker="o", facecolors="red", edgecolors="black", alpha=0.75)
plt.title(Title2)
plt.xlabel("Weight of Mouse (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen

# pearsonr's first element is the correlation coefficient; report it rounded to two decimals
correlation = st.pearsonr(weight, avg_tumorvol)
r_rounded = round(correlation[0], 2)
print(f"The correlation between weight and tumor volume is {r_rounded}")

In [None]:
from scipy.stats import linregress

# Least-squares fit of average tumor volume against mouse weight
fit = linregress(weight, avg_tumorvol)
slope, intercept, rvalue, pvalue, stderr = fit

# Fitted value for every observed weight, plus the equation text for the annotation
regress_values = slope * weight + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

# Scatter of the observations with the fitted line drawn over it
plt.scatter(weight, avg_tumorvol, marker="o", facecolors="red", edgecolors="black", alpha=0.75)
plt.plot(weight, regress_values, 'r-')

plt.title(Title2)
plt.xlabel("Weight of Mouse (g)")
plt.ylabel("Tumor Volume (mm3)")

# Equation placed at fixed data coordinates (19, 36)
plt.annotate(line_eq, (19, 36), fontsize=15, color="red")