In [1]:
#For running matplotlib on Jupyter Notebook:
#%matplotlib notebook

#For running matplotlib on VS Code
%matplotlib ipympl  
#or use the code %matplotlib inline

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import sem
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [3]:
# Combine the data into a single dataset
mouse_study = pd.merge (mouse_metadata,study_results, on = "Mouse ID")

# Display the data table for preview
mouse_study.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumour Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [4]:
# Checking the number of mice.
mouse_count = mouse_study ['Mouse ID'].nunique()
mouse_count

249

In [5]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 

mouse_study_duplicated = mouse_study[['Mouse ID', 'Timepoint']].duplicated()
# get mouse IDs where duplication detected
mouse_id_duplicated = list(mouse_study.loc[mouse_study_duplicated]['Mouse ID'].unique()) 
mouse_id_duplicated

['g989']

In [6]:
#Calculating the total number of duplicated Mouse ID
mouse_duplicated_count = mouse_id_duplicated. count()
mouse_duplicated_count

TypeError: count() takes exactly one argument (0 given)

In [None]:
# Optional: Get all the data for the duplicate mouse ID. 
display_duplicate = mouse_study_duplicated.duplicated(subset=['Mouse ID'],keep=False)
display_duplicate1 = pd.DataFrame(display_duplicate)
display_duplicate1

TypeError: duplicated() got an unexpected keyword argument 'subset'

In [None]:
#Display the number of unique mice IDs in the data
unique_mouseid = mouse_study ["Mouse ID"].unique()
unique_mouseid

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#clean_data = mouse_study.drop_duplicates(subset=['Mouse ID'], keep = 'last')
#clean_data = mouse_study.loc[mouse_study["Mouse ID"]!= ("".join((mouse_duplicated))),:]
clean_data= mouse_study[mouse_study["Mouse ID"].isin(display_duplicate1)==False]
clean_data

In [None]:
# Checking the number of mice in the clean DataFrame.
clean_data["Mouse ID"].count()

Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumour volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# Assemble the resulting series into a single summary dataframe.
# mean of the tumour volume. 
mean_clean_data = pd.DataFrame(clean_data.groupby(["Drug Regimen"]).mean()["Tumour Volume (mm3)"])
mean_clean_data = mean_clean_data.rename(columns={"Tumour Volume (mm3)":"Mean"})


In [None]:
# median of the tumour volume.
median_clean_data = pd.DataFrame(clean_data.groupby(["Drug Regimen"]).median()["Tumour Volume (mm3)"])
median_clean_data = median_clean_data.rename(columns={"Tumour Volume (mm3)":"Median"})


In [None]:
#variance of the tumour volume.
variance_clean_data = pd.DataFrame(clean_data.groupby(["Drug Regimen"]).var()["Tumour Volume (mm3)"])
variance_clean_data = variance_clean_data.rename(columns={"Tumour Volume (mm3)": "Variance"})


In [None]:
#standard deviation of the tumour volume.
std_clean_data = pd.DataFrame(clean_data.groupby(["Drug Regimen"]).std()["Tumour Volume (mm3)"])
std_clean_data = std_clean_data.rename(columns={"Tumour Volume (mm3)": "Standard Deviation"})


In [None]:
# SEM of the tumour volume.
sem_clean_data = pd.DataFrame(clean_data.groupby(["Drug Regimen"]).sem()["Tumour Volume (mm3)"])
sem_clean_data = sem_clean_data.rename(columns={"Tumour Volume (mm3)": "SEM"})

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumour volume for each regimen
statistics_clean_data = mean_clean_data.join(median_clean_data).join(variance_clean_data).join(std_clean_data).join(sem_clean_data)
statistics_clean_data

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line
agg_summary_table = clean_data.groupby(["Drug Regimen"]).agg({"Tumour Volume (mm3)": ['mean','median','var','std','sem']})
agg_summary_table = agg_summary_table.rename(columns={'mean': "Mean", "var":"Variance", "std": "Standard Deviation", "median": "Median", "sem": "SEM"})
agg_summary_table

Bar and Pie Charts

In [None]:
#Set DataFrame for each drug regiment by using groupby
drug_regiment = pd.DataFrame(clean_data.groupby(["Drug Regimen"]).sum()["Timepoint"])
drug_regiment =drug_regiment.reset_index()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
drug_regiment_bar =drug_regiment.plot(x = "Drug Regimen", y= "Timepoint", kind = "bar", figsize = (6,10))
drug_regiment_bar.set_xlabel ("Drug Regimen")
drug_regiment_bar.set_ylabel ("Time Point")
drug_regiment_bar.set_title ("Drug Regiment vs Time Point")
plt.show()
plt.tight_layout()

In [None]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.
x_axis = np.arange(len(drug_regiment))
tick_locations = [values for values in x_axis]
plt.bar(x_axis, drug_regiment["Timepoint"], color = 'b', align = "center") #, clean_data["Timepoint"], alpha = 0.5, align="center")
plt.xticks(tick_locations, drug_regiment["Drug Regimen"])
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
gender_mice = clean_data.groupby(clean_data["Sex"]).count().plot ( y="Mouse ID", kind = "pie", autopct="%1.1f%%")
gender_mice.set_title ("The Distribution of Female vs Male Mice")
gender_mice.set_ylabel("")
plt.tight_layout()
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
gender_mice_plt = clean_data.groupby(clean_data["Sex"]).count()
gender_mice_plt = pd.DataFrame(gender_mice_plt, columns = ["Mouse ID"])
gender_mice_plt = gender_mice_plt.reset_index()
my_label = gender_mice_plt["Sex"]
my_value = gender_mice_plt["Mouse ID"]
plt.ylabel("")
plt.pie(my_value, labels= my_label, autopct='%1.1f%%')
plt.show()


#plt.legend(pie[0],labels, bbox_to_anchor=(1,0.5), loc="center right", fontsize=10, bbox_transform=plt.gcf().transFigure)

Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin
final_tumour_volume = pd.DataFrame(clean_data.drop_duplicates(subset=["Drug Regimen"], keep='last')) 
final_tumour_volume = final_tumour_volume [["Mouse ID", "Drug Regimen", "Tumour Volume (mm3)", "Timepoint"]]

# Start by getting the last (greatest) timepoint for each mouse

# Merge this group df with the original dataframe to get the tumour volume at the last timepoint
merge_timepoint_tumour = final_tumour_volume.merge (clean_data, on = ["Mouse ID", "Timepoint"], how ="left")
merge_timepoint_tumour

In [None]:
# Put treatments into a list for for loop (and later for plot labels)
treatment_drug = list(merge_timepoint_tumour["Drug Regimen"])
treatment_drug_list = [treatment for treatment in treatment_drug]

# Create empty list to fill with tumour vol data (for plotting)
tumour_list = []

# Calculate the IQR and quantitatively determine if there are any potential outliers. 

for tumour in tumour_list:

#quartiles 
# Locate the rows which contain mice on each drug and get the tumour volumes
    tumour_volume = merge_timepoint_tumor.loc[merge_timepoint_tumor["Drug Regimen"] == tumour, "Tumour Volume (mm3)"]
    
# add subset 
    tumour_list.append(tumour_volume)
    
# Determine outliers using upper and lower bounds
    quartiles = tumour_volume.quantile ([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq -  lowerq
    
    lower_bound = lowerq- (1.5*iqr)
    upper_bound = upperq +(1.5*iqr)    
    outliers_tumour = merge_timepoint_tumor.loc[(merge_timepoint_tumor["Tumour Volume (mm3)"] < lower_bound) | (merge_timepoint_tumor["Tumour Volume (mm3)"] > upper_bound)]
    outliers_tumour