In [1]:
%matplotlib notebook

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "../resources/Mouse_metadata.csv"
study_results_path = "../resources/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset

merged_df = pd.merge(mouse_metadata, study_results, on="Mouse ID", how="outer")

# Display the data table for preview
merged_df


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [3]:
# Checking the number of mice.
unique_mice = merged_df["Mouse ID"].unique()
unique_mice
number_unique_mice = len(unique_mice)
number_unique_mice

249

In [4]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
merged_df.drop_duplicates()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [5]:
# Optional: Get all the data for the duplicate mouse ID. 

In [6]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_merged_df = merged_df.drop_duplicates()

In [7]:
# Checking the number of mice in the clean DataFrame.
unique_mice = clean_merged_df["Mouse ID"].unique()
unique_mice
number_unique_mice = len(unique_mice)
number_unique_mice

249

In [8]:
# Checking the number of mice in the clean DataFrame.
num_mice_clean = len(clean_merged_df['Mouse ID'])
num_mice_clean

1892

In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 

# mean tumor volume
tumor_volume_regimen_mean = clean_merged_df["Tumor Volume (mm3)"].groupby(merged_df["Drug Regimen"])
mean_df = tumor_volume_regimen_mean.mean()
mean_df

# median tumor volume 
tumor_volume_regimen_median = clean_merged_df["Tumor Volume (mm3)"].groupby(merged_df["Drug Regimen"])
median_df = tumor_volume_regimen_median.median()
median_df

# vairance tumor volume
tumor_volume_regimen_variance = clean_merged_df["Tumor Volume (mm3)"].groupby(merged_df["Drug Regimen"])
variance_df = tumor_volume_regimen_variance.var()
variance_df

# std tumor volume
tumor_volume_regimen_std = clean_merged_df["Tumor Volume (mm3)"].groupby(merged_df["Drug Regimen"])
std_df = tumor_volume_regimen_std.std()
std_df

# sem tumor volume
tumor_volume_regimen_sem = clean_merged_df["Tumor Volume (mm3)"].groupby(merged_df["Drug Regimen"])
sem_df = tumor_volume_regimen_sem.sem()
sem_df

# Assemble the resulting series into a single summary dataframe.
tumor_volume_regimen = pd.DataFrame({#"Drug Regimen": drug_regimen,
                                    "Mean": mean_df,
                                    "Median": median_df,
                                    "Variance": variance_df,
                                    "Standard Deviation": std_df,
                                    "Standard Error of The Mean": sem_df}
)

tumor_volume_regimen

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,Standard Error of The Mean
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.368318,50.909965,42.27809,6.50216,0.514041
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [10]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Using the aggregation method, produce the same summary statistics in a single line



In [11]:
# BAR AND PIE CHARTS
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.

#drug_list = ['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       #'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']

mouse_drug_df = pd.DataFrame({"Drug Regimen": clean_merged_df["Drug Regimen"],
                             "Mouse ID": clean_merged_df["Mouse ID"]})
drop_mouse_df = mouse_drug_df.drop_duplicates()
group_drop_mouse_drug_df = drop_mouse_df.groupby('Drug Regimen')
group_drop_mouse_drug = group_drop_mouse_drug_df.count()  

drug_counts = [230,228,188,186,182,181,181,178,178,160]

x_axis = np.arange(len(drug_counts))

plot = group_drop_mouse_drug.plot.bar(x=x_axis, y= drug_counts, rot=0)

#tick_locations = x_axis
#plt.xticks(tick_locations, ["Capomulin", "Ramicane", 'Ketapril', 'Naftisol', 'Zoniferol', 'Placebo', 'Stelasyn', "Infubinol", 'Ceftamin', 'Propriva'])
#degrees = 65
#plt.xticks(rotation=degrees)

#plt.ylim(0, max(drug_counts) + 30)

#plt.title("Number of Mice/Drug Regimen")
#plt.xlabel("Drug Regimen")
#plt.ylabel("Number of Mice Tested")
#plt.show()


KeyError: "None of [Int64Index([0, 1, 2, 3, 4, 5, 6, 7, 8, 9], dtype='int64')] are in the [columns]"

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.

#list of unique mice
#unique_mice

mouse_drug_df = pd.DataFrame({"Drug Regimen": clean_merged_df["Drug Regimen"],
                             "Mouse ID": clean_merged_df["Mouse ID"]})
drop_mouse_df = mouse_drug_df.drop_duplicates()
group_drop_mouse_drug_df = drop_mouse_df.groupby('Drug Regimen')
group_drop_mouse_drug = group_drop_mouse_drug_df.count()

x_axis = np.arange(len(group_drop_mouse_drug))

plt.bar(x_axis, group_drop_mouse_drug['Mouse ID'], color='r', alpha=0.5, align="center")

tick_locations = x_axis
plt.xticks(tick_locations, group_drop_mouse_drug['Drug Regimen'])
degrees = 65
plt.xticks(rotation=degrees)

plt.title("Number of Unique Mice/Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Unique Mice Tested")
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

# What does it mean by using pandas?

clean_merged_df['Sex'].value_counts()

labels = ["Male","Female"]
sizes =  [958, 934]
colors = ["red","lightskyblue"]

plt.pie(sizes, labels=labels, colors=colors, autopct="%1.1f%%", startangle=140)
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

clean_merged_df['Sex'].value_counts()

labels = ["Male","Female"]
sizes =  [958, 934]
colors = ["red","lightskyblue"]

plt.pie(sizes, labels=labels, colors=colors, autopct="%1.1f%%", startangle=140)
plt.show()

In [None]:
# QUARTILES, OUTLIERS AND BOXPLOTS
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint

In [None]:
# TESTER
mouse_drug_df = pd.DataFrame({"Drug Regimen": clean_merged_df["Drug Regimen"],
                             "Mouse ID": clean_merged_df["Mouse ID"]})
drop_mouse_df = mouse_drug_df.drop_duplicates()
group_drop_mouse_drug_df = drop_mouse_df.groupby('Drug Regimen')
group_drop_mouse_drug = group_drop_mouse_drug_df.count()


In [None]:
# Put treatments into a list for for loop (and later for plot labels)

for treatment in treatments
    
    

# Create empty list to fill with tumor vol data (for plotting)

tumor_volume_data = []


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

In [None]:
#LINE AND SCATTER PLOTS
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

In [None]:
#CORRELATION AND REGRESSION
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen