In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from sklearn import datasets
import numpy as np
from scipy.stats import linregress

# Relative locations of the two study CSV files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load the per-mouse metadata and the per-measurement study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Inner-join the two tables on their shared "Mouse ID" column
combined_data_df = pd.merge(mouse_metadata, study_results, how="inner", on="Mouse ID")

# Preview the merged table
combined_data_df

In [None]:
# Number of distinct mice in the merged data (missing IDs are not counted)
mouse_ids = combined_data_df.loc[:, "Mouse ID"]
mouse_ids.nunique(dropna=True)

In [None]:
# Rows whose (Mouse ID, Timepoint) pair repeats an earlier row.
# duplicated() flags only the second and later occurrences of each pair.
duplicates_df = combined_data_df[combined_data_df.duplicated(subset=["Mouse ID", "Timepoint"])]

# Show which mouse ID(s) have repeated timepoints
duplicates_df["Mouse ID"].unique()

In [None]:
# Optional: Get all the data for the duplicate mouse ID.
# duplicates_df only holds the repeated rows, so look the IDs up in the full
# table to see every record belonging to the affected mouse/mice.
duplicate_mouse_ids = duplicates_df["Mouse ID"].unique()
combined_data_df[combined_data_df["Mouse ID"].isin(duplicate_mouse_ids)]

In [None]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
# A mouse is "duplicate" when one of its (Mouse ID, Timepoint) pairs appears
# more than once; every row for such a mouse is removed.
# drop_duplicates(["Mouse ID"]) must NOT be used here: it keeps only the first
# row of *every* mouse and throws away all later timepoints.
duplicate_rows = combined_data_df.duplicated(subset=["Mouse ID", "Timepoint"])
duplicate_mouse_ids = combined_data_df.loc[duplicate_rows, "Mouse ID"].unique()
# NOTE: cleaned_df keeps one row per measurement, so row counts computed from it
# are measurement counts; use nunique() on "Mouse ID" to count mice.
cleaned_df = combined_data_df[~combined_data_df["Mouse ID"].isin(duplicate_mouse_ids)]

In [None]:
# Checking the number of mice in the clean DataFrame.
# nunique() returns the count of distinct IDs; displaying the column itself
# would only list the IDs.
cleaned_df["Mouse ID"].nunique()

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method is the most straightforward: create one Series per statistic and put them all together at the end.
tumor_volume_by_regimen = cleaned_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

mean_volume = tumor_volume_by_regimen.mean()
median_volume = tumor_volume_by_regimen.median()
variance_volume = tumor_volume_by_regimen.var()
std_volume = tumor_volume_by_regimen.std()
sem_volume = tumor_volume_by_regimen.sem()

# All five Series share the "Drug Regimen" index, so they align row by row
summary_stats_df = pd.DataFrame({
    "Mean": mean_volume,
    "Median": median_volume,
    "Variance": variance_volume,
    "Std Dev": std_volume,
    "SEM": sem_volume,
})
summary_stats_df

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function

# The groupby object is kept under this name because later cells reuse it
grouped_drugs_df = cleaned_df.groupby(["Drug Regimen"])

# One aggregation call yields all five statistics, one column per statistic
grouped_drugs_df["Tumor Volume (mm3)"].agg(["mean", "median", "var", "std", "sem"])

In [None]:
# Bar plot (pandas) of the per-regimen count taken from the "Mouse ID" column.
# count() tallies the non-null entries in each regimen group.
drug_mouse_count_df = grouped_drugs_df.count()["Mouse ID"]
drug_mouse_count_df.plot(kind="bar")

In [None]:
# Same bar chart drawn directly with pyplot: regimen names on the x axis,
# the per-regimen counts as bar heights.
x_axis = drug_mouse_count_df.index
plt.bar(x=x_axis, height=drug_mouse_count_df, align="center", color="b")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
grouped_sex_df = cleaned_df.groupby(["Sex"])

# Count each mouse once per sex. nunique() on "Mouse ID" stays correct whether
# the frame holds one row per mouse or one row per measurement; a plain row
# count would weight mice by how many rows they have.
grouped_sex_ct = grouped_sex_df["Mouse ID"].nunique()

# Same percentage labels and start angle as the pyplot version of this chart
grouped_sex_ct.plot.pie(autopct="%1.1f%%", startangle=90)

In [None]:
# Pie chart (pyplot) of the per-sex counts, with percentage labels on each wedge
labels = grouped_sex_ct.index
plt.pie(x=grouped_sex_ct, labels=labels, startangle=90, autopct="%1.1f%%")

In [None]:
# Final tumor volume per mouse for four regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
drugs = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# For each mouse keep the row with its greatest timepoint: sort by mouse and
# timepoint, then keep the last row seen for every Mouse ID.
last_timepoints = (
    cleaned_df[["Drug Regimen", "Mouse ID", "Timepoint"]]
    .sort_values(["Mouse ID", "Timepoint"])
    .drop_duplicates("Mouse ID", keep="last")
)

# Restrict to the regimens of interest
in_selected_regimens = last_timepoints["Drug Regimen"].isin(drugs)

# Merging back on the shared columns brings in the remaining columns
# (including the tumor volume) for exactly those rows
mouse_treatments_df = last_timepoints.loc[in_selected_regimens].merge(cleaned_df)

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
drug_regimens = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]
# One array of tumor volumes per regimen, in the same order as drug_regimens
tumor_volumes_by_regimen = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in drug_regimens:
    print(drug)
    # Tumor volumes recorded in mouse_treatments_df for this regimen
    volumes_for_drug_regimen = mouse_treatments_df.loc[
        mouse_treatments_df["Drug Regimen"] == drug, "Tumor Volume (mm3)"
    ]
    tumor_volumes_by_regimen.append(volumes_for_drug_regimen.values)

    quartiles = volumes_for_drug_regimen.quantile([.25, .5, .75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    print(f"The lower quartile of tumor volumes is: {lowerq}")
    print(f"The upper quartile of tumor volumes is: {upperq}")
    print(f"The interquartile range of tumor volumes is: {iqr}")
    print(f"The median of tumor volumes is: {quartiles[0.5]}")

    # Values more than 1.5 * IQR outside the quartiles are flagged as potential outliers
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

    # Determine outliers using upper and lower bounds
    is_outlier = (volumes_for_drug_regimen < lower_bound) | (volumes_for_drug_regimen > upper_bound)
    outliers = volumes_for_drug_regimen[is_outlier]
    print(f"Potential outliers: {outliers.tolist()}")

# Generate a box plot of the tumor volumes across the four regimens of interest:
# one figure, one box per regimen, so the regimens can be compared side by side
fig1, ax1 = plt.subplots()
ax1.set_title("Tumor Volume by Drug Regimen")
ax1.set_ylabel("Tumor Volume (mm3)")
ax1.boxplot(tumor_volumes_by_regimen)
# boxplot places the boxes at positions 1..N; label them with the regimen names
ax1.set_xticklabels(drug_regimens)
plt.show()

In [None]:
# Line plot of tumor volume against timepoint for a single mouse (ID s185)
is_s185 = combined_data_df["Mouse ID"] == "s185"
mouse_trial_1 = combined_data_df[is_s185]

x_axis = mouse_trial_1["Timepoint"]
volumes = mouse_trial_1["Tumor Volume (mm3)"]

plt.plot(x_axis, volumes)
plt.title("Capomulin, Mouse ID: s185 ")
plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume")
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

# Select rows and columns in one .loc call. The columns are given as a list:
# a set is not a valid column indexer in current pandas and has no defined order.
capomulin_data_df = combined_data_df.loc[
    combined_data_df["Drug Regimen"] == "Capomulin",
    ["Drug Regimen", "Mouse ID", "Weight (g)", "Tumor Volume (mm3)"],
]

# Average per mouse, so each point on the plot is one animal. Only the numeric
# columns are selected before mean(), because averaging string columns raises
# in current pandas.
capomulin_group_df = capomulin_data_df.groupby("Mouse ID")[["Weight (g)", "Tumor Volume (mm3)"]].mean()

# Plain NumPy arrays; the regression cell reuses x_axis and y_axis
x_axis = capomulin_group_df["Weight (g)"].values
y_axis = capomulin_group_df["Tumor Volume (mm3)"].values

plt.scatter(x_axis, y_axis, marker="o", facecolors="red", edgecolors="black")
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Avg Tumor Volume (mm3)")
plt.title("Capomulin Regimen")
plt.show()

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen.
# x_axis / y_axis are the weight and average-volume arrays built in the scatter-plot cell.

# Correlation Coefficient (pearsonr returns the coefficient first, then the p-value)
correlation = st.pearsonr(x_axis, y_axis)
print(f"The correlation between both factors is {round(correlation[0],2)}")

# Linear Regression Model
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_axis, y_axis)
regress_values = x_axis * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))

plt.scatter(x_axis, y_axis)
plt.plot(x_axis, regress_values, "r-")
# Anchor the equation in axes-fraction coordinates (upper-left corner) so it is
# always inside the plot; a fixed data-coordinate position can fall outside the
# plotted range and leave the label invisible.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.xlabel("Mouse Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")
plt.title("Capomulin Regimen")
plt.show()