## Observations and Insights 

In [2]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress
from scipy import stats

# Relative locations of the two CSV inputs
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Load each CSV into its own DataFrame (mouse metadata first, then study results)
mouse_metadata = pd.read_csv(filepath_or_buffer=mouse_metadata_path)
study_results = pd.read_csv(filepath_or_buffer=study_results_path)

FileNotFoundError: [Errno 2] No such file or directory: 'data/Mouse_metadata.csv'

In [3]:
# Wrap the loaded mouse metadata in a DataFrame and display it
mouse_metadata_df = pd.DataFrame(data=mouse_metadata)
mouse_metadata_df

NameError: name 'mouse_metadata' is not defined

In [None]:
# Wrap the loaded study results in a DataFrame and display it
study_results_df = pd.DataFrame(data=study_results)
study_results_df

In [None]:
# Outer-join the mouse metadata with the study results on the shared "Mouse ID" key
mousedata_studyresults_df = mouse_metadata_df.merge(
    study_results_df,
    how="outer",
    on="Mouse ID",
)

# Preview the merged table
mousedata_studyresults_df

In [None]:
# Count the distinct mouse IDs present in the merged data
mice_count = len(pd.unique(mousedata_studyresults_df["Mouse ID"]))
mice_count

In [None]:
# Getting the Mouse ID numbers that show up with duplicate Timepoints.
# duplicated() with no subset only flags rows that are identical in EVERY
# column, so a mouse with two different measurements at the same timepoint
# would be missed. Restricting the check to the ID/timepoint pair flags any
# repeated (Mouse ID, Timepoint) combination.
duplicate_timepoint_rows = mousedata_studyresults_df.duplicated(subset=["Mouse ID", "Timepoint"])
mousedata_studyresults_duplicates_df = mousedata_studyresults_df.loc[duplicate_timepoint_rows, "Mouse ID"]
mousedata_studyresults_duplicates_df

In [None]:
# Optional: show every row recorded for the duplicated mouse ID
is_duplicate_mouse = mousedata_studyresults_df["Mouse ID"].eq("g989")
mouse_data_duplicate_data = mousedata_studyresults_df[is_duplicate_mouse]
mouse_data_duplicate_data

In [None]:
mousedata_studyresults_new_df = mousedata_studyresults_df.set_index("Mouse ID")
mousedata_studyresults_new_df

In [None]:
# Create a clean DataFrame by dropping all data associated with the duplicate mouse by its ID.
mousedata_studyresults_clean_new_df = mousedata_studyresults_new_df.drop(index = "g989")
mousedata_studyresults_clean_new_df = mousedata_studyresults_clean_new_df.reset_index()
mousedata_studyresults_clean_new_df

In [None]:
# Count the distinct mouse IDs remaining in the clean DataFrame
mice_count_clean = len(pd.unique(mousedata_studyresults_clean_new_df["Mouse ID"]))
mice_count_clean

## Summary Statistics

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume 
# for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.
combined_data_mean= mousedata_studyresults_clean_new_df.groupby(["Drug Regimen"]).mean()["Tumor Volume (mm3)"]
combined_data_mean

In [None]:
combined_data_median = mousedata_studyresults_clean_new_df.groupby(["Drug Regimen"]).median()["Tumor Volume (mm3)"]
combined_data_median

In [None]:
combined_data_var = mousedata_studyresults_clean_new_df.groupby(["Drug Regimen"]).var()["Tumor Volume (mm3)"]
combined_data_var



In [None]:
combined_data_std = mousedata_studyresults_clean_new_df.groupby(["Drug Regimen"]).std()["Tumor Volume (mm3)"]
combined_data_std

In [None]:
combined_data_SEM = mousedata_studyresults_clean_new_df.groupby(["Drug Regimen"]).sem()["Tumor Volume (mm3)"]
combined_data_SEM

In [None]:
# Assemble the per-regimen statistic Series into one summary table,
# one column per statistic, indexed by drug regimen.
summary_columns = {
    "MEAN": combined_data_mean,
    "MEDIAN": combined_data_median,
    "VARIANCE": combined_data_var,
    "STANDARD DEVIATION": combined_data_std,
    "STANDARD OF ERROR": combined_data_SEM,
}
combined_data_df = pd.DataFrame(summary_columns)
# head() previews only the first five regimens
combined_data_df.head()

In [None]:
# Using the aggregation method, produce the same summary statistics in a single line.
# agg() computes all five statistics of the tumor volume per drug regimen in
# one pass; averaging the already-computed per-regimen statistics would
# collapse the table to a single row and is not the same summary.
aggregation_df = mousedata_studyresults_clean_new_df.groupby("Drug Regimen")["Tumor Volume (mm3)"].agg(["mean", "median", "var", "std", "sem"])
aggregation_df

In [None]:
# Average each statistic column of the summary table down its rows,
# giving one value per statistic.
aggregated_df = combined_data_df.mean(axis=0)
aggregated_df

## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pandas.
total_measurements = mousedata_studyresults_clean_new_df["Drug Regimen"].unique()
total_measurements

In [None]:
total_measurement_regimens = pd.Series(total_measurements)
total_measurement_regimens

In [None]:
total_measurements_regimen_counts = mousedata_studyresults_clean_new_df["Drug Regimen"].value_counts()
total_measurements_regimen_counts

In [None]:
# Bar plot (via pandas) of the number of measurements taken on each drug regimen
drug_regimen_plot = total_measurements_regimen_counts.plot.bar()
drug_regimen_plot.set_ylim(bottom=0, top=250)
drug_regimen_plot.set_ylabel("Times Administered")
drug_regimen_plot.set_title("Number of Times Each Drug was Administered")

In [None]:
# Generate a bar plot showing the total number of measurements taken on each drug regimen using pyplot.
# The x labels are taken from the counts' own index so every bar is labelled
# with the regimen it actually counts. value_counts() orders regimens by
# descending count, which generally differs from the order-of-appearance list
# returned by unique(), so pairing the two would mislabel the bars.
plt.bar(
    total_measurements_regimen_counts.index,
    total_measurements_regimen_counts.values,
    color="r",
    alpha=0.5,
    align="center",
)
# Leave headroom above the tallest bar
plt.ylim(0, total_measurements_regimen_counts.max() + 50)
plt.title("Number of Times Each Drug was Administered")
plt.xlabel("Drug Name")
plt.ylabel("Times Administered")
plt.xticks(rotation=60)
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
mouse_sex_counts = mousedata_studyresults_clean_new_df["Sex"].value_counts()
mouse_sex_counts

In [None]:
mouse_sex_counts_chart = mouse_sex_counts.plot(kind= "pie", y= "Sex")
mouse_sex_counts_chart.set_title("Mice: Males vs Females")

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Labels and colours are derived from the counts' index so each wedge is
# matched to its own category whichever sex value_counts() lists first;
# a hard-coded ["Males", "Females"] order is only right when males outnumber females.
# NOTE(review): assumes the "Sex" column holds "Male"/"Female" — confirm against the data.
color_by_sex = {"Male": "red", "Female": "blue"}
labels = [f"{sex}s" for sex in mouse_sex_counts.index]
values = mouse_sex_counts
colors = [color_by_sex.get(sex, "gray") for sex in mouse_sex_counts.index]
plt.pie(values, labels=labels, colors=colors)
plt.axis("equal")


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
mousedata_studyresults_clean_new_df

In [None]:
mousedata_studyresults_clean_new_df.columns

In [None]:
mousedata_studyresults_clean_new_df["Timepoint"] = mousedata_studyresults_clean_new_df.loc[:, "Timepoint"].astype(float)
mousedata_studyresults_clean_new_df

In [None]:
final_tumor_time = mousedata_studyresults_clean_new_df.groupby(["Mouse ID"])
final_tumor_time_series = final_tumor_time["Timepoint"].max()
final_tumor_time_series 

In [None]:
final_tumor_time_series = final_tumor_time_series.reset_index()
final_tumor_time_series

In [None]:
tumor_df = pd.merge(final_tumor_time_series, mousedata_studyresults_clean_new_df, on=["Mouse ID", "Timepoint"], how="inner")
tumor_df

In [None]:
tumor_by_drug_regimen = ["Capomulin","Ramicane", "Infubinol", "Ceftamin"]


In [None]:
# Put treatments into a list for a for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. (use for loop)

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset (append the series from loc into empty tumor volume list)
    
    
    # Determine outliers using upper and lower bounds

treatments= []

for treatment in tumor_by_drug_regimen:
    drug_list = tumor_df.loc[tumor_df["Drug Regimen"] == treatment, "Tumor Volume (mm3)"]
    
    treatments.append(drug_list)

    quartiles = drug_list.quantile([.25,.5,.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq-lowerq
 
    lower_bound = lowerq - (1.5*iqr)
    upper_bound = upperq + (1.5*iqr)

    outlier_treatments = drug_list.loc[(drug_list < lower_bound) | (drug_list > upper_bound)]
    print(outlier_treatments)
    

 

In [None]:
# Box plot of the final tumor volume of each mouse across the four regimens of interest
fig1, ax1 = plt.subplots()
ax1.boxplot(treatments)
# Tick labels follow the order in which the regimens were appended to `treatments`
ax1.set_xticklabels(["Capomulin", "Ramicane", "Infubinol", "Ceftamin"])
ax1.set_ylabel('Tumor Volume (mm3)')
ax1.set_title('Tumor Volume by Drug Regimen')
plt.show()

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
capomulin_tumors = mousedata_studyresults_clean_new_df.loc[mousedata_studyresults_clean_new_df["Drug Regimen"] == "Capomulin",:]
capomulin_tumors.head()

In [None]:
tumor_time_s185 = capomulin_tumors.loc[capomulin_tumors["Mouse ID"] == "s185", :]
tumor_time_s185

In [None]:
tumor_time_s185.dtypes

In [None]:
tumor_time_s185.plot(x = "Timepoint", y = "Tumor Volume (mm3)")
plt.xlabel("Time")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Tumor Growth over Time in Mouse s185")

## Line and Scatter Plots

In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


In [None]:
average_tumor_volume = capomulin_tumors.groupby(["Mouse ID"])
average_tumor_volume_new = average_tumor_volume.mean()
average_tumor_volume_new

In [None]:
tumor_by_weight = average_tumor_volume_new.reset_index()
tumor_by_weight

In [None]:
plot = tumor_by_weight.plot.scatter("Tumor Volume (mm3)", "Weight (g)")
plt.show()

## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen.
#
# Columns are selected by name, matching the axis labels below. Positional
# iloc lookups depend on column order, and position 0 after reset_index() is
# the "Mouse ID" column, which cannot be regressed.
x_values = tumor_by_weight["Tumor Volume (mm3)"]
y_values = tumor_by_weight["Weight (g)"]
(slope, intercept, r_value, p_value, slope_std_error) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope, 2)) + "x + " + str(round(intercept, 2))

# linregress's r value is the Pearson correlation coefficient of x and y
print(f"The correlation coefficient is: {round(r_value, 2)}")

plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
# Axes-fraction coordinates keep the equation inside the plot whatever the
# data range is; a fixed data point such as (0, 50) can fall outside the view.
plt.annotate(line_eq, (0.05, 0.9), xycoords="axes fraction", fontsize=15, color="red")
plt.xlabel('Tumor Volume (mm3)')
plt.ylabel('Weight (g)')
plt.show()



In [None]:
# Report the coefficient of determination from the regression above
print("The r-squared is: {}".format(r_value**2))