# Pymaceuticals Inc.
---

### 3 Inferences

- The Summary Statistics table reports the variance of tumor volume for each drug regimen, which shows how closely or widely the individual measurements are spread around the mean.
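- Looking at the correlation between average tumor volume and mouse weight for the Capomulin regimen, there is a positive relationship: the higher the weight of a mouse, the higher its average tumor volume tends to be (a correlation, which does not by itself prove causation).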
- The bar graph shows that the highest number of mice tested were for Capomulin and the lowest were for Propriva.


In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
from scipy.stats import linregress
# import numpy as np

# Locations of the two CSV inputs used throughout the notebook
mouse_metadata_path, study_results_path = (
    "data/Mouse_metadata.csv",
    "data/Study_results.csv",
)

# Load the mouse metadata file and preview its first rows
mouse_data = pd.read_csv(filepath_or_buffer=mouse_metadata_path)
mouse_data.head(n=5)

In [None]:
# Load the study results file and preview its first rows
study_results = pd.read_csv(filepath_or_buffer=study_results_path)
study_results.head(n=5)

In [None]:
# Join the mouse metadata with the study results on the shared "Mouse ID"
# column; the outer join keeps IDs that appear in either table.
Pymaceuticals_df = mouse_data.merge(study_results, on="Mouse ID", how="outer")

# Preview the combined table
Pymaceuticals_df.head(n=5)

In [None]:
# Count the distinct mouse IDs in the combined data (missing IDs are not counted)
Number_of_mice = Pymaceuticals_df.loc[:, "Mouse ID"].nunique(dropna=True)
Number_of_mice

In [None]:
# Flag rows whose (Mouse ID, Timepoint) pair already appeared earlier in the
# table, then collect the distinct mouse IDs those repeated rows belong to.
is_repeated_row = Pymaceuticals_df.duplicated(subset=["Mouse ID", "Timepoint"])
Duplicate_Mice = Pymaceuticals_df.loc[is_repeated_row, "Mouse ID"].unique()
Duplicate_Mice

In [None]:
# Get all the data for the duplicate mouse ID(s).
# Select by the IDs detected in Duplicate_Mice rather than a hard-coded ID, so
# this view stays correct if the input data (and therefore the duplicates) change.
Duplicate_Mice_df = Pymaceuticals_df.loc[Pymaceuticals_df["Mouse ID"].isin(Duplicate_Mice), :]
Duplicate_Mice_df

In [None]:
# Create a clean DataFrame by dropping every row that belongs to a duplicated mouse ID.
# `~` negates the boolean mask element-wise (idiomatic replacement for `== False`).
Clean_Pymaceuticals_df = Pymaceuticals_df[~Pymaceuticals_df["Mouse ID"].isin(Duplicate_Mice)]
Clean_Pymaceuticals_df.head()

In [None]:
# Count the distinct mouse IDs remaining after the duplicates were removed
Number_of_mice = Clean_Pymaceuticals_df.loc[:, "Mouse ID"].nunique(dropna=True)
Number_of_mice

## Summary Statistics

In [None]:
# Build a per-regimen summary table of tumor volume: mean, median, variance,
# standard deviation, and standard error of the mean (SEM).

# Group the tumor-volume column by drug regimen once and reuse that grouping
# for every statistic.
tumor_by_regimen = Clean_Pymaceuticals_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
variance = tumor_by_regimen.var()
standard_deviation = tumor_by_regimen.std()
SEM = tumor_by_regimen.sem()

# Assemble the individual series into one DataFrame (one row per drug regimen)
summary_statistics = pd.DataFrame(
    {
        "Mean Tumor Volume": mean,
        "Median Tumor Volume": median,
        "Tumor Volume Variance": variance,
        "Tumor Volume Std. Dev.": standard_deviation,
        "Tumor Volume Std. Err.": SEM,
    }
)

# Show the summary table, indexed by 'Drug Regimen'
summary_statistics

In [None]:
# Produce the same per-regimen tumor-volume statistics (mean, median, variance,
# standard deviation, SEM) with a single aggregation call.
tumor_stats = ["mean", "median", "var", "std", "sem"]
tumor_volume_by_regimen = Clean_Pymaceuticals_df.groupby(["Drug Regimen"])[["Tumor Volume (mm3)"]]

summary_aggregation = tumor_volume_by_regimen.agg(tumor_stats)
summary_aggregation

## Bar and Pie Charts

In [None]:
# Bar chart (pandas plotting API) of how many rows, i.e. recorded timepoints,
# each drug regimen has in the cleaned data.
count_of_mice = Clean_Pymaceuticals_df["Drug Regimen"].value_counts()

bar_plot_pandas = count_of_mice.plot(kind="bar", color="b")

# Label the axes returned by the pandas plot call
bar_plot_pandas.set_title("Timepoints for mice tested per drug regimen")
bar_plot_pandas.set_xlabel("Drug Regimen")
bar_plot_pandas.set_ylabel("Number of Mice Tested")

plt.show()

In [None]:
# Same bar chart as the pandas version, drawn directly with pyplot:
# regimen names on the x axis, row counts per regimen as bar heights.
x_axis, y_axis = count_of_mice.index.values, count_of_mice.values

plt.bar(x=x_axis, height=y_axis, align="center", alpha=0.8, color="b")

# Rotate the regimen names so they do not overlap
plt.xticks(rotation="vertical")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Mice Tested")
plt.title("Timepoints for mice tested per drug regimen")

plt.show()

In [None]:
# Pie chart (pandas plotting API) of how the rows of the cleaned data split by sex
male_female_data = Clean_Pymaceuticals_df["Sex"].value_counts()

# autopct prints each slice's share as a percentage with one decimal place
male_female_data.plot(kind="pie", autopct="%1.1f%%")
plt.title("Female vs. Male Mice")

plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot.
# Labels and sizes are taken from the computed counts so each label always
# matches its slice (value_counts orders by frequency, not alphabetically).
labels = male_female_data.index
sizes = male_female_data.values

# autopct prints each slice's share as a percentage with one decimal place
plt.pie(sizes, labels=labels, autopct="%1.1f%%")
plt.title('Female vs. Male Mice')
plt.ylabel('Sex')

plt.show()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:
# Capomulin, Ramicane, Infubinol, and Ceftamin
drug_regimen = ["Capomulin", "Ramicane", "Infubinol", "Ceftamin"]

# Take the tumor volume recorded at the last (greatest) timepoint of each mouse.
# Rows are sorted by Timepoint first (stable sort), so "last row in the group"
# really is the greatest timepoint instead of depending on the input row order.
last_timepoint = (
    Clean_Pymaceuticals_df.sort_values("Timepoint", kind="stable")
    .groupby(["Drug Regimen", "Mouse ID"])
    .agg(tumor_size=("Tumor Volume (mm3)", lambda x: x.iloc[-1]))
)

# Reshape so every drug regimen becomes its own column (rows keyed by Mouse ID);
# a mouse has a value only in the column of the regimen it received, NaN elsewhere.
# Later cells select a regimen with last_timepoint[drug].
last_timepoint = last_timepoint.stack(level=0).unstack(level=0)

# Keep only the rows of the cleaned data that belong to the four regimens of interest
combined_drugs = Clean_Pymaceuticals_df[Clean_Pymaceuticals_df["Drug Regimen"].isin(drug_regimen)]

In [None]:
# For each regimen of interest, compute the quartiles and IQR of the final tumor
# volumes, derive the 1.5*IQR outlier bounds, and report any values outside them.
# All arithmetic uses the unrounded quartiles; rounding happens only when
# printing, so rounding error does not accumulate in the bounds.
print(f"------------------------------------------------------------")
for drug in drug_regimen:
    # Mice from other regimens are NaN in this column; drop them
    final_volumes = last_timepoint[drug].dropna()

    lowerq, upperq = final_volumes.quantile([0.25, 0.75])
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Final volumes that actually fall outside the bounds
    outliers = final_volumes[(final_volumes < lower_bound) | (final_volumes > upper_bound)]

    print(f"The lower quartile of {drug} treatments is {round(lowerq, 2)}")
    print(f"The upper quartile of {drug} treatments is {round(upperq, 2)}")
    print(f"The interquartile range of {drug} treatments is {round(iqr, 2)}")
    print(f"{drug}'s potential outliers: Values below {round(lower_bound, 2)} and Values above {round(upper_bound, 2)}")
    print(f"{drug}'s observed outliers: {outliers.round(2).tolist()}")
    print(f"------------------------------------------------------------")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

# One list of final tumor volumes per regimen (NaN entries belong to mice from
# other regimens and are dropped)
boxplot_list = [list(last_timepoint[drug].dropna()) for drug in drug_regimen]

fig = plt.figure()
plt.boxplot(boxplot_list)

# Set the regimen names AFTER drawing: boxplot manages the x ticks itself, so
# ticks set beforehand end up combined with its default 1..N tick labels.
plt.xticks(range(1, len(drug_regimen) + 1), drug_regimen, rotation=45)
plt.xlabel("Drug Regimen")
plt.ylabel("Final Tumor Volume (mm3)")
plt.title("Tumor Volume by Drug Regimen")
plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin
Capomulin_df = Clean_Pymaceuticals_df.loc[Clean_Pymaceuticals_df["Drug Regimen"] == "Capomulin",:]

# Mouse to plot; kept in one place so the title always matches the data shown
mouse_id = "l509"
lineplot_df = Capomulin_df.loc[Capomulin_df["Mouse ID"] == mouse_id,:]
x_axis = lineplot_df["Timepoint"]
tumor_volume = lineplot_df["Tumor Volume (mm3)"]

plt.title(f'Capomulin treatment of mouse {mouse_id}')
plt.plot(x_axis, tumor_volume, linewidth=1.5, color="b")
plt.xlabel('Timepoint (days)')
plt.ylabel('Tumor Volume (mm3)')

plt.show()


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen

# Per-mouse averages of the numeric columns. numeric_only=True skips the string
# columns (e.g. "Drug Regimen", "Sex"), which raise a TypeError when averaged
# on pandas >= 2.0.
average_capomulin_volume = Capomulin_df.groupby(['Mouse ID']).mean(numeric_only=True)

plt.scatter(average_capomulin_volume['Weight (g)'],average_capomulin_volume['Tumor Volume (mm3)'], color="blue")
plt.title('Mouse Weight Versus Average Tumor Volume')
plt.xlabel('Weight (g)',fontsize =14)
plt.ylabel('Average Tumor Volume (mm3)')

plt.show()


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model
# for mouse weight and average tumor volume for the Capomulin regimen
weight = average_capomulin_volume["Weight (g)"]
avg_tumor_volume = average_capomulin_volume["Tumor Volume (mm3)"]

# pearsonr returns (correlation coefficient, p-value); only the coefficient is reported
correlation = st.pearsonr(weight, avg_tumor_volume)
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation[0],2)}")

# Least-squares fit of average tumor volume as a linear function of weight
(slope, intercept, rvalue, pvalue, stderr) = linregress(weight, avg_tumor_volume)
regress_values = weight * slope + intercept
line_eq = f"y = {round(slope, 2)} x + {round(intercept, 2)}"

plt.scatter(weight, avg_tumor_volume, color='b')
plt.plot(weight, regress_values, color='red')
# Show the fitted equation on the chart; axes-fraction coordinates keep the
# text inside the plot area regardless of the data range.
plt.annotate(line_eq, xy=(0.05, 0.9), xycoords="axes fraction", color='red')
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Mouse Weight vs Tumor Volume for Capomulin")

plt.show()