## Observations and Insights

## Dependencies and starter code

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Load the mouse metadata and the study results straight from their CSV files
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in ("data/Mouse_metadata.csv", "data/Study_results.csv")
)

# Join the two tables on the shared "Mouse ID" column to get one dataset
merged_df = mouse_metadata.merge(study_results, on="Mouse ID")

# Preview the first rows of the combined dataset
merged_df.head()

## Summary statistics

In [None]:
# Summary statistics table (mean, median, variance, standard deviation, SEM)
# of the tumor volume, computed separately for each drug regimen
by_drug = merged_df.groupby("Drug Regimen")["Tumor Volume (mm3)"]

# One per-regimen Series for each statistic, in the order they appear in the table
tumor_mean, tumor_median, tumor_variance, tumor_std, tumor_sem = (
    by_drug.agg(statistic)
    for statistic in ("mean", "median", "var", "std", "sem")
)

# Assemble the statistics side by side, one row per regimen
summary_columns = {
    "Mean": tumor_mean,
    "Median": tumor_median,
    "Variance": tumor_variance,
    "Standard Deviation": tumor_std,
    "SEM": tumor_sem,
}
by_drug_summary = pd.DataFrame(summary_columns)
by_drug_summary

## Bar plots

In [None]:
# Pandas bar plot: number of data points recorded for each treatment regimen
rows_by_regimen = merged_df.groupby("Drug Regimen")
tumor_data_point = rows_by_regimen["Metastatic Sites"].count()

# Draw the bars through the pandas plotting accessor, then label the axes object it returns
tumor_chart = tumor_data_point.plot.bar()
tumor_chart.set(
    ylabel="Number of Data Points",
    title="Number of Data Points Per Drug Regimen",
)
plt.show()

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot
# Count the rows per regimen from the merged data instead of hard-coding the numbers,
# so the bar heights and tick labels always agree with the dataset (the previous
# hand-typed labels contained a misspelled regimen name).
regimen_counts = merged_df.groupby("Drug Regimen")["Metastatic Sites"].count()
meta_sites = regimen_counts.tolist()
drug_regimen = np.arange(len(meta_sites))

# One bar per regimen, positioned at consecutive integer x locations
plt.bar(drug_regimen, meta_sites, color="b", alpha=0.5, align="center")

# Label each bar with the regimen name taken from the grouped index
tick_locations = list(drug_regimen)
plt.xticks(tick_locations, regimen_counts.index, rotation="vertical")

plt.title("Number of Data Points Per Drug Regimen")
plt.xlabel("Drug Regimen")
plt.ylabel("Number of Data Points")
plt.show()

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas
# Per-sex count of non-null values in every column of the merged dataset
gender_pie = merged_df.groupby(["Sex"]).count()

# A pie needs a single Series of sizes; the "Mouse ID" counts give the number of
# data points recorded for each sex.
gender_chart = gender_pie["Mouse ID"].plot(kind="pie", autopct="%1.1f%%", title="Sex of Mice")

# pandas labels the y axis with the column name by default; show what the slices mean instead
gender_chart.set_ylabel("Sex")
plt.show()

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot
# Derive the slice sizes and labels from the merged data rather than hard-coding
# them (the counts were previously typed in by hand, as strings).
sex_counts = merged_df.groupby("Sex")["Mouse ID"].count()
labels = sex_counts.index.tolist()
sizes = sex_counts.tolist()
colors = ["Red", "Blue"]
# No slice is pulled out of the pie; one zero offset per slice
explode = (0,) * len(sizes)

plt.pie(sizes, explode=explode, labels=labels, colors=colors, autopct="%1.1f%%")
plt.ylabel("Sex")
plt.show()

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
# Keep only Capomulin rows so the prompt cannot plot a mouse from another regimen
# under a "Capomulin Treatment" title.
capomulin_df = merged_df.loc[merged_df["Drug Regimen"] == "Capomulin"].set_index("Mouse ID")
capomulin_mouse = input("What is the Mouse ID that was treated with Capomulin?").strip()

# Fail with an explicit message instead of an opaque lookup error
if capomulin_mouse not in capomulin_df.index:
    raise KeyError("Mouse ID " + repr(capomulin_mouse) + " was not treated with Capomulin")

# Selecting with a list keeps the result a DataFrame even if the mouse has a single row.
# NOTE(review): assumes the study results provide a "Timepoint" column — confirm.
mouse_records = capomulin_df.loc[[capomulin_mouse]].sort_values("Timepoint")

# Use the timepoints actually recorded for this mouse; a fixed 0-45 list breaks the
# plot for any mouse with fewer than ten measurements.
timepoint = mouse_records["Timepoint"]
tumor_volume = mouse_records["Tumor Volume (mm3)"]

plt.plot(timepoint, tumor_volume, color="green")
plt.xlabel("Timepoint (Days)")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Capomulin Treatment of Mouse " + capomulin_mouse)
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen