## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
print(mouse_metadata.count())
print(study_results.count())

study_data = pd.merge(mouse_metadata, study_results, on = 'Mouse ID', how="outer")

Mouse ID        249
Drug Regimen    249
Sex             249
Age_months      249
Weight (g)      249
dtype: int64
Mouse ID              1893
Timepoint             1893
Tumor Volume (mm3)    1893
Metastatic Sites      1893
dtype: int64


In [2]:
mouse_count = study_data["Mouse ID"].nunique()
mouse_count

# Checking the number of mice in the DataFrame.


249

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
duplicate_mouse = study_data.loc[study_data.duplicated(subset=["Mouse ID", "Timepoint"]),"Mouse ID"].unique()
#duplicate_mouse=study_data.loc[study_data.duplicated(subset=["Mouse ID", "Timepoint"]), "Mouse ID"].unique()
duplicate_mouse

array(['g989'], dtype=object)

In [23]:
# Optional: Get all the data for the duplicate mouse ID. 
mouse_info = study_data.loc[study_data['Mouse ID'] = 'g989']]

SyntaxError: invalid syntax (<ipython-input-23-d7a68f7e7bcf>, line 2)

In [5]:
# Create a cleanDataFrame by dropping the duplicate mouse by its ID.
cleaned_data = study_data[study_data['Mouse ID'].isin(duplicate_mouse)== False]
#mouse_df.head()

In [6]:
# Checking the number of mice in the clean DataFrame.
cleaned_data['Mouse ID'].nunique()

248

## Summary Statistics

In [7]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen


mean= cleaned_data.groupby(['Drug Regimen']).mean()
median = cleaned_data.groupby(['Drug Regimen']).median()
variance = cleaned_data.groupby(['Drug Regimen']).var()
stanDev = cleaned_data.groupby(['Drug Regimen']).std()
sem = cleaned_data.groupby(['Drug Regimen']).sem()


stat_summary = pd.DataFrame({"Mean":mean['Tumor Volume (mm3)'], 
                             "Median": median['Tumor Volume (mm3)'], 
                             "Variance": variance['Tumor Volume (mm3)'], 
                             "Standard Deviation": stanDev['Tumor Volume (mm3)'] , 
                             "SEM": sem['Tumor Volume (mm3)']})
stat_summary

# This method is the most straightforward, creating multiple series and putting them all together at the end.

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331
Propriva,52.32093,50.446266,43.852013,6.622085,0.544332
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398


In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# This method produces everything in a single groupby function.

## Bar Plots

In [9]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pandas.

In [10]:
# Generate a bar plot showing the number of mice per time point for each treatment throughout the course of the study using pyplot.

## Pie Plots

In [11]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 


In [14]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and Scatter Plots

In [15]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin


In [16]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen



## Correlation and Regression

In [17]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
