## Observations and Insights 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import scipy.stats as st

# Study data files
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single dataset
df = pd.merge(mouse_metadata, study_results, on="Mouse ID")

# Clean up column names
df = df.rename(columns={"Mouse ID": "Mouse_ID","Drug Regimen": "Drug_Regimen","Weight (g)":"Weight_Grams","Tumor Volume (mm3)":"Tumor_Volume_mm3","Metastatic Sites":"Metastatic_Sites" })

##If the index needs to be set at Mouse ID
#df.set_index('Mouse ID', inplace=True)



# Display the data table for preview
df.to_csv('MouseStudy.csv')
df
#print(df.head())

Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight_Grams,Timepoint,Tumor_Volume_mm3,Metastatic_Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [2]:
# Checking the number of mice.
# print(df['Mouse_ID'].nunique())
df['Mouse_ID'].value_counts()

g989    13
m550    10
r944    10
m957    10
k862    10
        ..
x336     1
x226     1
o848     1
l872     1
n482     1
Name: Mouse_ID, Length: 249, dtype: int64

In [3]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
# print(df['Mouse_ID'].value_counts())
# df.duplicated[['Mouse_ID','Timepoint']]
# print(df['Mouse_ID'].value_counts(),df['Timepoint'].value_counts())
# print(df.duplicated(subset=['Mouse_ID','Timepoint'],keep='last'))
duplicates = df.duplicated(subset=['Mouse_ID','Timepoint'])
duplicateMT = df.loc[duplicates == True]
duplicateMT


Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight_Grams,Timepoint,Tumor_Volume_mm3,Metastatic_Sites
909,g989,Propriva,Female,21,26,0,45.0,0
911,g989,Propriva,Female,21,26,5,47.570392,0
913,g989,Propriva,Female,21,26,10,49.880528,0
915,g989,Propriva,Female,21,26,15,53.44202,0
917,g989,Propriva,Female,21,26,20,54.65765,1


In [4]:
# Optional: Get all the data for the duplicate mouse ID. 



In [4]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
#dropna
Clean_Mouse_df = df.drop_duplicates(subset=['Mouse_ID'], keep='last')
Clean_Mouse_df


Unnamed: 0,Mouse_ID,Drug_Regimen,Sex,Age_months,Weight_Grams,Timepoint,Tumor_Volume_mm3,Metastatic_Sites
9,k403,Ramicane,Male,21,16,45,22.050126,1
19,s185,Capomulin,Female,3,17,45,23.343598,1
29,x401,Capomulin,Female,16,15,45,28.484033,0
39,m601,Capomulin,Male,22,17,45,28.430964,1
49,g791,Ramicane,Male,11,16,45,29.128472,1
...,...,...,...,...,...,...,...,...
1859,z314,Stelasyn,Female,21,28,5,45.934712,0
1862,z435,Propriva,Female,12,26,10,48.710661,0
1872,z581,Infubinol,Female,24,25,45,62.754451,3
1882,z795,Naftisol,Female,13,29,45,65.741070,3


In [5]:
# Checking the number of mice in the clean DataFrame.
#len()
# print(df['Mouse_ID'].nunique())
Clean_Mouse_df['Mouse_ID'].value_counts()


s141    1
f394    1
f932    1
f993    1
o331    1
       ..
v835    1
m546    1
s166    1
n923    1
m550    1
Name: Mouse_ID, Length: 249, dtype: int64

## Summary Statistics

In [30]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary dataframe.

regStatTumorMean_df = Clean_Mouse_df.groupby("Drug_Regimen")["Tumor_Volume_mm3"].mean()
regStatTumorMedian_df = Clean_Mouse_df.groupby("Drug_Regimen")["Tumor_Volume_mm3"].median()
# regStatTumorVariance_df = Clean_Mouse_df.groupby("Drug_Regimen")["Tumor_Volume_mm3"].var()
# regStatTumorStd_df = Clean_Mouse_df.groupby("Drug_Regimen")["Tumor_Volume_mm3"].std()
# regStatTumorSem_df = Clean_Mouse_df.groupby("Drug_Regimen")["Tumor_Volume_mm3"].sem()
# regStatTumorSem_df.rename( columns={1 :'SEM'}, inplace=True)
# regStatTumorSem_df
nwpd = regStatTumorMean_df.merge(regStatTumorMedian_df, how='inner',on='Drug_Regimen')
nwpd
# frames = [regStatTumorMean_df,regStatTumorMedian_df,regStatTumorVariance_df,regStatTumorStd_df,regStatTumorSem_df]
# result = pd.merge(regStatTumorMean_df,regStatTumorMedian_df,regStatTumorVariance_df,regStatTumorStd_df,regStatTumorSem_df,how="inner",on="Drug_Regimen")
# result
# result = regStatTumorMean_df.join([regStatTumorMedian_df,regStatTumorVariance_df,regStatTumorStd_df,regStatTumorSem_df])
# result
#



AttributeError: 'Series' object has no attribute 'merge'

In [None]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pandas.



In [None]:
# Generate a bar plot showing the total number of unique mice tested on each drug regimen using pyplot.



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas



In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [None]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [None]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
