## Observations and Insights 

In [18]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [19]:
mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [20]:
study_results.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,b128,0,45.0,0
1,f932,0,45.0,0
2,g107,0,45.0,0
3,a457,0,45.0,0
4,c819,0,45.0,0


In [21]:
# Combine the data into a single dataset
# Display the data table for preview
mouse_data_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")
print(mouse_data_df)

     Mouse ID Drug Regimen   Sex  Age_months  Weight (g)  Timepoint  \
0        k403     Ramicane  Male          21          16          0   
1        k403     Ramicane  Male          21          16          5   
2        k403     Ramicane  Male          21          16         10   
3        k403     Ramicane  Male          21          16         15   
4        k403     Ramicane  Male          21          16         20   
...       ...          ...   ...         ...         ...        ...   
1888     z969     Naftisol  Male           9          30         25   
1889     z969     Naftisol  Male           9          30         30   
1890     z969     Naftisol  Male           9          30         35   
1891     z969     Naftisol  Male           9          30         40   
1892     z969     Naftisol  Male           9          30         45   

      Tumor Volume (mm3)  Metastatic Sites  
0              45.000000                 0  
1              38.825898                 0  
2           

In [22]:
# Checking the number of mice
count = mouse_data_df["Mouse ID"].value_counts()
count

g989    13
w422    10
x581    10
t718    10
y163    10
        ..
u153     1
l872     1
h428     1
x336     1
t573     1
Name: Mouse ID, Length: 249, dtype: int64

In [23]:
# Counting the different drug regimens
count = mouse_data_df["Drug Regimen"].value_counts()
count

Capomulin    230
Ramicane     228
Ketapril     188
Naftisol     186
Zoniferol    182
Placebo      181
Stelasyn     181
Ceftamin     178
Infubinol    178
Propriva     161
Name: Drug Regimen, dtype: int64

In [24]:
# Getting a list of all the columns
mouse_data_df.columns

Index(['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)',
       'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites'],
      dtype='object')

In [49]:
# Getting the duplicate mice by ID number that shows up for Mouse ID and Timepoint. 
mouse_data_df = mouse_data_df.loc[(mouse_data_df["Mouse ID"].duplicated{subset="Timepoint", keep='first', inplace=False})]

SyntaxError: invalid syntax (<ipython-input-49-40de24f8e5ee>, line 2)

In [26]:
# Dropping duplicate timepoints
# For True in "Time Point" delete row
mouse_data_df = mouse_data_df.drop_duplicates(subset="Timepoint"
                                              if value in"Timepoint"=True
                                                  then 

mouse_data_df = mouse_data_df.loc[(
    mouse_data_df["Timepoint"]=True)]
mouse_data_df.head()

In [42]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
clean_mouse_df = mouse_data_df
clean_mouse_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [28]:
# Checking the number of mice in the clean DataFrame.
count = clean_mouse_df["Mouse ID"].value_counts()
count

k403    10
Name: Mouse ID, dtype: int64

## Summary Statistics

In [29]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
clean_mouse_df.describe()

Unnamed: 0,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
count,10.0,10.0,10.0,10.0,10.0
mean,21.0,16.0,22.5,32.358859,0.8
std,0.0,0.0,15.138252,6.82568,0.421637
min,21.0,16.0,0.0,22.050126,0.0
25%,21.0,16.0,11.25,27.685119,1.0
50%,21.0,16.0,22.5,33.231153,1.0
75%,21.0,16.0,33.75,34.816702,1.0
max,21.0,16.0,45.0,45.0,1.0


In [None]:
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume.      

# Extract the following columns: ""Drug Regiman" and "Tumor Volume (mm3)"
drug_regimen_df = clean_mouse_df.[["Drug Regimen", "Tumor Volume (mm3)"]]
drug_regimen_df.head()

In [None]:
# Rename Tumor volume
renamed_regimen_df = drug_regimen_df.rename(columns={"Tumor Volume (mm3)": "Tumor Volume"})
renamed_regimen_df

In [40]:
# Assemble the resulting series into a single summary dataframe.
# Create a dataframe of the average stats for each drug regimen.
regimen_group = renamed_regimen_df.groupby(["Drug Regimen"])

tvol_comparison_df = renamed_regimen_df.mean("Tumor Volume")
tvol_comparison_df
tvol_comparison_df = renamed_regimen_df.median("Tumor Volume")
tvol_comparison_df
tvol_comparison_df = renamed_regimen_df.var("Tumor Volume")
tvol_comparison_df
tvol_comparison_df = renamed_regimen_df.std("Tumor Volume")
tvol_comparison_df
tvol_comparison_df = renamed_regimen_df.sem("Tumor Volume")
tvol_comparison_df

In [8]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
print(tvol_comparison_df).head()

# Using the aggregation method, produce the same summary statistics in a single line
renamed_regimen_df.agg({"Tumor Volume" : ['mean', 'median', 'var', 'std', 'sem'])

## Bar and Pie Charts

In [9]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using Pandas.
plt.bar


In [10]:
# Generate a bar plot showing the total number of timepoints for all mice tested for each drug regimen using pyplot.



In [11]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas



In [12]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot



## Quartiles, Outliers and Boxplots

In [13]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse
clean_mouse_df.loc("Timepoint")

# Merge this group df with the original dataframe to get the tumor volume at the last timepoint


In [14]:
# Put treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)
tumor volume = [ ]


# Calculate the IQR and quantitatively determine if there are any potential outliers. 

    
    # Locate the rows which contain mice on each drug and get the tumor volumes
    
    
    # add subset 
    
    
    # Determine outliers using upper and lower bounds
    

In [15]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest


## Line and Scatter Plots

In [16]:
# Generate a line plot of tumor volume vs. time point for a mouse treated with Capomulin


In [17]:
# Generate a scatter plot of average tumor volume vs. mouse weight for the Capomulin regimen


## Correlation and Regression

In [18]:
# Calculate the correlation coefficient and linear regression model 
# for mouse weight and average tumor volume for the Capomulin regimen
