## Observations and Insights

## Dependencies and starter code

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata = "Mouse_metadata.csv"
study_results = "Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
combined_df = pd.merge(study_results,mouse_metadata,on='Mouse ID',how='left')

# Display the data table for preview
combined_df.head()

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.0,0,Capomulin,Female,9,22
1,f932,0,45.0,0,Ketapril,Male,15,29
2,g107,0,45.0,0,Ketapril,Female,2,29
3,a457,0,45.0,0,Ketapril,Female,11,30
4,c819,0,45.0,0,Ketapril,Male,21,25


## Summary statistics

In [14]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

drug = combined_df.groupby('Drug Regimen')
drug_types = drug['Drug Regimen'].count()
drug_types

combined_df.describe()


Unnamed: 0,Timepoint,Tumor Volume (mm3),Metastatic Sites,Age_months,Weight (g)
count,1893.0,1893.0,1893.0,1893.0,1893.0
mean,19.572108,50.448381,1.021659,12.81458,25.662441
std,14.07946,8.894722,1.137974,7.189592,3.921622
min,0.0,22.050126,0.0,1.0,15.0
25%,5.0,45.0,0.0,7.0,25.0
50%,20.0,48.951474,1.0,13.0,27.0
75%,30.0,56.2922,2.0,20.0,29.0
max,45.0,78.567014,4.0,24.0,30.0


In [3]:
# Store the Mean "Tumor Volume Data" Grouped by Drug and Timepoint 
combine_group_mean = combined_df.groupby(["Drug Regimen","Timepoint"]).mean()
combine_group_mean.reset_index(level = None, inplace = True)

# Convert to DataFrame
tumor_response_mean_df = pd.DataFrame(combine_group_mean)

# Preview DataFrame 
tumor_response_mean_df.head()

Unnamed: 0,Drug Regimen,Timepoint,Tumor Volume (mm3),Metastatic Sites,Age_months,Weight (g)
0,Capomulin,0,45.0,0.0,14.04,20.12
1,Capomulin,5,44.266086,0.16,14.04,20.12
2,Capomulin,10,43.084291,0.32,14.04,20.12
3,Capomulin,15,42.064317,0.375,13.666667,20.0
4,Capomulin,20,40.716325,0.652174,13.304348,19.782609


In [4]:
# Store the Standard Error of Tumor Volumes Grouped by Drug and Timepoint
combine_group_std = combined_df.groupby(["Drug Regimen","Timepoint"]).sem()
combine_group_std.reset_index(level = None, inplace = True)

# Convert to DataFrame
tumor_response_sem_tumorvol_df = pd.DataFrame(combine_group_std)

# Preview DataFrame 
tumor_response_sem_tumorvol_df.head()

Unnamed: 0,Drug Regimen,Timepoint,Age_months,Metastatic Sites,Mouse ID,Sex,Tumor Volume (mm3),Weight (g)
0,Capomulin,0,1.570011,0.0,,,0.0,0.57248
1,Capomulin,5,1.570011,0.074833,,,0.448593,0.57248
2,Capomulin,10,1.570011,0.125433,,,0.702684,0.57248
3,Capomulin,15,1.589899,0.132048,,,0.838617,0.583592
4,Capomulin,20,1.6169,0.161621,,,0.909731,0.565673


In [5]:
#Syntax: DataFrame.pivot(index=None, columns=None, values=None)[source]

# Minor Data Munging to Re-Format the Data Frames
pivot_table = tumor_response_mean_df.pivot(index ="Timepoint", columns = 'Drug Regimen', values = "Tumor Volume (mm3)")
pivot_table.reset_index(level = None, inplace = True)

#Preview the formatted table
pivot_table.head()

Drug Regimen,Timepoint,Capomulin,Ceftamin,Infubinol,Ketapril,Naftisol,Placebo,Propriva,Ramicane,Stelasyn,Zoniferol
0,0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
1,5,44.266086,46.503051,46.541247,47.389175,46.796098,47.125589,47.248967,43.944859,47.47083,46.851818
2,10,43.084291,48.285125,49.403909,49.582269,48.69421,49.423329,49.101541,42.531957,49.335368,48.689881
3,15,42.064317,50.094055,51.296397,52.399974,50.933018,51.359742,51.067318,41.495061,51.448025,50.779059
4,20,40.716325,52.157049,53.197691,54.920935,53.644087,54.364417,53.346737,40.238325,53.97008,53.170334


## Bar plots

In [20]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

tumor_size_bar = tumor_response_mean_df['Tumor Volume (mm3)']
timepoint_bar = [0,5,10,15,20]

df = pd.DataFrame({'Tumor Size': tumor_size_bar, 'Timepoint': timepoint_bar}, index = drug)
ax = df.plot.bar(rot=0)

ValueError: cannot copy sequence with size 230 to array axis with dimension 8

In [None]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

mean_drug_tumor_vol = []
x_axis = combined_df.groupby(['Drug Regimen'])  

plt.bar(x_axis, mean_drug_tumor_vol, color='r', alpha=0.5, align="center")  

tick_locations = [value for value in x_axis]
plt.xticks(tick_locations, ["Capomulin", "Ceftamin", "Infubinol", "Ketapril", "Naftisol", "Placebo", "Propriva", "Ramicane", "Stelasyn", "Zoniferal"])  # Sets the x limits of the current chart
plt.xlim(-0.75, len(x_axis)-0.25)  # Sets the y limits of the current chart
plt.ylim(0, max(users)+5000)  # Give our chart some labels and a tile
plt.title("Tumor Volume by Treatment (Drug Regimen)")
plt.xlabel("Drug Regimen")
plt.ylabel("Tumor Volume (mm3)")

## Pie plots

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

 # Labels for the sections of our pie chart
labels = ["Males", "Females"]

# The values of each section of the pie chart
sizes = []

# The colors of each section of the pie chart
colors = ["orange", "purple"]

# Tells matplotlib to seperate the "Humans" section from the others
explode = (0.1, 0, 0, 0)  

# Creates the pie chart based upon the values above

# Automatically finds the percentages of each part of the pie chart
plt.pie(sizes, explode=explode, labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)  

# Tells matplotlib that we want a pie chart with equal axes
plt.axis("equal")

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen