## Observations and Insights

## Dependencies and starter code

In [77]:
%matplotlib notebook

In [78]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np
from scipy.stats import linregress

# Study data files
mouse_metadata = "data/Mouse_metadata.csv"
study_results = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata)
study_results = pd.read_csv(study_results)

# Combine the data into a single dataset
merge_table = pd.merge(mouse_metadata, study_results, on="Mouse ID")


mouse_metadata.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


## Summary statistics

In [79]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen

#Creating a group based on the drug name
grouped_drug = merge_table.groupby(['Drug Regimen'])

#Counting the drugs to see if I did it right.
count_drugs = grouped_drug['Drug Regimen'].count()

#Showing it.
count_drugs

Drug Regimen
Capomulin    230
Ceftamin     178
Infubinol    178
Ketapril     188
Naftisol     186
Placebo      181
Propriva     161
Ramicane     228
Stelasyn     181
Zoniferol    182
Name: Drug Regimen, dtype: int64

In [80]:
#Creating the stats table for mean, median, variance, standard deviation, and SEM of the tumor volume
stats = merge_table.groupby(['Drug Regimen']).agg({'Tumor Volume (mm3)' : ['mean', 'median', 'var', 'std', 'sem', 'sum', 'count']})

#Showing it
stats

Unnamed: 0_level_0,Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3),Tumor Volume (mm3)
Unnamed: 0_level_1,mean,median,var,std,sem,sum,count
Drug Regimen,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
Capomulin,40.675741,41.557809,24.947764,4.994774,0.329346,9355.420462,230
Ceftamin,52.591172,51.776157,39.290177,6.268188,0.469821,9361.228582,178
Infubinol,52.884795,51.820584,43.128684,6.567243,0.492236,9413.493529,178
Ketapril,55.235638,53.698743,68.553577,8.279709,0.60386,10384.299876,188
Naftisol,54.331565,52.509285,66.173479,8.134708,0.596466,10105.671026,186
Placebo,54.033581,52.288934,61.168083,7.821003,0.581331,9780.078122,181
Propriva,52.322552,50.854632,42.35107,6.50777,0.512884,8423.930878,161
Ramicane,40.216745,40.673236,23.486704,4.846308,0.320955,9169.417875,228
Stelasyn,54.233149,52.431737,59.450562,7.710419,0.573111,9816.199991,181
Zoniferol,53.236507,51.818479,48.533355,6.966589,0.516398,9689.044192,182


## Bar plots

In [81]:
# Generate a bar plot showing number of data points for each treatment regimen using pandas

#Creating a dataframe of the drug counts
counts = merge_table['Drug Regimen'].value_counts()

count_df = counts.rename_axis('Drug Regimen').to_frame('counts')

count_df


Unnamed: 0_level_0,counts
Drug Regimen,Unnamed: 1_level_1
Capomulin,230
Ramicane,228
Ketapril,188
Naftisol,186
Zoniferol,182
Stelasyn,181
Placebo,181
Ceftamin,178
Infubinol,178
Propriva,161


In [82]:
# Using DataFrame.plot() to create a bar chart of the data
count_df.plot(kind="bar", figsize=(20,3))

# Title
plt.title("Drug Count")

plt.show()
plt.tight_layout()



<IPython.core.display.Javascript object>

In [89]:
# Generate a bar plot showing number of data points for each treatment regimen using pyplot

#Plotting bar graph
x_axis = np.arange(len(count_df))

plt.bar(x_axis, count_df['counts'], color="b", align="center")

# tick_locations = [value for value in x_axis]
# plt.xticks(tick_locations, count_df)



<BarContainer object of 10 artists>

## Pie plots

In [85]:
# Generate a pie plot showing the distribution of female versus male mice using pandas

#Creating a table of counts
counts = merge_table['Sex'].value_counts()

#Creating new dataframe
sex_count_df = df = counts.rename_axis('Sex').reset_index(name='Counts')

# sex_count_df

colors = ['red', 'blue']

# Trying to create a pie chart
plt.pie(
    # using data total)arrests
    sex_count_df['Counts'],
    # with the labels being officer names
    labels=sex_count_df['Sex'],
    # with no shadows
    shadow=False,
    # with colors
    colors=colors,
    # with the start angle at 90%
    startangle=90,
    # with the percent listed as a fraction
    autopct='%1.1f%%',
    )
plt.axis('equal')
plt.tight_layout()
plt.show()

In [86]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Trying to create a pie chart again
plt.pie(sex_count_df['counts'], labels=labels, colors=colors,
        autopct="%1.1f%%", shadow=True, startangle=140)
# Create axes which are equal so we have a perfect circle
plt.axis("equal")

plt.show()

KeyError: 'counts'

## Quartiles, outliers and boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the most promising treatment regimens. Calculate the IQR and quantitatively determine if there are any potential outliers. 
#Grouping by mouse id and setting the index
max_tumor = merge_table.groupby(["Mouse ID"]).max()
max_tumor = max_tumor.reset_index()

# Merging with original dataframe to get the tumor volume at the last timepoint
merged_data = max_tumor[['Mouse ID','Timepoint']].merge(merge_table,on=['Mouse ID','Timepoint'],how="left")
#Isolating the 4 drugs
merged_data = merged_data.loc[(merged_data['Drug Regimen'] == 'Capomulin') | (merged_data['Drug Regimen'] == 'Ramicane') | (merged_data['Drug Regimen'] == 'Infubinol') | (merged_data['Drug Regimen'] == 'Ceftamin')]
#Sorting by drug name and tumor volume
merged_data.sort_values(by=['Drug Regimen','Tumor Volume (mm3)'], inplace=True)

#merged_data


In [None]:
#Getting the IQR by forming quartiles

tumas = merged_data['Tumor Volume (mm3)']

quartiles = tumas.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of temperatures is: {lowerq}")
print(f"The upper quartile of temperatures is: {upperq}")
print(f"The interquartile range of temperatures is: {iqr}")
print(f"The the median of temperatures is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
# Generate a box plot of the final tumor volume of each mouse across four regimens of interest
#Creating df for each drug
cap = merged_data.loc[(merged_data['Drug Regimen'] == 'Capomulin')]
ram = merged_data.loc[(merged_data['Drug Regimen'] == 'Ramicane')]
inf = merged_data.loc[(merged_data['Drug Regimen'] == 'Infubinol')]
cef = merged_data.loc[(merged_data['Drug Regimen'] == 'Ceftamin')]  

#Creating a list of df to use in the box plot
drugs = [cap['Tumor Volume (mm3)'], ram['Tumor Volume (mm3)'], inf['Tumor Volume (mm3)'], cef['Tumor Volume (mm3)'] ]

In [None]:
#Box plot for tumor volume after putting dataframes by drug in a list

fig1, ax1 = plt.subplots()
ax1.set_title('Four Major Drugs Tumor Volume')
ax1.set_ylabel('Tumor Volume')
ax1.boxplot(drugs)
ax1.set_xticklabels(['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])
plt.show()

## Line and scatter plots

In [None]:
# Generate a line plot of time point versus tumor volume for a mouse treated with Capomulin
#Creating a df for the mouse m601 that's receiving Capomulin
mymouse = merge_table.loc[(merge_table['Mouse ID'] == 'm601')]

#Plotting the data 
mymouse.set_index('Timepoint')['Tumor Volume (mm3)'].plot(kind='line')  

plt.xlabel("Timepoint")
plt.ylabel("Tumor Volume (mm3)")
plt.title("Tumor Growth")
plt.show()

In [None]:
# Generate a scatter plot of mouse weight versus average tumor volume for the Capomulin regimen
#Singling out Capomulin
capped = merge_table.loc[(merge_table['Drug Regimen'] == 'Capomulin')]

#Creating the scatter plot
plt.scatter(capped.iloc[:,4],capped.iloc[:,6])
plt.title("Mouse Weight VS Average Tumor Volume")
plt.xlabel("Weight (g)")
plt.ylabel("Tumor Volume (mm3)")
plt.show()

In [None]:
# Calculate the correlation coefficient and linear regression model for mouse weight and average tumor volume for the Capomulin regimen
#Finding the correlation coefficient
weight = capped.iloc[:,4]
tumorv = capped.iloc[:,6]
correlation = st.pearsonr(weight,tumorv)
print(f"The correlation between both factors is {round(correlation[0],2)}")

In [88]:
#Finding the regression and showing it.

x_values = capped['Weight (g)']
y_values = capped['Tumor Volume (mm3)']
(slope, intercept, rvalue, pvalue, stderr) = linregress(x_values, y_values)
regress_values = x_values * slope + intercept
line_eq = "y = " + str(round(slope,2)) + "x + " + str(round(intercept,2))
plt.scatter(x_values,y_values)
plt.plot(x_values,regress_values,"r-")
plt.annotate(line_eq,(6,10),fontsize=15,color="red")
plt.xlabel('Weight')
plt.ylabel('Tumor Volume')
plt.show()