In [1]:
# Select matplotlib's interactive "notebook" backend for every figure created below.
# NOTE(review): this backend is tied to the classic Notebook front end; confirm it is
# available in the environment where this notebook is re-run.
%matplotlib notebook

import matplotlib.pyplot as plt
import pandas as pd
from scipy.stats import linregress
import numpy as np

# Locations of the two input CSV files (relative paths).
mouse_metadata_path, study_results_path = (
    "Pymaceuticals/data/Mouse_metadata.csv",
    "Pymaceuticals/data/Study_results.csv",
)

# Read each CSV into its own DataFrame, in the same order as the paths above.
mouse_metadata, study_results = (
    pd.read_csv(csv_path)
    for csv_path in (mouse_metadata_path, study_results_path)
)

In [2]:
# Outer-join the mouse metadata with the study measurements. No `on=` is given,
# so pandas joins on the column(s) the two frames have in common.
study_df = mouse_metadata.merge(study_results, how='outer')

# Preview the first rows of the combined table.
study_df.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1


In [3]:
# Collect every row whose (Mouse ID, Timepoint) pair occurs more than once.
# keep=False marks all members of each duplicate set, not only the repeats,
# so conflicting measurements can be compared side by side.
duplicate_rows = study_df[study_df.duplicated(['Mouse ID', 'Timepoint'], keep=False)]

# Bare last expression -> rich table display (print() would wrap and truncate the columns).
duplicate_rows

    Mouse ID Drug Regimen     Sex  Age_months  Weight (g)  Timepoint  \
908     g989     Propriva  Female          21          26          0   
909     g989     Propriva  Female          21          26          0   
910     g989     Propriva  Female          21          26          5   
911     g989     Propriva  Female          21          26          5   
912     g989     Propriva  Female          21          26         10   
913     g989     Propriva  Female          21          26         10   
914     g989     Propriva  Female          21          26         15   
915     g989     Propriva  Female          21          26         15   
916     g989     Propriva  Female          21          26         20   
917     g989     Propriva  Female          21          26         20   

     Tumor Volume (mm3)  Metastatic Sites  
908           45.000000                 0  
909           45.000000                 0  
910           48.786801                 0  
911           47.570392        

In [4]:
# Keep one row per (Mouse ID, Timepoint) pair; the first occurrence wins.
# NOTE(review): the duplicated rows shown earlier carry different tumor volumes,
# so "first wins" is an arbitrary choice. Confirm whether the affected mouse
# should be excluded from the study altogether instead.
clean_study_df = study_df.drop_duplicates(subset=['Mouse ID', 'Timepoint'], keep='first')

# Number of distinct mice remaining after de-duplication.
unique_mouse_ids = clean_study_df['Mouse ID'].unique()
print(len(unique_mouse_ids))

249


In [5]:
# Group the de-duplicated records by drug regimen. A scalar key is used instead
# of a one-element list so that group keys are plain regimen names.
# NOTE(review): newer pandas releases are understood to return 1-tuples as
# `.groups` keys when grouping by a length-1 list; confirm against the installed version.
grouped_study = clean_study_df.groupby('Drug Regimen')

# Descriptive statistics of tumor volume for each regimen.
grouped_study['Tumor Volume (mm3)'].describe()

Unnamed: 0_level_0,count,mean,std,min,25%,50%,75%,max
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
Capomulin,230.0,40.675741,4.994774,23.343598,37.685933,41.557809,45.0,48.158209
Ceftamin,178.0,52.591172,6.268188,45.0,47.208427,51.776157,56.801438,68.923185
Infubinol,178.0,52.884795,6.567243,36.321346,47.312353,51.820584,57.314444,72.226731
Ketapril,188.0,55.235638,8.279709,45.0,48.232987,53.698743,60.870951,78.567014
Naftisol,186.0,54.331565,8.134708,45.0,47.285874,52.509285,59.963034,76.668817
Placebo,181.0,54.033581,7.821003,45.0,47.459053,52.288934,59.916934,73.212939
Propriva,156.0,52.393463,6.568014,45.0,47.046068,50.909965,56.491585,72.455421
Ramicane,228.0,40.216745,4.846308,22.050126,36.674635,40.673236,45.0,47.622816
Stelasyn,181.0,54.233149,7.710419,45.0,48.047139,52.431737,58.719297,75.12369
Zoniferol,182.0,53.236507,6.966589,45.0,47.337876,51.818479,57.954259,73.324432


In [20]:
# Tumor-volume column of the per-regimen groups, shared by every statistic below.
tumor_by_regimen = grouped_study['Tumor Volume (mm3)']

# One Series per statistic, indexed by regimen and rounded to two decimals.
tv_mean = tumor_by_regimen.mean().round(2)
tv_med = tumor_by_regimen.median().round(2)
tv_var = tumor_by_regimen.var().round(2)
tv_std = tumor_by_regimen.std().round(2)
tv_sem = tumor_by_regimen.sem().round(2)

# Assemble the statistics side by side; dict order fixes the column order.
stat_columns = {
    'Mean': tv_mean,
    'Median': tv_med,
    'Variance': tv_var,
    'Standard Deviation': tv_std,
    'Standard Error of the Mean': tv_sem,
}
summary_stats = pd.DataFrame(stat_columns)

# Show up to 20 rows, which covers every regimen in the table.
summary_stats.head(20)

Unnamed: 0_level_0,Mean,Median,Variance,Standard Deviation,Standard Error of the Mean
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
Capomulin,40.68,41.56,24.95,4.99,0.33
Ceftamin,52.59,51.78,39.29,6.27,0.47
Infubinol,52.88,51.82,43.13,6.57,0.49
Ketapril,55.24,53.7,68.55,8.28,0.6
Naftisol,54.33,52.51,66.17,8.13,0.6
Placebo,54.03,52.29,61.17,7.82,0.58
Propriva,52.39,50.91,43.14,6.57,0.53
Ramicane,40.22,40.67,23.49,4.85,0.32
Stelasyn,54.23,52.43,59.45,7.71,0.57
Zoniferol,53.24,51.82,48.53,6.97,0.52


In [7]:
# Bar chart of record counts per drug regimen, drawn with matplotlib's pyplot API.
plt.figure(1)

# count() tallies the non-null 'Mouse ID' entries in each regimen group.
# NOTE(review): the table holds several timepoint rows per mouse, so this is the
# number of records, not distinct mice. Use .nunique() if distinct mice are
# what "Number of Mice" is meant to show.
mouse_count = grouped_study['Mouse ID'].count()

# Take tick labels from the plotted Series itself so bars and labels cannot get out of step.
drug_names = list(mouse_count.index)
x_axis = np.arange(len(mouse_count))
tick_locations = list(x_axis)

plt.bar(x_axis, mouse_count)
plt.xticks(tick_locations, drug_names, rotation='vertical')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Mice')
plt.title("Number of Mice per Drug Regimen")  # fixed typo: "Numer" -> "Number"
plt.legend(['Mouse ID'], loc='best')
plt.show()

<IPython.core.display.Javascript object>

In [8]:
# Same bar chart as above, drawn through the pandas plotting interface.
plt.figure(2)
mouse_count.plot(kind='bar', figsize=(5, 5), color='blue')
plt.xlabel('Drug Regimen')
plt.ylabel('Number of Mice')
plt.title("Number of Mice per Drug Regimen")  # fixed typo: "Numer" -> "Number"
plt.legend()
plt.show()



<IPython.core.display.Javascript object>

In [9]:
# Pie chart of record counts by sex, drawn with matplotlib's pyplot API.
plt.figure(3)
sex_grouped = clean_study_df.groupby('Sex')
sex_count = sex_grouped['Mouse ID'].count()

# groupby sorts its keys, so the wedges come out as Female, Male. Take the
# labels from the counted Series rather than hard-coding an order; the previous
# ['Male', 'Female'] list attached each label to the other sex's wedge.
labels = list(sex_count.index)

# Pin each colour to a sex so it follows the label, whatever the order.
color_by_sex = {'Male': 'lightskyblue', 'Female': 'lightcoral'}
colors = [color_by_sex.get(sex, 'lightgray') for sex in labels]

plt.pie(sex_count, labels=labels, colors=colors, autopct='%1.1f%%',
        shadow=False, startangle=140)
plt.axis('equal')  # equal aspect ratio keeps the pie circular
plt.legend()
plt.show()


<IPython.core.display.Javascript object>

In [10]:
# Same sex breakdown, drawn through the pandas plotting accessor.
plt.figure(4)
sex_count.plot.pie(autopct='%1.1f%%')  # wedge labels come from the Series index
plt.legend()
plt.show()

<IPython.core.display.Javascript object>

In [11]:
def _final_tumor_volumes(regimen_df):
    """Return, per mouse, the tumor volume recorded at its latest timepoint.

    Rows are sorted by mouse and timepoint first, so `.last()` picks the latest
    measurement regardless of the incoming row order. `.last()` returns the
    last non-null value in each group.

    Parameters
    ----------
    regimen_df : pandas.DataFrame
        Study records containing 'Mouse ID', 'Timepoint' and
        'Tumor Volume (mm3)' columns.

    Returns
    -------
    pandas.Series
        Final tumor volume indexed by 'Mouse ID'.
    """
    ordered = regimen_df.sort_values(['Mouse ID', 'Timepoint'])
    return ordered.groupby('Mouse ID')['Tumor Volume (mm3)'].last()


# Build the per-regimen subsets from the de-duplicated table (not the raw merge),
# so that every downstream figure uses the same cleaned data.
capomulin_df = clean_study_df.loc[clean_study_df['Drug Regimen'] == 'Capomulin']
ramicane_df = clean_study_df.loc[clean_study_df['Drug Regimen'] == 'Ramicane']
infubinol_df = clean_study_df.loc[clean_study_df['Drug Regimen'] == 'Infubinol']
ceftamin_df = clean_study_df.loc[clean_study_df['Drug Regimen'] == 'Ceftamin']

# One final tumor volume per mouse for each regimen.
capomulin_end = _final_tumor_volumes(capomulin_df)
ramicane_end = _final_tumor_volumes(ramicane_df)
infubinol_end = _final_tumor_volumes(infubinol_df)
ceftamin_end = _final_tumor_volumes(ceftamin_df)



In [12]:
def _report_quartiles(regimen, final_volumes, leading_blank=True):
    """Print and return the quartile / IQR outlier summary for one regimen.

    Each figure is rounded to two decimals before it feeds the next calculation,
    so the printed IQR equals the difference of the printed quartiles and the
    printed bounds follow from the printed IQR.

    Parameters
    ----------
    regimen : str
        Heading printed above the summary.
    final_volumes : pandas.Series
        Final tumor volume per mouse for this regimen.
    leading_blank : bool, default True
        Print an empty line before the heading, to separate consecutive reports.

    Returns
    -------
    tuple
        (quartiles, lowerq, upperq, iqr, lower_bound, upper_bound)
    """
    quartiles = final_volumes.quantile([.25, .5, .75])
    lowerq = round(quartiles[.25], 2)
    upperq = round(quartiles[.75], 2)
    iqr = round(upperq - lowerq, 2)
    # 1.5 * IQR beyond each quartile marks the range outside which values are flagged.
    lower_bound = round(lowerq - (1.5 * iqr), 2)
    upper_bound = round(upperq + (1.5 * iqr), 2)

    if leading_blank:
        print()
    print(regimen)
    print('------------------')
    print(f"The lower quartile is {lowerq}.")
    print(f"The upper quartile is {upperq}.")
    print(f"The IQR is {iqr}")
    print(f"Values below {lower_bound} could be outliers.")
    print(f"Values above {upper_bound} could be outliers.")

    return quartiles, lowerq, upperq, iqr, lower_bound, upper_bound


# The first report has no leading blank line; the rest are separated by one.
(caquartiles, calowerq, caupperq, caiqr,
 calower_bound, caupper_bound) = _report_quartiles('Capomulin', capomulin_end,
                                                   leading_blank=False)

(rquartiles, rlowerq, rupperq, riqr,
 rlower_bound, rupper_bound) = _report_quartiles('Ramicane', ramicane_end)

(iquartiles, ilowerq, iupperq, iiqr,
 ilower_bound, iupper_bound) = _report_quartiles('Infubinol', infubinol_end)

(cequartiles, celowerq, ceupperq, ceiqr,
 celower_bound, ceupper_bound) = _report_quartiles('Ceftamin', ceftamin_end)

Capomulin
------------------
The lower quartile is 32.38.
The upper quartile is 40.16.
The IQR is 7.78
Values below 20.71 could be outliers.
Values above 51.83 could be outliers.

Ramicane
------------------
The lower quartile is 31.56.
The upper quartile is 40.66.
The IQR is 9.1
Values below 17.91 could be outliers.
Values above 54.31 could be outliers.

Infubinol
------------------
The lower quartile is 54.05.
The upper quartile is 65.53.
The IQR is 11.48
Values below 36.83 could be outliers.
Values above 82.75 could be outliers.

Ceftamin
------------------
The lower quartile is 48.72.
The upper quartile is 64.3.
The IQR is 15.58
Values below 25.35 could be outliers.
Values above 87.67 could be outliers.


In [13]:
# Box plot comparing the final tumor volumes of the four regimens of interest.
plt.figure(5)
# notch/sym are passed by keyword (previously the positional `0, 'rD'`):
# no notches, and outlier points drawn as red diamonds.
plt.boxplot([capomulin_end, ramicane_end, infubinol_end, ceftamin_end],
            notch=False, sym='rD')
# Boxes are drawn at positions 1..4 in the order the data were passed.
plt.xticks([1, 2, 3, 4], ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin'])
plt.title('Final Tumor Volume per Drug Regimen')
plt.xlabel('Drug Regimen')
plt.ylabel('Tumor Volume (mm3)')
plt.show()

<IPython.core.display.Javascript object>

In [14]:
# Per-mouse averages across all Capomulin records: weight and tumor volume.
capomulin_group = capomulin_df.groupby('Mouse ID')
capomulin_weight_mean, capomulin_tumor_mean = (
    capomulin_group[column].mean()
    for column in ('Weight (g)', 'Tumor Volume (mm3)')
)

In [15]:
# Records for the single Capomulin mouse 'l509'.
is_l509 = capomulin_df['Mouse ID'] == 'l509'
one_mouse = capomulin_df.loc[is_l509]
x_time, y_tumor = one_mouse['Timepoint'], one_mouse['Tumor Volume (mm3)']

# Line plot of that mouse's tumor volume against timepoint.
plt.figure(7)
plt.plot(x_time, y_tumor)
plt.gca().set(
    title='Tumor Volume Over Time of Mouse l509',
    xlabel='Timepoint',
    ylabel='Tumor Volume (mm3)',
)
plt.show()

<IPython.core.display.Javascript object>

In [17]:
# Least-squares fit of mean tumor volume against mean weight (one point per Capomulin mouse).
(slope, intercept, rvalue, pvalue, stderr) = linregress(capomulin_weight_mean,
                                                        capomulin_tumor_mean)
regress_values = slope * capomulin_weight_mean + intercept
line_eq = 'y = ' + str(round(slope, 2)) + 'x + ' + str(round(intercept, 2))

plt.figure(6)
# Label each artist directly so the legend cannot attach a name to the wrong
# artist; a positional label list depends on the order matplotlib collects handles in.
plt.scatter(capomulin_weight_mean, capomulin_tumor_mean, label='Mice')
plt.plot(capomulin_weight_mean, regress_values, 'r-', label='Line of Best Fit')
# Place the equation in axes-fraction coordinates (lower-right corner) so it is
# always inside the plot area; the previous data coordinates (6, 6) fall outside
# the range of the plotted weights and tumor volumes.
plt.annotate(line_eq, (0.95, 0.05), xycoords='axes fraction',
             ha='right', va='bottom', fontsize=12, color='black')
plt.title('Mice Weight vs Tumor Volume')
plt.xlabel('Mice Weight (g)')
plt.ylabel('Tumor Volume (mm3)')
plt.legend(loc='best')
plt.show()

<IPython.core.display.Javascript object>

In [18]:
# Display the Infubinol final tumor volumes (one value per Mouse ID) for inspection.
infubinol_end

Mouse ID
a203    67.973419
a251    65.525743
a577    57.031862
a685    66.083066
c139    72.226731
c326    36.321346
c895    60.969711
e476    62.435404
f345    60.918767
i386    67.289621
k483    66.196912
k804    62.117279
m756    47.010364
n671    60.165180
o809    55.629428
o813    45.699331
q132    54.656549
s121    55.650681
v339    46.250112
v719    54.048608
v766    51.542431
w193    50.005138
w584    58.268442
y163    67.685569
z581    62.754451
Name: Tumor Volume (mm3), dtype: float64