In [2]:
# Dependencies
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import scipy.stats as sts

In [4]:
mouse = pd.read_csv('Mouse_metadata.csv')
study = pd.read_csv('Study_results.csv')

In [10]:
mouse.head()


Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g)
0,k403,Ramicane,Male,21,16
1,s185,Capomulin,Female,3,17
2,x401,Capomulin,Female,16,15
3,m601,Capomulin,Male,22,17
4,g791,Ramicane,Male,11,16


In [15]:
study.head()
print(study.shape)
print(mouse.shape)

(1893, 4)
(249, 5)


In [140]:
df = pd.merge(mouse, study, how='left', on="Mouse ID")

drug = df['Drug Regimen'].value_counts()
drug

df['Drug Regimen'].unique()

drugs = ['Ramicane', 'Capomulin', 'Infubinol', 'Placebo', 'Ceftamin',
       'Stelasyn', 'Zoniferol', 'Ketapril', 'Propriva', 'Naftisol']

df.head(10)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [129]:
grouped = df.groupby(['Drug Regimen'])
grouped.count().head()

Unnamed: 0_level_0,Mouse ID,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
Capomulin,230,230,230,230,230,230,230
Ceftamin,178,178,178,178,178,178,178
Infubinol,178,178,178,178,178,178,178
Ketapril,188,188,188,188,188,188,188
Naftisol,186,186,186,186,186,186,186
Placebo,181,181,181,181,181,181,181
Propriva,161,161,161,161,161,161,161
Ramicane,228,228,228,228,228,228,228
Stelasyn,181,181,181,181,181,181,181
Zoniferol,182,182,182,182,182,182,182


In [66]:
#Generate a summary statistics table consisting of the mean, median, variance, standard deviation, 
#and SEM of the tumor volume for each drug regimen.

tumor_mean = grouped['Tumor Volume (mm3)'].mean()
tumor_median = grouped['Tumor Volume (mm3)'].median()
tumor_std = grouped['Tumor Volume (mm3)'].std()
tumor_sem = grouped['Tumor Volume (mm3)'].sem()


In [101]:
by_drug = pd.DataFrame({'Avg Tumor Volume': tumor_mean,
                       'Median Tumor Volume': tumor_median,
                        'STD Tumor Volume': tumor_std,
                        'Tumor SEM': tumor_sem
                       
                       })



by_drug.index.name = 'Drug Regimen'


by_drug

Unnamed: 0_level_0,Avg Tumor Volume,Median Tumor Volume,STD Tumor Volume,Tumor SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,41.557809,4.994774,0.329346
Ceftamin,52.591172,51.776157,6.268188,0.469821
Infubinol,52.884795,51.820584,6.567243,0.492236
Ketapril,55.235638,53.698743,8.279709,0.60386
Naftisol,54.331565,52.509285,8.134708,0.596466
Placebo,54.033581,52.288934,7.821003,0.581331
Propriva,52.322552,50.854632,6.50777,0.512884
Ramicane,40.216745,40.673236,4.846308,0.320955
Stelasyn,54.233149,52.431737,7.710419,0.573111
Zoniferol,53.236507,51.818479,6.966589,0.516398


In [122]:


x_axis = np.arange(len(drugs))
tick_locations = [value+0.4 for value in x_axis]

avg = by_drug['Avg Tumor Volume']
med = by_drug['Median Tumor Volume']
std = by_drug['STD Tumor Volume']
sem = by_drug['Tumor SEM']



by_drug.index


Index(['Capomulin', 'Ceftamin', 'Infubinol', 'Ketapril', 'Naftisol', 'Placebo',
       'Propriva', 'Ramicane', 'Stelasyn', 'Zoniferol'],
      dtype='object', name='Drug Regimen')

In [128]:
mouse_count = df['Mouse ID'].nunique()
mouse_count

249

In [156]:
df.head(10)

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.0,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
5,k403,Ramicane,Male,21,16,25,33.464577,1
6,k403,Ramicane,Male,21,16,30,31.099498,1
7,k403,Ramicane,Male,21,16,35,26.546993,1
8,k403,Ramicane,Male,21,16,40,24.365505,1
9,k403,Ramicane,Male,21,16,45,22.050126,1


In [166]:
newdf = df[(df['Drug Regimen'] == 'Capomulin') & (df['Timepoint'] == 45) | (df['Drug Regimen'] == 'Ceftamin') & (df['Timepoint'] == 45) 
           | (df['Drug Regimen'] == 'Infubinol') & (df['Timepoint'] == 45) | 
                (df['Drug Regimen'] == 'Ramicane') & (df['Timepoint'] == 45)]


newdf.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
9,k403,Ramicane,Male,21,16,45,22.050126,1
19,s185,Capomulin,Female,3,17,45,23.343598,1
29,x401,Capomulin,Female,16,15,45,28.484033,0
39,m601,Capomulin,Male,22,17,45,28.430964,1
49,g791,Ramicane,Male,11,16,45,29.128472,1


In [167]:
newdf = newdf.groupby(['Drug Regimen'])

newdf.head()

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
9,k403,Ramicane,Male,21,16,45,22.050126,1
19,s185,Capomulin,Female,3,17,45,23.343598,1
29,x401,Capomulin,Female,16,15,45,28.484033,0
39,m601,Capomulin,Male,22,17,45,28.430964,1
49,g791,Ramicane,Male,11,16,45,29.128472,1
59,s508,Ramicane,Male,1,17,45,30.276232,0
74,m546,Ramicane,Male,18,16,45,30.564625,1
84,z578,Ramicane,Male,11,16,45,30.638696,0
104,u364,Capomulin,Male,18,17,45,31.023923,3
124,y793,Capomulin,Male,17,17,45,31.896238,2


In [145]:
top_four = ['Capomulin', 'Ceftamin', 'Infubinol', 'Ramicane']


if x in df:
    
    x = df.loc[df['Timepoint'] == 45, 'Tumor Volume (mm3)']






NameError: name 'x' is not defined

In [125]:
top_four = by_drug.iloc[[0,1,2,7],:]

top_four



Unnamed: 0_level_0,Avg Tumor Volume,Median Tumor Volume,STD Tumor Volume,Tumor SEM
Drug Regimen,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
Capomulin,40.675741,41.557809,4.994774,0.329346
Ceftamin,52.591172,51.776157,6.268188,0.469821
Infubinol,52.884795,51.820584,6.567243,0.492236
Ramicane,40.216745,40.673236,4.846308,0.320955


In [117]:
# Determine if there are any potential outliers in the average occupancy in California
quartiles = c['AveOccup'].quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq-lowerq

print(f"The lower quartile of occupancy is: {lowerq}")
print(f"The upper quartile of occupancy is: {upperq}")
print(f"The interquartile range of occupancy is: {iqr}")
print(f"The the median of occupancy is: {quartiles[0.5]} ")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

outlier_cond = (california_data['AveOccup'] < lower_bound) | (california_data['AveOccup'] > upper_bound)
outlier_occupancy = california_data.loc[outlier_cond]
print()
print(outlier_occupancy.shape)
outlier_occupancy.head(3)

NameError: name 'c' is not defined