In [161]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files
mouse_metadata_path = "Mouse_metadata.csv"
study_results_path = "Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)


In [162]:
#Combining the data into a single dataset
merg_df = pd.merge(mouse_metadata, study_results, on="Mouse ID")
#Displaying the data table for preview
merg_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0,k403,Ramicane,Male,21,16,0,45.000000,0
1,k403,Ramicane,Male,21,16,5,38.825898,0
2,k403,Ramicane,Male,21,16,10,35.014271,1
3,k403,Ramicane,Male,21,16,15,34.223992,1
4,k403,Ramicane,Male,21,16,20,32.997729,1
...,...,...,...,...,...,...,...,...
1888,z969,Naftisol,Male,9,30,25,63.145652,2
1889,z969,Naftisol,Male,9,30,30,65.841013,3
1890,z969,Naftisol,Male,9,30,35,69.176246,4
1891,z969,Naftisol,Male,9,30,40,70.314904,4


In [163]:
#Checking the number of mice 
merg_df2 = merg_df.loc[:, ['Mouse ID', 'Drug Regimen', 'Sex', 'Age_months', 'Weight (g)', 'Timepoint', 'Tumor Volume (mm3)', 'Metastatic Sites']]
merg_df2 = merg_df2.drop_duplicates(subset=['Mouse ID'])
count = merg_df2['Mouse ID'].count()
nunique = merg_df2['Timepoint'].nunique()
total_mice = count - nunique
pd.DataFrame({'Number Of Mice': [total_mice]})

Unnamed: 0,Number Of Mice
0,248


In [164]:
#Getting the duplicate mice by ID number that shows up for Mouse ID and TImepoint
merg_df = merg_df[merg_df.duplicated(subset=['Mouse ID', 'Timepoint'], keep=False)]
mouse_data = merg_df.set_index('Mouse ID')
time_point_data = mouse_data["Timepoint"]
mouse_summary = pd.DataFrame({'Timepoint': time_point_data})

mouse_summary

Unnamed: 0_level_0,Timepoint
Mouse ID,Unnamed: 1_level_1
g989,0
g989,0
g989,5
g989,5
g989,10
g989,10
g989,15
g989,15
g989,20
g989,20


In [165]:
#Getting all the data for the duplicatemouse ID
#pd.concat(g for _, g in merg_df.groupby(['Mouse ID', 'Timepoint']) if len(g) > 1)

In [166]:
#Getting all the data for the duplicatemouse ID
dup_df = merg_df[merg_df['Mouse ID'].duplicated() == True]   
dup_df

Unnamed: 0,Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909,g989,Propriva,Female,21,26,0,45.0,0
910,g989,Propriva,Female,21,26,5,48.786801,0
911,g989,Propriva,Female,21,26,5,47.570392,0
912,g989,Propriva,Female,21,26,10,51.745156,0
913,g989,Propriva,Female,21,26,10,49.880528,0
914,g989,Propriva,Female,21,26,15,51.325852,1
915,g989,Propriva,Female,21,26,15,53.44202,0
916,g989,Propriva,Female,21,26,20,55.326122,1
917,g989,Propriva,Female,21,26,20,54.65765,1


In [167]:
#Creating a lclean DataFrame by dropping the duplicate mouse by it's ID
mouse_data = merg_df2.set_index('Mouse ID')

clean_summary = pd.DataFrame(mouse_data.drop('g989'))
clean_summary

Unnamed: 0_level_0,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
Mouse ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
k403,Ramicane,Male,21,16,0,45.0,0
s185,Capomulin,Female,3,17,0,45.0,0
x401,Capomulin,Female,16,15,0,45.0,0
m601,Capomulin,Male,22,17,0,45.0,0
g791,Ramicane,Male,11,16,0,45.0,0
...,...,...,...,...,...,...,...
z314,Stelasyn,Female,21,28,0,45.0,0
z435,Propriva,Female,12,26,0,45.0,0
z581,Infubinol,Female,24,25,0,45.0,0
z795,Naftisol,Female,13,29,0,45.0,0


In [168]:
count_mice = mouse_data.nunique()
clean_summary = pd.DataFrame({'Number Of Mice': [count_mice]})

In [169]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
import numpy as np
import scipy.stats as sts
# Demonstrate calculating measures of tumors
tumor_count = merg_df['Tumor Volume (mm3)']
mean_numpy = np.mean(tumor_count)
print(f"The mean of tumor count is {mean_numpy}")
median_numpy = np.median(tumor_count)
print(f"The median of tumor count is {median_numpy}")
mode_scipy = sts.mode(tumor_count)
print(f"The mode of tumor count is {mode_scipy}")
print(f"----------------------------------------------------")
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen:
drug_regimen_group = merg_df.groupby('Drug Regimen')
# Count how many times each maker appears in our group
drug_regimen_count = drug_regimen_group['Drug Regimen'].count()
drug_regimen_count
# mean, median, variance, standard deviation, and SEM of the tumor volume.
var_numpy = np.var(tumor_count,ddof = 0)
print(f"The population variance using the NumPy module is {var_numpy}")
sd_numpy = np.std(tumor_count,ddof = 0)
print(f"The population standard deviation using the NumPy module is {sd_numpy}")
print(f"----------------------------------------------------")
print(f"Roughly 68% of the data is between {round(mean_numpy-sd_numpy,3)} and {round(mean_numpy+sd_numpy,3)}")
print(f"Roughly 95% of the data is between {round(mean_numpy-2*sd_numpy,3)} and {round(mean_numpy+2*sd_numpy,3)}")
print(f"Roughly 99.7% of the data is between {round(mean_numpy-3*sd_numpy,3)} and {round(mean_numpy+3*sd_numpy,3)}")
# Assemble the resulting series into a single summary dataframe.

# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
mean_numpy = clean_summary.groupby('Drug Regimen')['Tumor Volume (mm3)'].mean()
median_numpy = clean_summary.groupby('Drug Regimen')['Tumor Volume (mm3)'].median()
var_numpy = clean_summary.groupby('Drug Regimen')['Tumor Volume (mm3)'].var()
sd_numpy = clean_summary.groupby('Drug Regimen')['Tumor Volume (mm3)'].std()
sem = clean_summary.groupby('Drug Regimen')['Tumor Volume (mm3)'].sem()
# Using the aggregation method, produce the same summary statistics in a single line
summary = pd.DataFrame ({
    'Mean' : mean_numpy,
    'Median' : median_numpy,
    'Variance' : var_numpy,
    'Standard Deviation' : sd_numpy
})
summary

The mean of tumor count is 50.27345214
The median of tumor count is 50.603189935
The mode of tumor count is ModeResult(mode=array([45.]), count=array([2]))
----------------------------------------------------
The population variance using the NumPy module is 12.335359191773517
The population standard deviation using the NumPy module is 3.5121730014014854
----------------------------------------------------
Roughly 68% of the data is between 46.761 and 53.786
Roughly 95% of the data is between 43.249 and 57.298
Roughly 99.7% of the data is between 39.737 and 60.81


KeyError: 'Drug Regimen'