In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from scipy.stats import linregress
from scipy.stats import sem

# Hide warning messages in notebook
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Read the two source CSV files: the mouse-to-drug assignments and the clinical trial measurements
mouse_data = pd.read_csv("data/mouse_drug_data.csv")
clinical_data = pd.read_csv("data/clinicaltrial_data.csv")

# Combine both tables on their shared "Mouse ID" column.
# An outer join keeps rows from either file even when the other has no matching mouse.
tutti_data = mouse_data.merge(clinical_data, on="Mouse ID", how="outer")
tutti_data.head()

Unnamed: 0,Mouse ID,Drug,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,f234,Stelasyn,0,45.0,0
1,f234,Stelasyn,5,47.313491,0
2,f234,Stelasyn,10,47.904324,0
3,f234,Stelasyn,15,48.735197,1
4,f234,Stelasyn,20,51.112713,2


In [3]:
# General exploration of the data:
# column dtypes, non-null counts and memory usage of the merged frame
tutti_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1906 entries, 0 to 1905
Data columns (total 5 columns):
Mouse ID              1906 non-null object
Drug                  1906 non-null object
Timepoint             1906 non-null int64
Tumor Volume (mm3)    1906 non-null float64
Metastatic Sites      1906 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 89.3+ KB


In [4]:
# General exploration of the data:
# summary statistics (count, mean, std, min, quartiles, max) of the numeric columns
tutti_data.describe()

Unnamed: 0,Timepoint,Tumor Volume (mm3),Metastatic Sites
count,1906.0,1906.0,1906.0
mean,19.5383,50.46812,1.018363
std,14.064786,8.869571,1.135991
min,0.0,22.050126,0.0
25%,5.0,45.0,0.0
50%,20.0,48.991921,1.0
75%,30.0,56.288484,2.0
max,45.0,78.567014,4.0


In [5]:
# Second version, with Timepoint as "object" instead of integer.
# Derived from the already-merged tutti_data rather than re-reading and re-merging
# the CSV files: the merged rows are identical, only the Timepoint dtype differs.
# astype returns a new DataFrame, so tutti_data itself keeps its integer Timepoint
# and this cell can be re-run safely.
tutti_data_v2 = tutti_data.astype({"Timepoint": "object"})
tutti_data_v2.head()

Unnamed: 0,Mouse ID,Drug,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,f234,Stelasyn,0,45.0,0
1,f234,Stelasyn,5,47.313491,0
2,f234,Stelasyn,10,47.904324,0
3,f234,Stelasyn,15,48.735197,1
4,f234,Stelasyn,20,51.112713,2


In [6]:
# General exploration of the data
tutti_data_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1906 entries, 0 to 1905
Data columns (total 5 columns):
Mouse ID              1906 non-null object
Drug                  1906 non-null object
Timepoint             1906 non-null object
Tumor Volume (mm3)    1906 non-null float64
Metastatic Sites      1906 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 89.3+ KB


In [11]:
# Group the merged data by treatment and timepoint; every summary below reuses this grouping
g_by_d_and_t = tutti_data_v2.groupby(["Drug", "Timepoint"])

# Each summary is computed as a Series and then wrapped in a one-column DataFrame

# Mean tumor volume per (Drug, Timepoint)
mtv = g_by_d_and_t["Tumor Volume (mm3)"].mean()
mtv_by_drug_time_df = mtv.to_frame(name="Mean Tumor Volume (mm3)")

# Mean number of metastatic sites per (Drug, Timepoint)
mms = g_by_d_and_t["Metastatic Sites"].mean()
mms_by_drug_time_df = mms.to_frame(name="Mean Metastatic Sites")

# Number of distinct mice recorded per (Drug, Timepoint)
n_mice = g_by_d_and_t["Mouse ID"].nunique()
nmice_by_drug_time_df = n_mice.to_frame(name="Mouse Count")

# Standard error of the mean for tumor volume and for metastatic sites
mtv_se = g_by_d_and_t["Tumor Volume (mm3)"].sem()
se_mtv_by_drug_time_df = mtv_se.to_frame(name="Tumor Volume Standard Error")
mms_se = g_by_d_and_t["Metastatic Sites"].sem()
se_mms_by_drug_time_df = mms_se.to_frame(name="Metastatic Sites Standard Error")

# Task 1. Tumor Response to Treatment
# Task 1.1 Preview DataFrame
mtv_by_drug_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean Tumor Volume (mm3)
Drug,Timepoint,Unnamed: 2_level_1
Capomulin,0,45.000000
Capomulin,5,44.266086
Capomulin,10,43.084291
Capomulin,15,42.064317
Capomulin,20,40.716325
...,...,...
Zoniferol,25,55.432935
Zoniferol,30,57.713531
Zoniferol,35,60.089372
Zoniferol,40,62.916692


In [12]:
# Task 1.2 Store the Standard Error of Tumor Volumes Grouped by Drug and Timepoint
# The frame was already built alongside the other grouped summaries in an earlier cell;
# this cell only previews it.
se_mtv_by_drug_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Tumor Volume Standard Error
Drug,Timepoint,Unnamed: 2_level_1
Capomulin,0,0.000000
Capomulin,5,0.448593
Capomulin,10,0.702684
Capomulin,15,0.838617
Capomulin,20,0.909731
...,...,...
Zoniferol,25,0.602513
Zoniferol,30,0.800043
Zoniferol,35,0.881426
Zoniferol,40,0.998515


In [15]:
# 1.3 Minor Data Munging to Re-Format the Data Frames
# Reshape the mean tumor volumes with pivot_table so that each row is a Timepoint
# and each column is a Drug, then preview the first rows to check the reformatting worked
munging_mtv = mtv_by_drug_time_df.pivot_table(
    values="Mean Tumor Volume (mm3)",
    index="Timepoint",
    columns="Drug",
)
munging_mtv.head()

Drug,Capomulin,Ceftamin,Infubinol,Ketapril,Naftisol,Placebo,Propriva,Ramicane,Stelasyn,Zoniferol
Timepoint,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0,45.0
5,44.266086,46.503051,47.062001,47.389175,46.796098,47.125589,47.248967,43.944859,47.527452,46.851818
10,43.084291,48.285125,49.403909,49.582269,48.69421,49.423329,49.101541,42.531957,49.463844,48.689881
15,42.064317,50.094055,51.296397,52.399974,50.933018,51.359742,51.067318,41.495061,51.529409,50.779059
20,40.716325,52.157049,53.197691,54.920935,53.644087,54.364417,53.346737,40.238325,54.067395,53.170334


In [None]:
# 1.4 Generate the Plot (with Error Bars)
# Pivot the standard errors into the same Timepoint x Drug layout as munging_mtv,
# then align them to munging_mtv so each drug's means and error bars match row by row.
munging_mtv_se = pd.pivot_table(se_mtv_by_drug_time_df, values="Tumor Volume Standard Error",
                                index="Timepoint", columns="Drug")
munging_mtv_se = munging_mtv_se.reindex(index=munging_mtv.index, columns=munging_mtv.columns)

# Timepoint was cast to "object" in an earlier cell; convert it back to integers
# so matplotlib treats the x-axis as numeric.
timepoints = munging_mtv.index.astype(int)

fig, ax = plt.subplots()

# One scatter series (with error bars) per treatment
for drug in munging_mtv.columns:
    ax.errorbar(timepoints, munging_mtv[drug], yerr=munging_mtv_se[drug], fmt="o",
                alpha=0.5, capsize=3, label=drug)

# Leave a small margin so the first and last points are not drawn on the frame
ax.set_xlim(timepoints.min() - 2, timepoints.max() + 2)
ax.set_title("Tumor Response to Treatment")
ax.set_xlabel("Time (Days)")
ax.set_ylabel("Mean Tumor Volume (mm3)")
ax.grid(True, alpha=0.3)
ax.legend(loc="best", fontsize="small", fancybox=True)
plt.show()

# Save the Figure

# Show the Figure

In [None]:
# Task 2. Metastatic Response to Treatment
# Task 2.1 Preview DataFrame
# Mean metastatic sites per (Drug, Timepoint) group, built in an earlier cell
mms_by_drug_time_df

In [None]:
# Task 2.2 Store the Standard Error associated with Met. Sites Grouped by Drug and Timepoint
# The standard errors were already computed and converted to a DataFrame in an earlier cell;
# this cell only previews that DataFrame.
se_mms_by_drug_time_df

In [None]:
# 2.3 Minor Data Munging to Re-Format the Data Frames
# Reshape the mean metastatic sites with pivot_table so that each row is a Timepoint
# and each column is a Drug, then preview the first rows to check the reformatting worked
munging_mms = mms_by_drug_time_df.pivot_table(
    values="Mean Metastatic Sites",
    index="Timepoint",
    columns="Drug",
)
munging_mms.head()

In [None]:
# Task 3. Survival Rates
# Task 3.1 Preview DataFrame
# Count of distinct Mouse IDs per (Drug, Timepoint) group, built in an earlier cell
nmice_by_drug_time_df

In [None]:
# This step is not needed for the Survival activity

# Convert to DataFrames

# Minor Data Munging to Re-Format the Data Frames
# Preview that Reformatting worked

# Preview DataFrame

In [None]:
# Generate the Plot (with Error Bars)

# Save the Figure

# Show the Figure
#plt.show()

In [None]:
# Make a variable called "drug_name" and store a "drug" in it
#drug_name = "SEA00001"
# (Capomulin, Ceftamin, Infubinol, Ketapril, Naftisol, Placebo, Propriva, Ramicane, Stelasyn, Zoniferol)

# Collect the trips of the 'bikeid' above
#just_capo = sum_it_up.loc[bike_id]

# Place the gender keys for that single bike into a list
#gender_list = just_capo.keys()

In [None]:
# Task 1. Trying different options

# Set line
#(slope, intercept, _, _, _) = linregress(x_axis, fake)
#fit = slope * x_axis + intercept

# Plot data
#fig, ax = plt.subplots()

#fig.suptitle("Fake Banana Data!", fontsize=16, fontweight="bold")

#ax.set_xlim(0, 10)
#ax.set_ylim(0, 10)

#ax.set_xlabel("Fake Banana Ages (in days)")
#ax.set_ylabel("Fake Banana Weights (in Hundreds of Kilograms)")

#ax.plot(x_axis, fake, linewidth=0, marker='o')
#ax.plot(x_axis, fit, 'b--')

#plt.show()

In [None]:
# Task 1. Creating a scatter plot that shows how the tumor volume changes over time for each treatment.
    # Formatting: You must use proper labeling of your plots, including aspects like: 
    # Formatting: Plot Titles, Axes Labels, Legend Labels, X and Y Axis Limits, etc.
    # Formatting: Your scatter plots must include error bars.
   

In [None]:
# Task 2. Creating a scatter plot that shows how the number of metastatic (cancer spreading) sites 
# changes over time for each treatment.
    # Formatting: You must use proper labeling of your plots, including aspects like: 
    # Formatting: Plot Titles, Axes Labels, Legend Labels, X and Y Axis Limits, etc.
    # Formatting: Your scatter plots must include error bars.

In [None]:
# Task 3. Creating a scatter plot that shows the number of mice still alive 
# through the course of treatment (Survival Rate)
    # Formatting: You must use proper labeling of your plots, including aspects like: 
    # Formatting: Plot Titles, Axes Labels, Legend Labels, X and Y Axis Limits, etc.
    # Formatting: Your scatter plots must include error bars.


In [None]:
# Task 4. Creating a bar graph that compares the total % tumor volume change for each drug across the full 45 days.
    # Formatting: Titles, Axes Labels, Legend Labels, etc.
    # Formatting: Your bar graph should indicate tumor growth as red and tumor reduction as green.
    # Formatting: It should also include a label with the percentage change for each bar. 


In [None]:
# Task 5. Include 3 observations about the results of the study. Use the visualizations you generated 
# from the study data as the basis for your observations.

In [None]:
# Generate groups by treatments 
# (Capomulin, Ceftamin, Infubinol, Ketapril, Naftisol, Placebo, Propriva, Ramicane, Stelasyn, Zoniferol)
 
# Using GroupBy in order to organize the data into fields according to "Drugs/Treatments"
#capo_df = tutti_data.groupby(["Drug"])
# In order to be visualized, a data function must be used
# The numeric variables can be now calculated per drug
#school_average_math = grouped_schools_df["math_score"].mean()
# Average Reading Score
#school_average_reading = grouped_schools_df["reading_score"].mean()
# Looking for the ones that pass math per school
#pass_math_per_school = pass_math_df.groupby(["school_name"])
# In order to get the percentage passing math
#perc_pass_math = (pass_math_per_school["Student ID"].count() / grouped_schools_df["Student ID"].count())*100
# Looking for the ones that pass reading per school
#pass_reading_per_school = pass_reading_df.groupby(["school_name"])
# In order to get the percentage passing reading
#perc_pass_reading = (pass_reading_per_school["Student ID"].count()/grouped_schools_df["Student ID"].count())*100
# Overall Passing Rate (Average of math and reading)
#perc_overall_passing = (perc_pass_math+perc_pass_reading)/2
# Create a school summary for future calculations (tasks 7 and onward), but a formatted version for tasks 2,3 and 4.
#school_summary = pd.DataFrame({"Total Students" : total_students, 
#                               "Total School Budget" : budget_per_school,
#                               "Per Student Budget" : budget_per_student,
#                               "Average Math Score" : school_average_math,
#                               "Average Reading Score" : school_average_reading,
#                               "% Passing Math" : perc_pass_math,
#                               "% Passing Reading" : perc_pass_reading,
#                               "Overall Passing Rate" : perc_overall_passing})
# Now, print to make sure all OK 
#school_summary_f