In [1]:
# Dependencies and Setup
%matplotlib inline
import matplotlib.pyplot as plt
from scipy.stats import linregress
import pandas as pd
import numpy as np

# Hide warning messages in notebook.
# NOTE(review): the "ignore" action is installed with no category or module
# filter, so it applies to every warning category (deprecation notices from
# pandas/matplotlib included), not just cosmetic ones.
import warnings
warnings.filterwarnings("ignore")

In [2]:
# Load the two source tables from the local "data" folder.
mouse_data = pd.read_csv("data/mouse_drug_data.csv")
clinical_data = pd.read_csv("data/clinicaltrial_data.csv")

# Combine them on the shared "Mouse ID" key. The outer join keeps mice that
# appear in only one of the two files.
tutti_data = mouse_data.merge(clinical_data, on="Mouse ID", how="outer")

# Show the first rows as a quick sanity check of the merge
tutti_data.head()

Unnamed: 0,Mouse ID,Drug,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,f234,Stelasyn,0,45.0,0
1,f234,Stelasyn,5,47.313491,0
2,f234,Stelasyn,10,47.904324,0
3,f234,Stelasyn,15,48.735197,1
4,f234,Stelasyn,20,51.112713,2


In [3]:
# General exploration of the data: index range, per-column dtypes and
# non-null counts of the merged frame
tutti_data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1906 entries, 0 to 1905
Data columns (total 5 columns):
Mouse ID              1906 non-null object
Drug                  1906 non-null object
Timepoint             1906 non-null int64
Tumor Volume (mm3)    1906 non-null float64
Metastatic Sites      1906 non-null int64
dtypes: float64(1), int64(2), object(2)
memory usage: 89.3+ KB


In [4]:
# General exploration of the data: summary statistics (count, mean, std,
# min, quartiles, max) for the numeric columns of the merged frame
tutti_data.describe()

Unnamed: 0,Timepoint,Tumor Volume (mm3),Metastatic Sites
count,1906.0,1906.0,1906.0
mean,19.5383,50.46812,1.018363
std,14.064786,8.869571,1.135991
min,0.0,22.050126,0.0
25%,5.0,45.0,0.0
50%,20.0,48.991921,1.0
75%,30.0,56.288484,2.0
max,45.0,78.567014,4.0


In [5]:
# Second version, with Timepoint as "object" instead of integer.
# Derived from the already merged `tutti_data` rather than re-reading and
# re-merging both CSV files, so the two versions cannot drift apart and the
# files are only read once. `assign` returns a new frame, which leaves
# `tutti_data` (and its integer Timepoint column) untouched and makes this
# cell safe to re-run.
tutti_data_v2 = tutti_data.assign(
    Timepoint=lambda frame: frame["Timepoint"].astype("object")
)

# Show the first rows of the second version
tutti_data_v2.head()

Unnamed: 0,Mouse ID,Drug,Timepoint,Tumor Volume (mm3),Metastatic Sites
0,f234,Stelasyn,0,45.0,0
1,f234,Stelasyn,5,47.313491,0
2,f234,Stelasyn,10,47.904324,0
3,f234,Stelasyn,15,48.735197,1
4,f234,Stelasyn,20,51.112713,2


In [6]:
# General exploration of the second version: index range, per-column dtypes
# and non-null counts (this is where the dtype of Timepoint can be checked)
tutti_data_v2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1906 entries, 0 to 1905
Data columns (total 5 columns):
Mouse ID              1906 non-null object
Drug                  1906 non-null object
Timepoint             1906 non-null object
Tumor Volume (mm3)    1906 non-null float64
Metastatic Sites      1906 non-null int64
dtypes: float64(1), int64(1), object(3)
memory usage: 89.3+ KB


In [7]:
# Build one grouping keyed by (Drug, Timepoint) and reuse it for every statistic
g_by_d_and_t = tutti_data_v2.groupby(["Drug", "Timepoint"])

# Mean tumor volume per group, as a Series and as a one-column DataFrame
mtv = g_by_d_and_t["Tumor Volume (mm3)"].mean()
mtv_by_drug_time_df = mtv.to_frame(name="Mean Tumor Volume")

# Mean number of metastatic sites per group
mms = g_by_d_and_t["Metastatic Sites"].mean()
mms_by_drug_time_df = mms.to_frame(name="Mean Metastatic Sites")

# Number of distinct mice recorded in each group
n_mice = g_by_d_and_t["Mouse ID"].nunique()
nmice_by_drug_time_df = n_mice.to_frame(name="Mouse Count")

# Display the mean tumor volume table
mtv_by_drug_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean Tumor Volume
Drug,Timepoint,Unnamed: 2_level_1
Capomulin,0,45.000000
Capomulin,5,44.266086
Capomulin,10,43.084291
Capomulin,15,42.064317
Capomulin,20,40.716325
...,...,...
Zoniferol,25,55.432935
Zoniferol,30,57.713531
Zoniferol,35,60.089372
Zoniferol,40,62.916692


In [8]:
# Preview DataFrame: mean metastatic sites, indexed by (Drug, Timepoint)
mms_by_drug_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Mean Metastatic Sites
Drug,Timepoint,Unnamed: 2_level_1
Capomulin,0,0.000000
Capomulin,5,0.160000
Capomulin,10,0.320000
Capomulin,15,0.375000
Capomulin,20,0.652174
...,...,...
Zoniferol,25,1.687500
Zoniferol,30,1.933333
Zoniferol,35,2.285714
Zoniferol,40,2.785714


In [9]:
# Preview DataFrame: count of distinct mice, indexed by (Drug, Timepoint)
nmice_by_drug_time_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Mouse Count
Drug,Timepoint,Unnamed: 2_level_1
Capomulin,0,25
Capomulin,5,25
Capomulin,10,25
Capomulin,15,24
Capomulin,20,23
...,...,...
Zoniferol,25,16
Zoniferol,30,15
Zoniferol,35,14
Zoniferol,40,14


In [None]:


# Convert to DataFrame

# Preview DataFrame

In [None]:
# Generate groups by treatments
# (Capomulin, Ceftamin, Infubinol, Ketapril, Naftisol, Placebo, Propriva, Ramicane, Stelasyn, Zoniferol)

# Using GroupBy in order to organize the data according to "Drug".
# NOTE: despite its name, `capo_df` is a GroupBy over every drug in the merged
# data, not a DataFrame restricted to Capomulin; the name is kept so that any
# later cell referring to it keeps working.
capo_df = tutti_data.groupby(["Drug"])

# A GroupBy object shows nothing useful until an aggregation is applied to it,
# so the numeric variables are now calculated per drug.

# Number of distinct mice that received each drug
mice_per_drug = capo_df["Mouse ID"].nunique()

# Average tumor volume over all measurements of each drug
mean_tumor_volume_per_drug = capo_df["Tumor Volume (mm3)"].mean()

# Average number of metastatic sites over all measurements of each drug
mean_metastatic_sites_per_drug = capo_df["Metastatic Sites"].mean()

# Create a per-drug summary table (one row per drug)
drug_summary = pd.DataFrame({"Mouse Count" : mice_per_drug,
                             "Mean Tumor Volume (mm3)" : mean_tumor_volume_per_drug,
                             "Mean Metastatic Sites" : mean_metastatic_sites_per_drug})

# Now, display it to make sure all OK
drug_summary

In [None]:
# Task 1. Trying different options: mean tumor volume over time for each
# treatment, drawn as points with standard-error bars plus a dashed
# least-squares trend line per drug.

# Mean and standard error of the tumor volume for every (Drug, Timepoint)
# pair, reshaped so that rows are timepoints and columns are drugs.
tumor_groups = tutti_data.groupby(["Drug", "Timepoint"])["Tumor Volume (mm3)"]
tumor_mean = tumor_groups.mean().unstack(level="Drug")
tumor_sem = tumor_groups.sem().unstack(level="Drug")

# Plot data
fig, ax = plt.subplots(figsize=(10, 6))

fig.suptitle("Tumor Response to Treatment", fontsize=16, fontweight="bold")

for drug in tumor_mean.columns:
    # Keep only the timepoints that actually have measurements for this drug
    observed = tumor_mean[drug].dropna()
    days = observed.index.values.astype(float)
    errors = tumor_sem[drug].reindex(observed.index).values

    # fmt="o" draws markers only; the trend line is added separately below
    markers = ax.errorbar(days, observed.values, yerr=errors, fmt="o",
                          markersize=4, capsize=3, label=drug)

    # Set line: a straight-line fit needs at least two points
    if len(observed) >= 2:
        slope, intercept = linregress(days, observed.values)[:2]
        fit = slope * days + intercept
        # Reuse the marker colour so each fit is visually tied to its drug
        ax.plot(days, fit, linestyle="--", linewidth=1,
                color=markers[0].get_color())

# Leave a small margin on both sides of the measured time range
ax.set_xlim(-2, tumor_mean.index.max() + 2)

ax.set_xlabel("Time (days)")
ax.set_ylabel("Tumor Volume (mm3)")

ax.legend(title="Drug", loc="best", fontsize="small")
ax.grid(True, alpha=0.3)

plt.show()

In [None]:
# Task 1. Creating a scatter plot that shows how the tumor volume changes over time for each treatment.
    # Formatting: You must use proper labeling of your plots, including aspects like: 
    # Formatting: Plot Titles, Axes Labels, Legend Labels, X and Y Axis Limits, etc.
    # Formatting: Your scatter plots must include error bars.
   

In [None]:
# Task 2. Creating a scatter plot that shows how the number of metastatic (cancer spreading) sites 
# changes over time for each treatment.
    # Formatting: You must use proper labeling of your plots, including aspects like: 
    # Formatting: Plot Titles, Axes Labels, Legend Labels, X and Y Axis Limits, etc.
    # Formatting: Your scatter plots must include error bars.

In [None]:
# Task 3. Creating a scatter plot that shows the number of mice still alive 
# through the course of treatment (Survival Rate)
    # Formatting: You must use proper labeling of your plots, including aspects like: 
    # Formatting: Plot Titles, Axes Labels, Legend Labels, X and Y Axis Limits, etc.
    # Formatting: Your scatter plots must include error bars.


In [None]:
# Task 4. Creating a bar graph that compares the total % tumor volume change for each drug across the full 45 days.
    # Formatting: Titles, Axes Labels, Legend Labels, etc.
    # Formatting: Your bar graph should indicate tumor growth as red and tumor reduction as green.
    # Formatting: It should also include a label with the percentage change for each bar. 


In [None]:
# Task 5. Include 3 observations about the results of the study. Use the visualizations you generated 
# from the study data as the basis for your observations.