In [None]:
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

In [None]:
combined_df = pd.merge(mouse_metadata, study_results, how="inner", on="Mouse ID")
combined_df

In [None]:
    Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0 k403       Ramicane     Male 21        16         0         45.000000          0
1 k403       Ramicane     Male 21        16         5         38.825898          0
2 k403       Ramicane     Male 21        16         10        35.014271          1
3 k403       Ramicane     Male 21        16         15        34.223992          1
4 k403       Ramicane     Male 21        16         20        32.997729          1
1888 z969    Naftisol     Male 9         30         25        63.145652          2
1889 z969    Naftisol     Male 9         30         30        65.841013          3
1890 z969    Naftisol     Male 9         30         35        69.176246          4
1891 z969    Naftisol     Male 9         30         40        70.314904          4
1892 z969    Naftisol     Male 9         30         45        73.867845          4
    


In [None]:
mouse_count = combined_df["Mouse ID"].count()
mouse_count

In [None]:
1893

In [None]:
duplicate_rows = combined_df[combined_df.duplicated(['Mouse ID', 'Timepoint'])]
duplicate_rows

In [None]:
    Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
909  g989     Propriva    Female  21     26          0         45.000000          0
911  g989     Propriva    Female  21     26          5         47.570392          0
913  g989     Propriva    Female  21     26          10        49.880528          0
915  g989     Propriva    Female  21     26          15        53.442020          0
917  g989     Propriva    Female  21     26          20        54.657650          1



In [None]:
all_duplicate_rows = combined_df[combined_df.duplicated(['Mouse ID',])]
all_duplicate_rows 

In [None]:
    Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0 k403       Ramicane     Male 21        16         0         45.000000          0
1 k403       Ramicane     Male 21        16         5         38.825898          0
2 k403       Ramicane     Male 21        16         10        35.014271          1
3 k403       Ramicane     Male 21        16         15        34.223992          1
4 k403       Ramicane     Male 21        16         20        32.997729          1
1888 z969    Naftisol     Male 9         30         25        63.145652          2
1889 z969    Naftisol     Male 9         30         30        65.841013          3
1890 z969    Naftisol     Male 9         30         35        69.176246          4
1891 z969    Naftisol     Male 9         30         40        70.314904          4
1892 z969    Naftisol     Male 9         30         45        73.867845          4
    

In [None]:
clean_df = combined_df.drop_duplicates("Mouse ID")
clean_df

In [None]:
    Mouse ID,Drug Regimen,Sex,Age_months,Weight(g),Timepoint,Tumor Volume (mm3), Metastatic Sites
0     k403    Ramicane    Male     21     16        0          45.0               0
10    s185    Capomulin   Female   3      17        0          45.0               0
20    x401    Capomulin   Female   16     15        0          45.0               0
30    m601    Capomulin   Male     22     17        0          45.0               0
40    g791    Ramicane    Male     11     16        0          45.0               0
1858  z314    Stelasyn    Female   21     28        0          45.0               0
1860  z435    Propriva    Female   12     26        0          45.0               0
1863  z581    Infubinol   Female   24     25        0          45.0               0
1873  z795    Naftisol    Female   13     29        0          45.0               0
1883  z969    Naftisol    Male     9      30        0          45.0               0
    
    

In [None]:
mean = combined_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].mean()
median = combined_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].median()
variance = combined_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].var()
standard_dv = combined_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].std()
sem = combined_df.groupby('Drug Regimen')['Tumor Volume (mm3)'].sem()

summary_df = pd.DataFrame({"Mean": mean, "Median": median, "Variance": variance, "Standard Deviation": standard_dv, "SEM": sem})
summary_df

In [None]:
                Mean,  Median,  Variance,  Standard Deviation,  Sem
Capomulin  40.675741 41.557809 24.947764   4.994774           0.329346
Ceftamin   52.591172 51.776157 39.290177   6.268188           0.469821
Infubinol  52.884795 51.820584 43.128684   6.567243           0.492236
Ketapril   55.235638 53.698743 68.553577   8.279709           0.603860
Naftisol   54.331565 52.509285 66.173479   8.134708           0.596466
Placebo    54.033581 52.288934 61.168083   7.821003           0.581331
Propriva   52.322552 50.854632 42.351070   6.507770           0.512884
Ramicane   40.216745 40.673236 23.486704   4.846308           0.320955
Stelasyn   54.233149 52.431737 59.450562   7.710419           0.573111
Zoniferol  53.236507 51.818479 48.533355   6.966589           0.516398


In [None]:
drug_data = pd.DataFrame(combined_df.groupby(["Drug Regimen"]).count()).reset_index()

drugs_df = drug_data[["Drug Regimen", "Mouse ID"]]
drugs_df = drugs_df.set_index("Drug Regimen")

In [None]:
drugs_df.plot(kind="bar", figsize=(10,3))

plt.title("Drug Treatment Count")
plt.show()
plt.tight_layout()

In [None]:
drug_list = summary_df.index.tolist()
drug_list

In [None]:
 ['Capomulin',
 'Ceftamin',
 'Infubinol',
 'Ketapril',
 'Naftisol',
 'Placebo',
 'Propriva',
 'Ramicane',
 'Stelasyn',
 'Zoniferol']

In [None]:
drug_count = (combined_df.groupby(["Drug Regimen"])["Age_months"].count()).tolist()
drug_count 

In [None]:
[230, 178, 178, 188, 186, 181, 161, 228, 181, 182]

In [None]:
x_axis = np.arange(len(drug_count)) 

In [None]:
x_axis = drug_list

plt.figure(figsize=(11,4))
plt.bar(x_axis, drug_count, color='b', alpha=0.5, align="center")

plt.title("Drug Treatment Count")
plt.xlabel("Drug Regimen")
plt.ylabel("Count")

In [None]:
Text(0, 0.5, 'Count')

In [None]:
plt.clf()
plt.cla()
plt.close()

In [None]:
gender_df = pd.DataFrame(combined_df.groupby(["Sex"]).count()).reset_index()
gender_df.head()

In [None]:
Sex,   Mouse ID, Drug Regimen, Age_months, Weight(g), Timepoint, Tumore volume (mm3), Metastatic Sites
Female      935  935           935         935        935        935                  935
Male        958  958           958         958        958        958                  958                   


In [None]:
gender_df = gender_df[["Sex","Mouse ID"]]
gender_df.head()

In [None]:
 Sex,   Mouse ID
Female  935
Male    958


In [None]:
plt.figure(figsize=(12,6))
ax1 = plt.subplot(121, aspect="equal")
gender_df.plot(kind="pie", y = "Mouse ID", ax=ax1, autopct='%1.1f%%',
              startangle=190, shadow=True, labels=gender_df["Sex"], legend = False, fontsize=14)

plt.title("Male & Female Mice Percentage")
plt.xlabel("")
plt.ylabel("")

In [None]:
Text(0, 0.5, '')

In [None]:
Male = 50.6%
Female = 49.4%

In [None]:
plt.clf()
plt.cla()
plt.close()

In [None]:
gender_count = (combined_df.groupby(["Sex"])["Age_months"].count()).tolist()
gender_count

In [None]:
[935, 958]

In [None]:
plt.clf()
plt.cla()
plt.close()

In [None]:
combined_df.head() 

In [None]:
Mouse ID,Drug Regimen,Sex,Age_months,Weight (g),Timepoint,Tumor Volume (mm3),Metastatic Sites
0 k403       Ramicane     Male 21        16         0         45.000000          0
1 k403       Ramicane     Male 21        16         5         38.825898          0
2 k403       Ramicane     Male 21        16         10        35.014271          1
3 k403       Ramicane     Male 21        16         15        34.223992          1
4 k403       Ramicane     Male 21        16         20        32.997729          1

In [None]:
sorted_df = combined_df.sort_values(["Drug Regimen", "Mouse ID", "Timepoint"], ascending=True)
last_df = sorted_df.loc[sorted_df["Timepoint"] == 45]
last_df.head().reset_index()

In [None]:
Index,  Mouse ID,  Drug Regimen , Sex,  Age_months,  Weight(g), Timepoint,  Tumor Volume(mm3),  Metastatic Sites
309     b128       Capomulin     Female   9           22         45          38.982878           2
299     b742       Capomulin     Male     7           21         45          38.939633           0
244     g288       Capomulin     Male     3           19         45          37.074024           1
360     g316       Capomulin     Female   22          22         45          40.159220           2
440     i557       Capomulin     Female   1           24         45          47.685963           1



In [None]:
capo_df = last_df[last_df["Drug Regimen"].isin(["Capomulin"])]
capo_df.head().reset_index()

In [None]:
Index,  Mouse ID,  Drug Regimen , Sex,  Age_months,  Weight(g), Timepoint,  Tumor Volume(mm3),  Metastatic Sites
309     b128       Capomulin     Female   9           22         45          38.982878           2
299     b742       Capomulin     Male     7           21         45          38.939633           0
244     g288       Capomulin     Male     3           19         45          37.074024           1
360     g316       Capomulin     Female   22          22         45          40.159220           2
440     i557       Capomulin     Female   1           24         45          47.685963           1


In [None]:
capo_obj = capo_df.sort_values(["Tumor Volume (mm3)"], ascending=True).reset_index()
capo_obj = capo_obj["Tumor Volume (mm3)"]
capo_obj

In [None]:
0     23.343598
1     28.430964
2     28.484033
3     31.023923
4     31.896238
5     32.377357
6     33.329098
7     34.455298
8     36.041047
9     37.074024
10    37.311846
11    38.125164
12    38.846876
13    38.939633
14    38.982878
15    40.159220
16    40.658124
17    40.728578
18    41.483008
19    41.581521
20    47.685963
Name: Tumor Volume (mm3), dtype: float64

In [None]:
quartiles = capo_obj.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

print(f"The lower quartile of temperatures is: {lowerq}")
print(f"The upper quartile of temperatures is: {upperq}")
print(f"The interquartile range of temperatures is: {iqr}")
print(f"The median of temperatures is: {quartiles[0.5]}")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.")

In [None]:
The lower quartile of temperatures is: 32.37735684
The upper quartile of temperatures is: 40.1592203
The interquartile range of temperatures is: 7.781863460000004
The median of temperatures is: 37.31184577
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.

In [None]:
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume in Capomulin Regimen")
ax1.set_ylabel("Final Tumor Volume (mm3)")
ax1.boxplot(capo_obj)
plt.show()

In [None]:
ram_df = last_df[last_df["Drug Regimen"].isin(["Ramicane"])]
ram_df.head().reset_index()

In [None]:
Index,  Mouse ID,  Drug Regimen , Sex,  Age_months,  Weight(g), Timepoint,  Tumor Volume(mm3),  Metastatic Sites
327     a411       Ramicane      Male     3           22         45          38.407618           1
430     a444       Ramicane      Female   10          25         45          43.047543           0
214     a520       Ramicane      Male     13          21         45          38.810366           1
174     a644       Ramicane      Female   7           17         45          32.978522           1
154     c758       Ramicane      Male     9           17         45          33.397653           1

In [None]:
ram_obj = ram_df.sort_values(["Tumor Volume (mm3)"], ascending=True).reset_index()
ram_obj = ram_obj["Tumor Volume (mm3)"]
ram_obj 

In [None]:
0     22.050126
1     29.128472
2     30.276232
3     30.564625
4     30.638696
5     31.095335
6     31.560470
7     32.978522
8     33.397653
9     33.562402
10    36.134852
11    36.374510
12    37.225650
13    37.311236
14    38.407618
15    38.810366
16    40.659006
17    40.667713
18    43.047543
19    45.220869
Name: Tumor Volume (mm3), dtype: float64

In [None]:
quartiles = capo_obj.quantile([.25,.5,.75])
lowerq = quartiles[0.25]
upperq = quartiles[0.75]
iqr = upperq - lowerq

print(f"The lower quartile of temperatures is: {lowerq}")
print(f"The upper quartile of temperatures is: {upperq}")
print(f"The interquartile range of temperatures is: {iqr}")
print(f"The median of temperatures is: {quartiles[0.5]}")

lower_bound = lowerq - (1.5*iqr)
upper_bound = upperq + (1.5*iqr)
print(f"Values below {lower_bound} could be outliers.")
print(f"Values above {upper_bound} could be outliers.") 

In [None]:
The lower quartile of temperatures is: 32.37735684
The upper quartile of temperatures is: 40.1592203
The interquartile range of temperatures is: 7.781863460000004
The median of temperatures is: 37.31184577
Values below 20.70456164999999 could be outliers.
Values above 51.83201549 could be outliers.


In [None]:
fig1, ax1 = plt.subplots()
ax1.set_title("Final Tumor Volume in Ramicane Regimen")
ax1.set_ylabel("Final Tumor Volume (mm3)")
ax1.boxplot(capo_obj)
plt.show() 

In [None]:
infu_df = last_df[last_df["Drug Regimen"].isin(["Infubinol"])]
infu_df.head().reset_index()

In [None]:
Index,  Mouse ID,  Drug Regimen , Sex,  Age_months,  Weight(g), Timepoint,  Tumor Volume(mm3),  Metastatic Sites
463     a203       Infubinol      Female   20          23         45          67.973419           2
473     a251       Infubinol      Female   21          25         45          65.525743           1
540     a685       Infubinol      Male     8           30         45          66.083066           3
637     c139       Infubinol      Male     11          28         45          72.226731           2
800     e476       Infubinol      Male     23          26         45          62.435404           1