# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [1]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import warnings
warnings.filterwarnings('ignore')

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_results = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame
mouse_data = pd.merge(study_results, mouse_metadata, how="left", on="Mouse ID")

# Display the data table for preview
mouse_data

Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [2]:
# Checking the number of mice.
unique_mice = mouse_data["Mouse ID"].unique()
mouse_count = len(unique_mice)
mouse_count

249

In [3]:
# Our data should be uniquely identified by Mouse ID and Timepoint
# Get the duplicate mice by ID number that shows up for Mouse ID and Timepoint.

#array(['g989'], dtype=object)
#dupe_mice = mouse_data_df.duplicated(subset=["Mouse ID","Timepoint"], keep='first')

dupicated_rows_df = mouse_data.loc[mouse_data.duplicated(subset=["Mouse ID","Timepoint"], keep='first')]
dupe_mice = dupicated_rows_df["Mouse ID"].unique()
dupe_mice

array(['g989'], dtype=object)

In [4]:
# Optional: Get all the data for all duplicated mouse ID.
#Get a series of booleans that is True for all rows the pertain to a mouse in dupe_mice
#dupe_mice_row_series = mouse_data["Mouse ID"] == "g989"

dupe_mice_row_series = mouse_data["Mouse ID"].isin(dupe_mice)
print(type(dupe_mice_row_series))
mouse_data.loc[dupe_mice_row_series]



<class 'pandas.core.series.Series'>


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
107,g989,0,45.0,0,Propriva,Female,21,26
137,g989,0,45.0,0,Propriva,Female,21,26
329,g989,5,48.786801,0,Propriva,Female,21,26
360,g989,5,47.570392,0,Propriva,Female,21,26
620,g989,10,51.745156,0,Propriva,Female,21,26
681,g989,10,49.880528,0,Propriva,Female,21,26
815,g989,15,51.325852,1,Propriva,Female,21,26
869,g989,15,53.44202,0,Propriva,Female,21,26
950,g989,20,55.326122,1,Propriva,Female,21,26
1111,g989,20,54.65765,1,Propriva,Female,21,26


In [5]:
# Create a clean DataFrame by dropping the duplicate mouse by its ID.
mouse_data_df = mouse_data.loc[~dupe_mice_row_series]
mouse_data_df


Unnamed: 0,Mouse ID,Timepoint,Tumor Volume (mm3),Metastatic Sites,Drug Regimen,Sex,Age_months,Weight (g)
0,b128,0,45.000000,0,Capomulin,Female,9,22
1,f932,0,45.000000,0,Ketapril,Male,15,29
2,g107,0,45.000000,0,Ketapril,Female,2,29
3,a457,0,45.000000,0,Ketapril,Female,11,30
4,c819,0,45.000000,0,Ketapril,Male,21,25
...,...,...,...,...,...,...,...,...
1888,r944,45,41.581521,2,Capomulin,Male,12,25
1889,u364,45,31.023923,3,Capomulin,Male,18,17
1890,p438,45,61.433892,1,Ceftamin,Female,11,26
1891,x773,45,58.634971,4,Placebo,Female,21,30


In [7]:
# Checking the number of mice in the clean DataFrame.
unique_mice1 = mouse_data_df["Mouse ID"].unique()
new_mouse_count = len(unique_mice1)
new_mouse_count

248

## Summary Statistics

In [9]:
# Generate a summary statistics table of mean, median, variance, standard deviation, and SEM of the tumor volume for each regimen
# Use groupby and summary statistical methods to calculate the following properties of each drug regimen: 
# mean, median, variance, standard deviation, and SEM of the tumor volume. 
# Assemble the resulting series into a single summary DataFrame.

mouse_data_summary = mouse_data_df.groupby(["Drug Regimen"])

mean = mouse_data_df.mean()["Tumor Volume (mm3)"]
median = mouse_data_summary.median()["Tumor Volume (mm3)"]
variance = mouse_data_summary.var()["Tumor Volume (mm3)"]
standard_dev = mouse_data_summary.std()["Tumor Volume (mm3)"]
sem = mouse_data_summary.sem()["Tumor Volume (mm3)"]

mouse_data_summary1 = pd.merge(mean, median, how="left", on=["Drug Regimen"])
mouse_data_summary2 = pd.merge(variance, standard_dev, how="left", on=["Drug Regimen"])
mouse_data_summary3 = pd.merge(mouse_data_summary2,sem, how="left", on=["Drug Regimen"])
mouse_data_summary4 = pd.merge(mouse_data_summary1,mouse_data_summary2, how="left", on=["Drug Regimen"])
mouse_data_summary5 = pd.merge(mouse_data_summary4,mouse_data_summary3,how="left", on=["Drug Regimen"])

#school_data_complete = pd.merge(student_data, school_data, how="left", on=["school_name", "school_name"])
#"Drug Regimen" Tumor Volume (mm3)

#per_school_budget = school_data_complete.groupby(["school_name"]).mean()["budget"]
#mouse_data_df2=mouse_data_df.copy

#mouse_data_df.groupby(["Drug Regimen"])
#mouse_data_df = pd.DataFrame({mouse_data_df.mean()["Tumor Volume (mm3)"],
        #mouse_data_df.median()["Tumor Volume (mm3)"],
        #mouse_data_df.var()["Tumor Volume (mm3)"],
        #mouse_data_df.std()["Tumor Volume (mm3)"],
        #mouse_data_df.sem()["Tumor Volume (mm3)"]})
#mouse_data_df
#mouse_data_summary_df
mouse_data_summary5

TypeError: Can only merge Series or DataFrame objects, a <class 'numpy.float64'> was passed

In [None]:
# A more advanced method to generate a summary statistics table of mean, median, variance, standard deviation,
# and SEM of the tumor volume for each regimen (only one method is required in the solution)

# Using the aggregation method, produce the same summary statistics in a single line


## Bar and Pie Charts

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
y_axis = np.arange(len(x_axis))
x_axis = "Drug Regimen"

tick_locations = [value for value in x_axis]
plt.xticks = (tick_locations, x_axis)

plt.xlim(-0.75, len(x_axis)-0.25)
plt.ylim(0, max(y_axis)+50)

plt.xlabel("Drug Regimen")
plt.ylabel("# of Observed Mouse Timepoints")
plt.bar(x_axis, y_axis, colour=blue, align=center)

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using pyplot.
py.plot.bar

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using Pandas
mouse_data_pie = mouse_data_df.groupby(["Sex"])



# Labels for the sections of our pie chart
labels = ["Male","Female"]

# The values of each section of the pie chart
values = 
# The colors of each section of the pie chart
colours = ["orange","blue"]
           
#pie chart with equal axes
plt.axis("equal")

plt.pie(values, labels=labels colors=colors, autopct="%1.1f%%", startangle=140)
plt.show()   
           

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot


## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse across four of the treatment regimens:  
# Capomulin, Ramicane, Infubinol, and Ceftamin

# Start by getting the last (greatest) timepoint for each mouse


# Merge this group df with the original DataFrame to get the tumor volume at the last timepoint


In [None]:
# Put Top treatments into a list for for loop (and later for plot labels)


# Create empty list to fill with tumor vol data (for plotting)

#For each of the top treatments

    # Locate the rows which contain mice on this drug and get the tumor volumes
    
    # add this subset series to the plotting list
    
    # Calculate the IQR for this drug 
    
    #and quantitatively determine if there are any potential outliers. 
     
   

    # Determine outliers using upper and lower bounds


In [None]:
# Generate a box plot that shows the distrubution of the tumor volume for each treatment group.


## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin


In [None]:
# Generate a scatter plot of mouse weight vs. the average observed tumor volume for the entire Capomulin regimen


## Correlation and Regression

In [None]:
# Calculate the correlation coefficient and a linear regression model 
# for mouse weight and average observed tumor volume for the entire Capomulin regimen
