# Pymaceuticals Inc.
---

### Analysis

- Add your analysis here.
 

In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st
import numpy as np

# Study data files
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results
mouse_metadata = pd.read_csv(mouse_metadata_path)
study_result = pd.read_csv(study_results_path)

# Combine the data into a single DataFrame, joining on the "Mouse ID" column.
# The key is named once (it was previously listed twice); the outer join
# keeps rows from both tables even when an ID appears in only one of them.
study_results_complete = pd.merge(
    study_result, mouse_metadata, how="outer", on="Mouse ID"
)

# Display the data table for preview
study_results_complete.head()

In [None]:
# Number of distinct mouse IDs listed in the metadata table.
unique_mouse_ids = mouse_metadata["Mouse ID"].unique()
mice_count = len(unique_mouse_ids)
mice_count

In [None]:
# Our data should be uniquely identified by Mouse ID and Timepoint.
# keep=False flags every row of a repeated (Mouse ID, Timepoint) pair,
# not only the repeats after the first occurrence.
duplicate_mice = study_results_complete.duplicated(
    subset=['Mouse ID', 'Timepoint'], keep=False
)

# Get the distinct IDs of the mice that own at least one repeated pair.
duplicated_mice_ID = study_results_complete.loc[duplicate_mice, 'Mouse ID'].unique()

# Show the result so the cell reports which mice were flagged.
duplicated_mice_ID


In [None]:
# Optional: Get all the data for the duplicate mouse ID(s).
# isin() selects the rows of every flagged mouse, not just the first one,
# and yields an empty frame (instead of an IndexError) when nothing was flagged.
duplicated_mice_ID_data = study_results_complete.loc[
    study_results_complete['Mouse ID'].isin(duplicated_mice_ID)
]
duplicated_mice_ID_data

In [None]:
# Build the cleaned DataFrame: remove every row that belongs to a
# duplicated mouse, identified by the row labels collected above.
rows_to_remove = duplicated_mice_ID_data.index
clean_df = study_results_complete.drop(index=rows_to_remove)
clean_df.head()

In [None]:
# Number of distinct mouse IDs remaining in the cleaned DataFrame.
remaining_mouse_ids = clean_df['Mouse ID'].unique()
num_mice = len(remaining_mouse_ids)
num_mice

## Summary Statistics

In [None]:
# Summary statistics table of the tumor volume for each drug regimen:
# mean, median, variance, standard deviation, and SEM.

# Group the cleaned rows by regimen and pick the tumor-volume column once,
# so each statistic below is computed from the same grouped selection.
drug_regimen = clean_df.groupby('Drug Regimen')
tumor_by_regimen = drug_regimen['Tumor Volume (mm3)']

mean = tumor_by_regimen.mean()
median = tumor_by_regimen.median()
var = tumor_by_regimen.var()
std = tumor_by_regimen.std()
sem = tumor_by_regimen.sem()

# Assemble the per-regimen series into one DataFrame, one column per statistic.
column_titles = ['Mean', 'Median', 'Variance', 'Standard deviation', 'SEM']
column_values = [mean, median, var, std, sem]
summary_statictics_df = pd.DataFrame(dict(zip(column_titles, column_values)))

summary_statictics_df

In [None]:
# Alternative summary table built with a single aggregation call
# (only one of the two methods is required in the solution).

# Maps each aggregation name to the column title shown in the final table.
stat_titles = {
    'mean': 'Mean',
    'median': 'Median',
    'var': 'Variance',
    'std': 'Standard deviation',
    'sem': 'SEM',
}

# Compute all statistics at once, then swap in the display titles.
summary_table = drug_regimen['Tumor Volume (mm3)'].agg(list(stat_titles))
summary_agg_df = summary_table.rename(columns=stat_titles)

summary_agg_df

## Bar and Pie Charts

In [None]:
# Count the non-null 'Mouse ID' entries per regimen, largest count first.
rows_per_regimen = drug_regimen['Mouse ID'].count()
regimen_counts = rows_per_regimen.sort_values(ascending=False)
regimen_counts

In [None]:
# Generate a bar plot showing the total number of rows (Mouse ID/Timepoints) for each drug regimen using Pandas.
bar_mice_per = regimen_counts.plot(kind='bar', color='skyblue', alpha=0.7, width=0.5)
bar_mice_per.set_xlabel('Drug Regimen')
bar_mice_per.set_ylabel('# of Observed Mouse Timepoints')

# Run a layout pass before saving, as the other plot cells do, so the
# tick labels and axis label are not cut off in the saved image.
bar_mice_per.figure.tight_layout()
bar_mice_per.figure.savefig('images/bar_mice_per.png')
plt.show()

In [None]:
# Same bar chart as above (rows per drug regimen), drawn with pyplot.

# Regimen names label the bars; their positions are 0..n-1.
regimen_name = list(regimen_counts.index.values)
x_axis_regimen = np.arange(len(regimen_counts))

py_bar_mice_per_regimen = plt.bar(
    x_axis_regimen,
    regimen_counts,
    color='skyblue',
    alpha=0.5,
)

# Axis decoration: one vertical tick label per bar, plus axis titles.
plt.xticks(x_axis_regimen, regimen_name, rotation='vertical')
plt.xlabel('Drug Regimen')
plt.ylabel('# of Observed Mouse Timepoints')

# Tighten the layout, write the image to the images folder, then display.
plt.tight_layout()
plt.savefig('images/py_bar_mice_per_regimen.png')
plt.show()


In [None]:
# Tally the rows of the cleaned data by the value in the 'Sex' column.
sex_column = clean_df['Sex']
sex_distribution = sex_column.value_counts()
sex_distribution

In [None]:
# Pie chart of the female/male tally, drawn through the pandas plotting API.
pie_mice_sex_pd = sex_distribution.plot.pie(autopct='%1.1f%%', startangle=180)

# Equal axis scaling keeps the pie circular.
pie_mice_sex_pd.axis("equal")

# Label the chart.
pie_mice_sex_pd.set_ylabel('Sex')

# Tighten the layout and write the pie chart to the images folder.
pie_mice_sex_pd.figure.tight_layout()
pie_mice_sex_pd.figure.savefig('images/pd_pie_mice_sex.png')

# Display the pie chart.
plt.show()



In [None]:
# Category labels of the sex tally, in the same order as its counts.
sex_index = sex_distribution.index

In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pyplot

# Take the wedge labels and sizes from the tally itself, so each label always
# matches its wedge whatever order value_counts() produced, and the numbers
# track the data instead of a hand-copied snapshot.
sex = list(sex_distribution.index)
members = list(sex_distribution.values)
colors = ["orange", "blue"]

py_pie_mice = plt.pie(members, labels=sex, autopct="%1.1f%%", colors=colors, startangle=180)

# Equal axis scaling keeps the pie circular.
plt.axis("equal")

plt.ylabel('Sex')

plt.tight_layout()

plt.savefig('images/py_pie_mice.png')

plt.show()

plt.close()

## Quartiles, Outliers and Boxplots

In [None]:
# Calculate the final tumor volume of each mouse. The four regimens of
# interest (Capomulin, Ramicane, Infubinol, Ceftamin) are filtered later.

# Rows without a timepoint cannot be a final measurement.
measured = clean_df.dropna(subset=['Timepoint'])

# Row label of the greatest timepoint for each (regimen, mouse) pair.
# Selecting that row explicitly does not depend on the rows happening to be
# in chronological order, and never mixes values from different rows.
# NOTE: assumes the row labels of clean_df are unique.
last_rows = measured.groupby(['Drug Regimen', 'Mouse ID'])['Timepoint'].idxmax()

# Timepoint and tumor volume at that final measurement, keyed by regimen/mouse.
tumor_vol = (
    measured.loc[last_rows]
    .set_index(['Drug Regimen', 'Mouse ID'])[['Timepoint', 'Tumor Volume (mm3)']]
)

# Flatten the keys back into ordinary columns for the cells that follow.
final_tumor_df = tumor_vol.reset_index()
final_tumor_df

In [None]:
# Put treatments into a list for the loop (and later for plot labels)
regimen = ['Capomulin', 'Ramicane', 'Infubinol', 'Ceftamin']

# Final tumor volumes per treatment, in the same order as `regimen` (for plotting)
tumor_vol_data = []

# Calculate the IQR and quantitatively determine if there are any potential outliers.
for drug in regimen:

    # Locate the rows which contain mice on this drug and get the tumor volumes
    tumor_vol = final_tumor_df.loc[final_tumor_df['Drug Regimen'] == drug, 'Tumor Volume (mm3)']
    tumor_vol_data.append(tumor_vol)

    # Bounds sit 1.5 * IQR below the first and above the third quartile
    quartiles = tumor_vol.quantile([0.25, 0.5, 0.75])
    lowerq = quartiles[0.25]
    upperq = quartiles[0.75]
    iqr = upperq - lowerq
    lower_bound = lowerq - (1.5 * iqr)
    upper_bound = upperq + (1.5 * iqr)

    # Values outside the bounds are potential outliers. Report them for each
    # drug; otherwise the result is overwritten on the next iteration.
    outliers = tumor_vol.loc[(tumor_vol < lower_bound) | (tumor_vol > upper_bound)]
    print(f"{drug}'s potential outliers: {outliers}")

In [None]:
# Generate a box plot that shows the distribution of the tumor volume for each treatment group.
fig1, final_tumor_plot = plt.subplots()
final_tumor_plot.boxplot(tumor_vol_data, flierprops={'markerfacecolor': 'r', 'markersize': 10})

# Name the boxes through the axis rather than boxplot's `labels` keyword,
# which newer Matplotlib releases deprecate in favour of `tick_labels`;
# setting the tick labels directly works on old and new versions alike.
final_tumor_plot.set_xticklabels(regimen)

final_tumor_plot.set_ylabel('Final Tumor Volume (mm3)')

plt.tight_layout()
final_tumor_plot.figure.savefig('images/final_tumor_boxplot.png')

plt.show()

## Line and Scatter Plots

In [None]:
# Generate a line plot of tumor volume vs. time point for a single mouse treated with Capomulin
Capomulin_df = clean_df.loc[clean_df['Drug Regimen'] == 'Capomulin']
mouse_id = 'b128'

# Order the mouse's rows by time so the line is drawn chronologically.
mouse_df = Capomulin_df.loc[Capomulin_df['Mouse ID'] == mouse_id].sort_values('Timepoint')

time_vol = plt.plot(mouse_df['Timepoint'], mouse_df['Tumor Volume (mm3)'], marker='*', color='#0B5345')

plt.title(f"Capomulin Treatment of {mouse_id}")
plt.xlabel('Timepoint (Days)')
plt.ylabel('Tumor Volume (mm3)')

plt.tight_layout()
# Name the image after the plotted mouse so the file matches the chart title
# (it was previously saved under a different mouse's ID).
plt.savefig(f'images/{mouse_id}_tumor_plot.png')

plt.show()

In [None]:

# Scatter plot of mouse weight against average tumor volume, using only
# the rows of the Capomulin regimen.
is_capomulin = clean_df["Drug Regimen"] == "Capomulin"
capomulin_data = clean_df[is_capomulin]

# One value per mouse: mean tumor volume and mean recorded weight.
per_mouse = capomulin_data.groupby("Mouse ID")
avg_tumor_volume = per_mouse["Tumor Volume (mm3)"].mean()
mouse_weight = per_mouse["Weight (g)"].mean()

# Weight on the x-axis, average tumor volume on the y-axis.
x_values = mouse_weight
y_values = avg_tumor_volume
plt.scatter(x_values, y_values)

# Axis labels
plt.xlabel("Weight (g)")
plt.ylabel("Average Tumor Volume (mm3)")

plt.savefig('images/scatter_weight_tumor.png')

plt.show()




## Correlation and Regression

In [None]:
# Correlation coefficient and linear regression model for mouse weight
# versus average observed tumor volume across the Capomulin regimen.
from scipy.stats import linregress

# Pearson correlation coefficient is the first element of pearsonr's result.
correlation = st.pearsonr(mouse_weight, avg_tumor_volume)[0]
print(f"The correlation between mouse weight and the average tumor volume is {round(correlation,2)}")

x_values = mouse_weight
y_values = avg_tumor_volume

# Fit the least-squares line and pull the individual results out by name.
fit = linregress(x_values, y_values)
slope = fit.slope
intercept = fit.intercept
rvalue = fit.rvalue
pvalue = fit.pvalue
stderr = fit.stderr

# Predicted tumor volume for each weight, and the equation text for the plot.
regress_values = x_values * slope + intercept
line_eq = f"y = {round(slope, 2)!s}x + {round(intercept, 2)!s}"

# Observations as points, fitted line in red, equation annotated on the chart.
plt.scatter(x_values, y_values)
plt.plot(x_values, regress_values, "r-")
plt.annotate(line_eq, (20, 36), fontsize=15, color="red")

plt.title('Mouse Weight vs. Average Tumor Volume: Capomulin Treatment Group')
plt.xlabel('Weight (g)')
plt.ylabel('Average Tumor Volume (mm3)')

plt.show()

print(line_eq)
