In [None]:
# Dependencies and Setup
import matplotlib.pyplot as plt
import pandas as pd
import scipy.stats as st

# Study data files (paths are relative to the notebook's working directory)
mouse_metadata_path = "data/Mouse_metadata.csv"
study_results_path = "data/Study_results.csv"

# Read the mouse data and the study results, one DataFrame per CSV file
mouse_metadata, study_results = (
    pd.read_csv(csv_path) for csv_path in (mouse_metadata_path, study_results_path)
)

# Combine the data into a single dataset.
# From each source table keep only the listed columns.
metadata_columns = ["Mouse ID", "Drug Regimen", "Sex", "Age_months", "Weight (g)"]
results_columns = ["Mouse ID", "Timepoint", "Tumor Volume (mm3)", "Metastatic Sites"]
metadata_df = pd.DataFrame(mouse_metadata, columns=metadata_columns)
study_results_df = pd.DataFrame(study_results, columns=results_columns)

# Inner merge on "Mouse ID": only mice that appear in both tables are kept.
merge_df = metadata_df.merge(study_results_df, on="Mouse ID", how="inner")

# Replace every space in the column labels with "_" (e.g. "Mouse ID" -> "Mouse_ID");
# later cells refer to the columns by these underscore names.
merge_df = merge_df.rename(columns=lambda label: label.replace(" ", "_"))

# Display the data table for preview
merge_df

In [None]:
# Size of the merged frame, shown as (rows, columns).
# Author's note from an earlier run: 1893 rows, 8 columns.
n_rows, n_cols = merge_df.shape
(n_rows, n_cols)

In [None]:
# Before beginning the analysis, look for any mouse ID that appears more than once at
# the same timepoint, so that all data for such a mouse can be removed.
# `dup` is a boolean mask over the rows of merge_df: True marks a row that repeats an
# earlier (Mouse_ID, Timepoint) pair; the first occurrence of each pair stays False.
dup = merge_df.duplicated(subset=["Mouse_ID", "Timepoint"], keep="first")  # author's note: 5 duplicates
# Show the Mouse_ID of every flagged row
merge_df["Mouse_ID"].loc[dup]



In [None]:
# Drop every row belonging to a mouse that has more than one record for the same
# timepoint. The offending IDs are derived from the data instead of being hard-coded
# (the original filter named "g989" directly), so the cell keeps working if the input
# files change.
duplicate_mouse_ids = merge_df.loc[
    merge_df.duplicated(subset=["Mouse_ID", "Timepoint"]), "Mouse_ID"
].unique()

# Keep only the rows whose Mouse_ID is NOT one of the duplicated IDs
new_data = merge_df.loc[~merge_df["Mouse_ID"].isin(duplicate_mouse_ids)]


In [None]:
# Bind the filtered data (rows of the duplicated mouse removed) to `clean_data`, so that
# cells referring to the cleaned frame by that name have it available on a fresh
# Restart & Run All. Previously the frame was only displayed here and never assigned.
clean_data = pd.DataFrame(new_data)

# Display the cleaned frame
clean_data

In [None]:
# Show one row per mouse: for each Mouse_ID only its first row in new_data is kept.
# The result is displayed but not stored, so new_data itself is left unchanged.
new_data.drop_duplicates(subset="Mouse_ID", keep="first")

In [None]:
# Count the distinct Mouse_ID values remaining in new_data (missing IDs are not counted).
mouse_ids = new_data["Mouse_ID"]
number_of_mice = mouse_ids.nunique(dropna=True)
number_of_mice

In [None]:
# SUMMARY STATISTICS
# Build a table of mean, median, variance, standard deviation and SEM of the tumor
# volume for each drug regimen.

# Group the cleaned data by regimen and select the tumor-volume column once.
# (Two earlier statements here, Drugs.head() and a "count" aggregation, were removed:
# they sat mid-cell, so their results were neither displayed nor stored.)
Drugs = new_data.groupby("Drug_Regimen")
tumor_volume_by_drug = Drugs["Tumor_Volume_(mm3)"]

# One Series per statistic, each indexed by Drug_Regimen
Mean = tumor_volume_by_drug.mean()
Median = tumor_volume_by_drug.median()
Variance = tumor_volume_by_drug.var()
STD = tumor_volume_by_drug.std()
SEM = tumor_volume_by_drug.sem()

# Put the five Series side by side; the dict keys become the column labels
Summary_stats_table = pd.DataFrame(
    {
        "Mean": Mean,
        "Median": Median,
        "Variance": Variance,
        "STD": STD,
        "SEM": SEM,
    }
)
Summary_stats_table
# This method is the most straightforward: create one Series per statistic and
# combine them all into a single DataFrame at the end.


In [None]:
#BAR AND PIE CHARTS

In [None]:
# Generate a bar plot showing the total count for each treatment throughout the course
# of the study using pandas.
# Reads from new_data, the frame with the duplicated mouse removed. This cell previously
# read `clean_data` and only computed the counts without drawing anything.
# NOTE(review): value_counts() counts rows of new_data per regimen, not unique mice;
# confirm that "Total Mice" is the intended axis label.
count_drugs = new_data["Drug_Regimen"].value_counts()

# Draw the counts with the pandas plotting API, labelled to match the pyplot version
ax = count_drugs.plot(kind="bar", rot=90)
ax.set_xlabel("Drug Regimen")
ax.set_ylabel("Total Mice")
plt.show()


In [None]:
# Bar plot of the per-regimen counts held in count_drugs, drawn with matplotlib
# through an explicit Figure/Axes pair.
fig, ax = plt.subplots()
ax.bar(count_drugs.index.values, count_drugs.values)
ax.set_xlabel('Drug Regimen')
ax.set_ylabel('Total Mice')
# Rotate the regimen names so the tick labels read vertically
for tick_label in ax.get_xticklabels():
    tick_label.set_rotation(90)
plt.show()


In [None]:
# Generate a pie plot showing the distribution of female versus male mice using pandas.
# Reads from new_data, the frame with the duplicated mouse removed. This cell previously
# read `clean_data`.
# NOTE(review): value_counts() counts rows of new_data per sex, so a mouse contributes
# once per row it has in the frame; confirm whether a per-mouse split is wanted instead.
Male_vs_Female = new_data["Sex"].value_counts()

Male_vs_Female.plot(kind="pie")