### D207 Performance Assessment
##### OEM2 - OEM2 Task 1: EDA - Exploratory Data Analysis
Exploratory Data Analysis - D207
PRFA - OEM2

#### Imports

 * [Pandas](https://pandas.pydata.org/)
 * [matplotlib](https://matplotlib.org/stable/index.html)
 * [sys](https://docs.python.org/3/library/sys.html)
 * [Seaborn](https://seaborn.pydata.org/)

 * Chi-Square
     * [Chi2 Contingency](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2_contingency.html)
     * [Chi2](https://docs.scipy.org/doc/scipy/reference/generated/scipy.stats.chi2.html)

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sb
from scipy.stats import chi2_contingency, chi2

ModuleNotFoundError: No module named 'seaborn'

### Load Medical Data
Loading `medical_clean.csv`
The data for this performance assessment comes pre-cleaned, similar to the output produced in D206.

  * Check the datatypes
  * Check for null, na, and empty values
  * Rename Items1-8

In [None]:
# Read the medical dataset from the CSV file in the working directory.
medical_clean_data = pd.read_csv('./medical_clean.csv')

# Summarise column dtypes and non-null counts.
medical_clean_data.info()

# Per-column count of NA values.
missing_values_sum = medical_clean_data.isna().sum()
print(f'Missing Values Check: {missing_values_sum}')

# Descriptive names for the survey columns Item1..Item8, in order.
survey_labels = [
    'Timely_Admission',
    'Timely_Treatment',
    'Timely_Visits',
    'Reliability',
    'Options',
    'Hours_Of_Treatment',
    'Courteous_Staff',
    'Listening',  # Evidence of active listening from Doctor
]
column_renames = {
    f'Item{number}': label
    for number, label in enumerate(survey_labels, start=1)
}
medical_clean_data = medical_clean_data.rename(columns=column_renames)

### Data Preparation:
#### Convert Variables(Columns) to Categorical DataTypes


In [None]:
category_dtype = 'category'

# Columns that are cast to the pandas category dtype.
categorical_columns = [
    # Identifiers
    'CaseOrder', 'Customer_id', 'Interaction', 'UID',
    # Location
    'Zip', 'Lat', 'Lng', 'City', 'State', 'County', 'Area', 'TimeZone',
    # Demographics
    'Job', 'Marital', 'Gender',
    # Admission and medical history
    'ReAdmis', 'Soft_drink', 'Initial_admin', 'HighBlood', 'Stroke',
    'Complication_risk', 'Overweight', 'Arthritis', 'Diabetes',
    'Hyperlipidemia', 'BackPain', 'Anxiety', 'Allergic_rhinitis',
    'Reflux_esophagitis', 'Asthma', 'Services',
    # Renamed survey items (Item1..Item8)
    'Timely_Admission', 'Timely_Treatment', 'Timely_Visits', 'Reliability',
    'Options', 'Hours_Of_Treatment', 'Courteous_Staff', 'Listening',
]

# Map every listed column to the same target dtype, preserving list order.
convert_to_category = dict.fromkeys(categorical_columns, category_dtype)

medical_clean_data = medical_clean_data.astype(convert_to_category)
medical_clean_data.info()

### Data Preparation:
#### Detecting and Removing Categorical Variables with a High Cardinality (>3 - 5 Levels)
If a categorical column has many levels (unique categorical values), such as `UID`, `Lat`, or `Lng`, it should be removed.

In [None]:
# Restrict the frame to the columns that were cast to the category dtype.
categorical_medical_data = medical_clean_data[list(convert_to_category)]

# Count distinct levels per column; more than 3 is treated as high cardinality.
level_counts = categorical_medical_data.nunique()
high_cardinalities = level_counts[level_counts > 3].index.tolist()
print('Categoricals with high cardinality to be removed.')
print(high_cardinalities)

# Drop the flagged columns; every other column is kept.
medical_data_prepared = medical_clean_data.drop(columns=high_cardinalities)
medical_data_prepared


#### Section B: Data Analysis
###### Run an analysis using chi-square, t-test, or ANOVA

In [None]:
# Contingency table: one row per Initial_admin level, one column per ReAdmis level.
cross_tab = pd.crosstab(
    index=medical_data_prepared['Initial_admin'],
    columns=medical_data_prepared['ReAdmis'],
)
cross_tab

In [None]:
# Cell counts of the contingency table as a plain array
# (rows follow Initial_admin, columns follow ReAdmis).
observed_values = cross_tab.to_numpy()
print('Observed Values: \n', observed_values)

In [None]:
# Run the chi-square test of independence on the observed contingency counts.
test_result = chi2_contingency(observed_values)

# The result unpacks as (statistic, p-value, degrees of freedom, expected counts).
chi_square_statistic, p_value, degree_of_freedom, expected_frequency = test_result

print(f'Degrees Of Freedom = {degree_of_freedom}')
print(f'Expected Frequency = \n{expected_frequency}')

In [None]:
# Confidence level used to look up the critical chi-square value.
probability = .95
critical_value = chi2.ppf(probability, degree_of_freedom)

print(f'Probability = {probability}, Chi-Square Statistic = {chi_square_statistic}, Critical Value = {critical_value}')

# Reject H0 when the test statistic reaches or exceeds the critical value.
reject_null = abs(chi_square_statistic) >= critical_value
verdict = (
    'Reject Null Hypothesis (H0), variables are dependent.'
    if reject_null
    else 'Fail to reject Null Hypothesis (H0), variables are independent.'
)
print(verdict)

In [None]:
# Significance level (alpha) is the complement of the confidence level.
significance_level_alpha = 1.0 - probability #.05

print(f'Significance = {significance_level_alpha}, Probability = {probability}, P-Value = {p_value}')

# Decide on the p-value returned by chi2_contingency, not on the confidence
# level: comparing `probability` (0.95) with alpha (0.05) can never be true,
# so the previous check always reported "fail to reject" whatever the data.
if p_value <= significance_level_alpha:
    print('Reject Null Hypothesis (H0), variables are dependent.')
else:
    print('Fail to reject Null Hypothesis (H0), variables are independent.')


#### Section C: Identify the Distribution
###### Univariate Statistics

In [None]:
# Histograms of continuous (VitD_levels, Income) and categorical
# (ReAdmis, Initial_admin) variables.

# Continuous variables: histogram with a KDE overlay, then a horizontal boxplot.
for column, label in (('VitD_levels', 'Vitamin D'), ('Income', 'Income')):
    sb.histplot(data=medical_data_prepared[column], kde=True)
    plt.title(f'{label} Distribution')
    plt.show()

    sb.boxplot(data=medical_data_prepared[column], orient='h')
    plt.title(f'{label} Outliers')
    plt.show()

# Categorical variables: print the row count per level, then plot the counts.
readmission_levels = medical_data_prepared.groupby(by='ReAdmis').size()
print(readmission_levels)

sb.histplot(data=medical_data_prepared['ReAdmis'])
plt.title('Readmissions Distribution')
plt.show()

initial_admission_levels = medical_data_prepared.groupby(by='Initial_admin').size()
print(initial_admission_levels)

sb.histplot(data=medical_data_prepared['Initial_admin'])
plt.title('Initial Admissions Distribution')
plt.show()

#### Section D: Identify Distributions
###### Bivariate Statistics

In [None]:
# Each entry: (x column, y column, plot title, x-axis label, y-axis label).
scatter_specs = [
    ('VitD_levels', 'ReAdmis',
     'Vitamin D Levels vs Readmission', 'Vitamin D Levels', 'ReAdmissions'),
    ('Income', 'ReAdmis',
     'Income vs Readmission', 'Income', 'Readmission'),
    ('VitD_levels', 'Initial_admin',
     'Vitamin D Levels vs Initial Admission Type', 'Vitamin D Levels', 'Initial Admission Type'),
    ('Income', 'Initial_admin',
     'Income vs Initial Admission Type', 'Income', 'Initial Admission Type'),
]

# Draw one scatter plot per pairing, in the order listed above.
for x_column, y_column, title, x_label, y_label in scatter_specs:
    plt.scatter(medical_data_prepared[x_column], medical_data_prepared[y_column])
    plt.title(title)
    plt.xlabel(x_label)
    plt.ylabel(y_label)
    plt.show()