In [None]:
# Load libraries
import pandas as pd  # Used for data manipulation
import numpy as np  # Used for numerical operations
import seaborn as sns  # Used for data visualization
import matplotlib.pyplot as plt  # Used for plotting
from sklearn.model_selection import train_test_split, GridSearchCV  # Used for model selection and hyperparameter tuning
from sklearn.preprocessing import StandardScaler  # Used for feature scaling
from sklearn.neighbors import KNeighborsClassifier  # Used for KNN classification
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, roc_curve  # Used for model evaluation
from sklearn.feature_selection import SelectKBest, f_classif  # Used for feature selection
from statsmodels.stats.outliers_influence import variance_inflation_factor  # Used to check multicollinearity

# Load the medical dataset from a CSV file given by a relative path, so it is
# resolved against the current working directory. Every later cell reads from `medical`.
medical = pd.read_csv('medical_clean.csv')

In [None]:
# Report how many rows are exact duplicates of an earlier row (True) versus not (False)
print(medical.duplicated().value_counts())

# Tally the null entries in each column and print them under a heading
missing_values = medical.isna().sum()
print("Missing Values in Each Column:", missing_values, sep="\n")

In [None]:
# Helper that counts IQR-rule outliers in a single numeric column
def count_outliers(series):
    """Count the values of ``series`` that fall outside the 1.5 * IQR fences.

    Parameters
    ----------
    series : pandas.Series
        Numeric values to inspect.

    Returns
    -------
    The number of entries strictly below ``Q1 - 1.5 * IQR`` or strictly above
    ``Q3 + 1.5 * IQR``. Missing values compare as False on both sides, so they
    are never counted.
    """
    first_quartile, third_quartile = series.quantile(0.25), series.quantile(0.75)
    fence_width = 1.5 * (third_quartile - first_quartile)
    too_low = series < first_quartile - fence_width
    too_high = series > third_quartile + fence_width
    return (too_low | too_high).sum()

# Select numeric columns for outlier detection
numeric_columns = medical.select_dtypes(include=['float64', 'int64'])

# Count outliers for each numeric column.
# The result gets its own name: binding it to `count_outliers` would replace the
# function with a Series, so any later call to count_outliers() would fail.
outlier_counts = numeric_columns.apply(count_outliers)

# Display the count of outliers per column
print(outlier_counts)

In [None]:
# Calculate descriptive statistics and identify outliers for specific columns of interest
columns_of_interest = ['Lat', 'Lng', 'Population', 'Children', 'Income', 'VitD_levels', 'Full_meals_eaten',
                       'vitD_supp', 'Additional_charges', 'Item1', 'Item2', 'Item3', 'Item4',
                       'Item5', 'Item6', 'Item7', 'Item8']

# Report requested columns that are absent and analyse only the ones present.
# Indexing `medical` with a missing label raises a KeyError, so printing a warning
# alone is not enough to keep the cell running.
available_columns = []
for col in columns_of_interest:
    if col in medical.columns:
        available_columns.append(col)
    else:
        print(f"Column '{col}' does not exist in the DataFrame.")

# Calculate descriptive statistics for each available column
# (describe() cannot be called on a frame with no columns, hence the empty fallback)
stats = medical[available_columns].describe() if available_columns else pd.DataFrame()

# For every column: derive the box-plot whiskers from the quartiles, count the values
# beyond them, and draw a box plot only when at least one outlier exists
for col in available_columns:
    q1 = stats.loc['25%', col]
    q3 = stats.loc['75%', col]
    iqr = q3 - q1
    lower_whisker = q1 - 1.5 * iqr
    upper_whisker = q3 + 1.5 * iqr
    outliers = (medical[col] < lower_whisker) | (medical[col] > upper_whisker)
    print(f"\nColumn: {col}")
    print(f"Lower Whisker: {lower_whisker}")
    print(f"Upper Whisker: {upper_whisker}")
    print(f"Count of Outliers: {outliers.sum()}")
    if outliers.any():
        plt.figure(figsize=(15, 5))
        sns.boxplot(x=medical[col])
        plt.title(f'Box Plot - {col} (with outliers)')
        plt.show()


In [None]:
# Target variable: the Anxiety column
Y = medical['Anxiety']

# Feature data: everything except the target and the columns listed below,
# taken as an independent copy so `medical` itself is left untouched
columns_to_drop = [
    "Anxiety", "CaseOrder", "City", "Customer_id", "State",
    "County", "UID", "Interaction", "Job", "TimeZone",
]

X = medical.drop(columns=columns_to_drop).copy()

In [None]:
# Show the column dtypes and the first rows of the feature frame X
for feature_view in (X.dtypes, X.head()):
    print(feature_view)

# Show the dtype and the first rows of the target Y
for target_view in (Y.dtype, Y.head()):
    print(target_view)

In [None]:
# Numeric features (int64 / float64): describe() summary
numerical_columns = X.select_dtypes(include=['int64', 'float64'])
summary_X = numerical_columns.describe()
print("Summary statistics for X (numerical features only):", summary_X, sep="\n")

# Target: frequency of each label
summary_Y = Y.value_counts()
print("\nSummary statistics for Y:", summary_Y, sep="\n")

# Categorical features (object dtype): one frequency table per column, keyed by column name
categorical_columns = X.select_dtypes(include=['object'])
summary_categorical = {
    column: categorical_columns[column].value_counts()
    for column in categorical_columns.columns
}

# Print each column's frequency table under its name
print("\nSummary statistics for X (categorical features only):")
for column in summary_categorical:
    counts = summary_categorical[column]
    print(f"\n{column}:\n{counts}")

In [None]:
# Encode the target labels as integers: 'Yes' -> 1, 'No' -> 0.
# The explicit .astype(int) guarantees an integer dtype: replace() on a string (object)
# column only yields integers through pandas' implicit downcasting, which is deprecated,
# and an object-dtype target is not usable by the estimators fitted later.
# A label other than 'Yes'/'No' is left in place by replace() and then makes
# astype(int) raise, instead of passing through unnoticed.
Y = Y.replace({'Yes': 1, 'No': 0}).astype(int)

In [None]:
# List the distinct values held by every feature column
for column, feature_values in X.items():
    unique_values = feature_values.unique()
    print(f"Unique values in {column}: {unique_values}")

In [None]:
# Print the variable classification used for this analysis.
# NOTE(review): Complication_risk, Overweight and Area are one-hot encoded together with
# the nominal variables in a later cell but are not listed here - confirm where they belong.
print("Nominal Categorical Variables:", "Marital, Gender, ReAdmis, Soft_drink, Initial_admin, HighBlood, Stroke, Arthritis, Diabetes, Hyperlipidemia, BackPain, Allergic_rhinitis, Reflux_esophagitis, Asthma, Services")
print("Numeric Variables:")
print("Discrete:", "Population, Children, Doc_visits, Full_meals_eaten, vitD_supp, Item1, Item2, Item3, Item4, Item5, Item6, Item7, Item8")
# The column is named 'Additional_charges' (as used in columns_of_interest), not 'Additional_charge'
print("Continuous:", "Age, Income, VitD_levels, Initial_days, TotalCharge, Additional_charges")

In [None]:
# One-hot encoding for categorical variables
# Dropping the first category to avoid the dummy variable trap
categorical_cols = ['Marital', 'Gender', 'ReAdmis', 'Soft_drink', 'Initial_admin', 'HighBlood',
                    'Stroke', 'Complication_risk', 'Overweight', 'Arthritis', 'Diabetes',
                    'Hyperlipidemia', 'BackPain', 'Allergic_rhinitis', 'Reflux_esophagitis',
                    'Asthma', 'Services', 'Area']

# dtype=int makes get_dummies emit 0/1 integers directly. This replaces a follow-up
# replace({True: 1, False: 0}) over the whole frame, which depended on pandas'
# deprecated implicit downcasting and also scanned every non-dummy column.
x_encoded = pd.get_dummies(X[categorical_cols], drop_first=True, dtype=int)

# Swap the original categorical columns for their one-hot encoded counterparts
X_final = pd.concat([X.drop(columns=categorical_cols), x_encoded], axis=1)

In [None]:
# Rebind X to the encoded frame. No copy is made: X and X_final now refer to the
# same DataFrame object, and the cells below work on X.
X = X_final

In [None]:
# Re-check the distinct values of every column of X
for column in X:
    unique_values = X[column].unique()
    print(f"Unique values in {column}: {unique_values}")

In [None]:
# Standardize the X variables (explanatory) using StandardScaler.
# fit_transform returns a bare array, so both the column labels and the row index are
# passed back in. Keeping X's index means the later pd.concat([X, Y], axis=1) aligns
# rows by label even if the index is not a default 0..n-1 range.
# NOTE(review): the scaler is fit on all rows, before the train/test split performed
# further down, so test-set statistics influence the scaling - consider fitting on
# the training rows only.
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X), columns=X.columns, index=X.index)

In [None]:
# Preview the first rows of the standardized features.
# head must be called: a bare `X.head` only displays the bound method object.
X.head()

In [None]:
# Score every feature against the target with SelectKBest / f_classif.
# k='all' keeps all features, so the selector is used only for its p-values.
feat_select = SelectKBest(score_func=f_classif, k='all')
feat_select.fit(X, Y)

# One row per feature, most significant (smallest p-value) first
feat_pvals = (
    pd.DataFrame({'Feature': X.columns, 'p_value': feat_select.pvalues_})
    .sort_values(by='p_value')
)

# Display only the features whose p-value is below 0.05
feat_pvals.loc[feat_pvals['p_value'] < 0.05]

In [None]:
# Narrow X to the two retained features and check them for multicollinearity
# with the variance inflation factor (VIF)
X = X[["TotalCharge", "Area_Urban"]]

# One VIF per column, computed from the underlying value matrix by column position
feature_matrix = X.values
vif_df = pd.DataFrame({
    "feature": X.columns,
    "VIF": [variance_inflation_factor(feature_matrix, position)
            for position in range(X.shape[1])],
})

print(vif_df)

In [None]:
# Export the prepared data set: the retained features side by side with the target
data_combined = pd.concat((X, Y), axis="columns")

data_combined.to_csv('D209_part1_clean.csv')

In [None]:
# Hold out 20% of the rows for testing. Stratifying on Y keeps the class proportions
# the same in both splits, and the fixed random_state makes the split repeatable.
X_train, X_test, Y_train, Y_test = train_test_split(
    X,
    Y,
    test_size=0.2,
    random_state=42,
    stratify=Y,
)


In [None]:
# Write each training / testing split to its own CSV file
split_files = {
    'X_train_task1.csv': X_train,
    'X_test_task1.csv': X_test,
    'Y_train_task1.csv': Y_train,
    'Y_test_task1.csv': Y_test,
}
for csv_name, split in split_files.items():
    pd.DataFrame(split).to_csv(csv_name)

In [None]:
# Find the neighbour count (1 through 49) with the best 5-fold cross-validated score
param_grid = {'n_neighbors': np.arange(1, 50)}
knn = KNeighborsClassifier()

# Exhaustive search over the grid, fitted on the training split only
knn_cv = GridSearchCV(estimator=knn, param_grid=param_grid, cv=5)
knn_cv.fit(X_train, Y_train)

# Parameter setting that achieved the best cross-validated score
knn_cv.best_params_

In [None]:
# Display the mean cross-validated score achieved by the best parameter setting
# found by the grid search above
knn_cv.best_score_

In [None]:
# Perform KNN using the value found from grid search.
# The neighbour count is read from knn_cv.best_params_ rather than typed in by hand,
# so the final model always matches the search result, even when the data, the
# split or the parameter grid change.
knn = KNeighborsClassifier(n_neighbors=int(knn_cv.best_params_['n_neighbors']))

In [None]:
# Fit the final KNN classifier on the training split only; the test split stays
# unseen until the prediction and evaluation cells below
knn.fit(X_train, Y_train)

In [None]:
# Predict class labels for the training split first, then for the testing split
Y_pred_train, Y_pred_test = (knn.predict(features) for features in (X_train, X_test))

In [None]:
# Evaluate the model: accuracy on the training split and on the testing split
train_accuracy, test_accuracy = knn.score(X_train, Y_train), knn.score(X_test, Y_test)

print("Train Accuracy:", train_accuracy)
print("Test Accuracy:", test_accuracy)

In [None]:
# Confusion matrix on the test split (rows: actual class, columns: predicted class),
# printed together with the training and testing accuracies
final_matrix = confusion_matrix(Y_test, Y_pred_test)

report_lines = [
    "The confusion matrix for this KNN model:",
    "Predicted No Anxiety | Predicted Anxiety",
    f"                 {final_matrix[0]} Actual No Anxiety",
    f"                 {final_matrix[1]} Actual Anxiety",
    f"The training accuracy of this KNN classification is {train_accuracy}.",
    f"The testing accuracy of this KNN classification model is {test_accuracy}.",
]
print("\n".join(report_lines))

In [None]:
# Draw the confusion matrix as an annotated heatmap so the counts are easier to read
fig, ax = plt.subplots(figsize=(10, 7))
sns.heatmap(
    final_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=['Predicted No Anxiety', 'Predicted Anxiety'],
    yticklabels=['Actual No Anxiety', 'Actual Anxiety'],
    ax=ax,
)
ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix Heatmap for KNN Model')
plt.show()

In [None]:
# ROC curve, AUC score and classification report for the test split.
# The second predict_proba column is used as the score passed to roc_curve / roc_auc_score.
Y_pred_prob = knn.predict_proba(X_test)[:, 1]
fpr, tpr, thresholds = roc_curve(Y_test, Y_pred_prob)

fig, ax = plt.subplots()
ax.plot([0, 1], [0, 1], 'k--')  # dashed diagonal as a reference line
ax.plot(fpr, tpr)
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('ROC Curve for KNN Classification')
plt.show()

print(f"The Area Under the Curve (AUC) score is: {roc_auc_score(Y_test, Y_pred_prob)}\n")
print(classification_report(Y_test, Y_pred_test))