## Timeline for the project
These dates will also be our next meeting times.
* 4/19-20 try to have dimensionality reduction done.
* 4/26-27 have the best models tuned and selected for the ensemble classification portion to put it together.
* 5/3-4 or earlier we will get together to do the presentation
* 5/13-14 or earlier we will get together to do the final report.


In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
RANDOM_STATE = 42

In [None]:
import kagglehub

# Download latest version
path = kagglehub.dataset_download("adilshamim8/student-depression-dataset")

print("Path to dataset files:", path)

In [None]:
df = pd.read_csv('./data/student_depression_dataset.csv')
df.head(10)

In [None]:
df.count()

In [None]:
df.value_counts('Depression')

In [None]:
16336 / 27901 # class balances

In [None]:
11565 / 27901 # class balances

In [None]:
df.dtypes

In [None]:
sns.boxplot(x=df['CGPA']) # Looking for outliers

In [None]:
# drop id, city, profession, work pressure, job satisfaction
df = df.drop(['id','City', 'Profession', 'Work Pressure', 'Job Satisfaction'], axis=1)
df.head(10)

In [None]:
df = df.rename(columns={"Have you ever had suicidal thoughts ?": "History of suicidal thoughts?"})
df.head(5)

In [None]:
# May need to prep that data now, take the top 3 answers for sleep duration and the rest can be other,
# change degree column to be more general
display(df.value_counts(['Sleep Duration'])) # ask about how many categories we wanted to have for this
display(df.value_counts(['Degree']))

In [None]:
def simplify_degree(degree):
    """Collapse a raw ``Degree`` label into a coarse education level.

    Parameters
    ----------
    degree : str
        Raw value from the ``Degree`` column, e.g. ``"B.Tech"``, ``"MBA"``
        or ``"Class 12"``.

    Returns
    -------
    str
        One of ``"Bachelor's"``, ``"Master's"``, ``"Doctorate"``,
        ``"HighSchool/GED"`` or ``"Other"``.
    """
    # Missing or non-text entries cannot be substring-matched; bucket them
    # as "Other" instead of raising TypeError inside ``.apply``.
    if not isinstance(degree, str):
        return "Other"
    # MBBS is the one label starting with "M" that maps to Bachelor's,
    # so it has to be matched before the master's prefix check below.
    if degree == "MBBS":
        return "Bachelor's"
    if "PhD" in degree:
        return "Doctorate"
    # Labels that start with "M" are master's degrees. This must run before
    # the "B" containment check, otherwise "MBA" is tagged as Bachelor's.
    if degree.startswith("M"):
        return "Master's"
    if "B" in degree:
        return "Bachelor's"
    if "M" in degree:
        return "Master's"
    if "Class" in degree:
        return "HighSchool/GED"
    return "Other"

# Apply the function to the Degree column
df['Degree'] = df['Degree'].apply(simplify_degree)
df.head(10)

In [None]:
## Found three non-numeric values in Financial Stress and need to make ? value an na
print(df['Financial Stress'].value_counts())

df['Financial Stress'] = np.where(df['Financial Stress'] == '?', np.nan, df['Financial Stress'])

df['Financial Stress'] = df['Financial Stress'].astype(float)

In [None]:
## Summing all of the missing values by column
missing_val_cols = df.isnull().sum()

print(missing_val_cols)

In [None]:
## need to one hot encode the categorical columns
X = df.drop(['Depression'], axis=1)
y = df['Depression']
# display(X)
# display(y)

X = pd.get_dummies(X, columns=['Gender','Sleep Duration', 'Dietary Habits', 'Degree', 'History of suicidal thoughts?', 'Family History of Mental Illness'], drop_first=True)
display(X)

In [None]:
## Splitting data into train, test, and validation sets
from sklearn.model_selection import train_test_split

# Three-way (train / validation / test) variant, currently disabled:
# X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, stratify=y)
# X_train, X_val, y_train, y_val = train_test_split(X_train, y_train, test_size=0.2222, stratify=y_train)

# Stratified 80/20 split. Seeded with RANDOM_STATE so the same rows land in
# train and test on every run; without a seed every downstream metric
# changes each time the notebook is re-executed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, stratify=y, random_state=RANDOM_STATE
)

# Report the size of each partition as a share of the full dataset
display(X_train.shape)
X_train_percentage = (len(X_train) / len(df)) * 100
print(f'X Training set: {X_train_percentage:.2f}%')

display(X_test.shape)
X_test_percentage = (len(X_test) / len(df)) * 100
print(f'X Test set: {X_test_percentage:.2f}%')

# display(X_val.shape)
# X_validation_percentage = (len(X_val) / len(df)) * 100
# print(f'X Validation set: {X_validation_percentage:.2f}%')

In [None]:
## Removing missing values from financial stress column

from sklearn.impute import SimpleImputer

## Filling in missing numeric variables

# Setting imputer as mean
imputer = SimpleImputer(missing_values = np.nan, strategy = "mean")

# Imputing numeric columns with mean in train data
X_train['Financial Stress'] = imputer.fit_transform(X_train[['Financial Stress']])

# # Imputing numeric columns with mean in validation data
# X_val['Financial Stress'] = imputer.transform(X_val[['Financial Stress']])

# Imputing numeric columns with mean in test data
X_test['Financial Stress'] = imputer.transform(X_test[['Financial Stress']])


## Verifying train, validation, and test sets have no missing values

display(X_train.isnull().sum())

# display(X_val.isnull().sum())

display(X_test.isnull().sum())


In [None]:
X_train.dtypes

In [None]:
## Scaling all numerical data

# Importing in scaler
from sklearn.preprocessing import StandardScaler

# Importing in scaler
scaler = StandardScaler()

# Scaling training data
X_train[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']]= scaler.fit_transform(X_train[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']])


# Scaling validation data
# X_val[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']] = scaler.transform(X_val[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']])

# Scaling testing data
X_test[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']] = scaler.transform(X_test[['Age', 'Academic Pressure', 'CGPA', 'Study Satisfaction', 'Work/Study Hours', 'Financial Stress']])


In [None]:
## Verifying data has been scaled

display(X_train.head(10))

# display(X_val.head(10))

display(X_test.head(10))

In [None]:
#Use ADASYN and Undersampling
from imblearn.over_sampling import ADASYN
adasyn = ADASYN(random_state = RANDOM_STATE)
X_resampled_adasyn, y_resampled_adasyn = adasyn.fit_resample(X_train, y_train)
print(y_resampled_adasyn)
print(X_resampled_adasyn)

In [None]:
from imblearn.under_sampling import RandomUnderSampler
random_under_sampler = RandomUnderSampler(random_state = RANDOM_STATE)
X_resampled_rus, y_resampled_rus = random_under_sampler.fit_resample(X_train, y_train)
print(y_resampled_rus)
print(X_resampled_rus)

## Models we can use: RandomForestClassifier, SVM, LogisticRegression, NaiveBayes, KNN, Decision Tree
* We each choose 2 models from the list above.
* Perform feature reduction: PCA, LDA, Kernel PCA
* Tune hyperparameters: Using the best version of the model we have
* After finding the best model from tuning, perform k-fold cross-validation
* Select the best model from each of our selected 2, then we can put it together in a voting classifier for the ensemble portion.
## Notes
* Use charts and metrics where appropriate such as f1_score, precision, and accuracy.
* Don't use the validation sets for the feature/dimensionality reduction.


---

# David Braun
* KNN
* SVM

In [None]:
# ---------- extra imports for modelling ----------
from sklearn.preprocessing   import StandardScaler
from sklearn.decomposition   import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.pipeline        import Pipeline
from sklearn.model_selection import GridSearchCV, StratifiedKFold, cross_val_score
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.svm             import SVC
from sklearn.metrics         import f1_score, classification_report, confusion_matrix, RocCurveDisplay
import seaborn as sns, matplotlib.pyplot as plt, pandas as pd, joblib, pathlib

cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

In [None]:
def quick_score(model, Xb, yb, tag, X_eval=None, y_eval=None):
    """Fit ``model`` on ``(Xb, yb)`` and print its F1 score on held-out data.

    Parameters
    ----------
    model
        Unfitted estimator exposing ``fit`` and ``predict``.
    Xb, yb
        Training features and labels (e.g. one of the resampled sets).
    tag : str
        Label printed in front of the score.
    X_eval, y_eval : optional
        Held-out features and labels to score on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used.
    """
    # The split cell only creates train/test partitions (the validation
    # split is commented out), so X_val / y_val do not exist and the
    # original body raised NameError.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test
    model.fit(Xb, yb)
    print(f"{tag}: F1 = {f1_score(y_eval, model.predict(X_eval)):.3f}")

for Xb, yb, tag in [ (X_resampled_adasyn, y_resampled_adasyn, "KNN ADA"),
                     (X_resampled_rus,    y_resampled_rus,    "KNN RUS") ]:
    quick_score(KNeighborsClassifier(n_neighbors=5, weights='distance'), Xb, yb, tag)

for Xb, yb, tag in [ (X_resampled_adasyn, y_resampled_adasyn, "SVM ADA"),
                     (X_resampled_rus,    y_resampled_rus,    "SVM RUS") ]:
    quick_score(SVC(kernel='linear', C=1, random_state=42), Xb, yb, tag)

In [None]:
knn_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('dr',    'passthrough'),          # placeholder
    ('clf',   KNeighborsClassifier())
])

svm_pipe = Pipeline([
    ('scale', StandardScaler()),
    ('dr',    'passthrough'),
    ('clf',   SVC(probability=True, random_state=42))
])

In [None]:
# ───────────────── 0.  nuke any zombie cluster ──────────────────────────
try:
    client.close(); cluster.close()
except NameError:
    pass                   # first run – nothing to clean up

# ───────────────── 1.  dask‑timeout & CPU threads ───────────────────────
import os, dask
dask.config.set({
    "distributed.comm.timeouts.connect": "120s",   # ← was 30 s
    "distributed.comm.timeouts.tcp":    "120s",
})
os.environ["OMP_NUM_THREADS"] = os.environ["MKL_NUM_THREADS"] = "18"

# ───────────────── 2.  start a TCP‑only 2‑GPU cluster ───────────────────
from dask_cuda import LocalCUDACluster
from dask.distributed import Client

cluster = LocalCUDACluster(
    protocol       ="tcp",        # skip UCX (fewer moving parts)
    interface      ="lo",         # loop‑back only
    enable_nvlink  =False,
    CUDA_VISIBLE_DEVICES="0,1",
    threads_per_worker = 6,       # 12 CPU threads total
    memory_limit       ="24GB",   # host RAM / worker
    rmm_pool_size      ="4GB"     # pre‑alloc on each GPU
)
client  = Client(cluster)
client.wait_for_workers(2)        # block until both GPUs are ready
print("Dask dashboard:", client.dashboard_link)

# ───────────────── 3.  cuML‑UMAP wrapper (same as before) ───────────────
import cupy as cp
from cuml.manifold import UMAP as GPUUMAP
from sklearn.base  import BaseEstimator, TransformerMixin
class CumlUMAP(BaseEstimator, TransformerMixin):
    """scikit-learn style transformer that delegates to cuML's GPU UMAP.

    Parameters
    ----------
    n_components : int, default=10
        Passed through to the cuML UMAP constructor.
    n_neighbors : int, default=30
        Passed through to the cuML UMAP constructor.
    random_state : int, default=42
        Passed through to the cuML UMAP constructor.
    """

    def __init__(self, n_components=10, n_neighbors=30, random_state=42):
        self.n_components = n_components
        self.n_neighbors = n_neighbors
        self.random_state = random_state

    def get_params(self, deep=True):
        """Return the constructor hyperparameters only.

        The fitted ``_umap`` object is deliberately excluded: returning it
        (as ``vars(self)`` does after ``fit``) would pass it to the UMAP
        constructor on refit and to ``__init__`` when the estimator is cloned.
        """
        return {
            "n_components": self.n_components,
            "n_neighbors": self.n_neighbors,
            "random_state": self.random_state,
        }

    def set_params(self, **p):
        """Set every supplied parameter and return ``self``."""
        for k, v in p.items():
            setattr(self, k, v)
        # Return after the loop: the original one-liner returned inside the
        # loop body, so only the first parameter was applied and an empty
        # call returned None.
        return self

    def fit(self, X, y=None):
        """Fit the GPU UMAP model on ``X``; ``y`` is accepted but unused."""
        self._umap = GPUUMAP(**self.get_params()).fit(cp.asarray(X))
        return self

    def transform(self, X):
        """Embed ``X`` with the fitted model and copy the result back to host."""
        return self._umap.transform(cp.asarray(X)).get()

# ───────────────── 4.  pipelines + param grids  ─────────────────────────
from sklearn.impute          import SimpleImputer
from sklearn.preprocessing   import StandardScaler
from sklearn.decomposition   import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.svm             import SVC
from sklearn.pipeline        import Pipeline
from dask_ml.model_selection import GridSearchCV    # ← dask‑ml version

base_pre = [('impute', SimpleImputer(strategy='median')),
            ('scale',  StandardScaler())]

knn_pipe = Pipeline(base_pre + [('dr','passthrough'),
                                ('clf',KNeighborsClassifier())])

svm_pipe = Pipeline(base_pre + [('dr','passthrough'),
                                ('clf',SVC())])

knn_param = {
    'dr': [None,
           PCA(n_components=10, random_state=42),
           LDA(n_components=1),
           CumlUMAP(n_components=10, n_neighbors=30, random_state=42)],
    'clf__n_neighbors':[3,5,7,11],
    'clf__weights':    ['uniform','distance'],
    'clf__metric':     ['euclidean','manhattan']
}
svm_param = {
    'dr': [None,
           PCA(n_components=15, random_state=42),
           CumlUMAP(n_components=15, n_neighbors=30, random_state=42)],
    'clf__kernel':['linear','rbf'],
    'clf__C':     [0.1,1,10],
    'clf__gamma': ['scale','auto']
}

knn_gs = GridSearchCV(knn_pipe, knn_param, scoring='f1', cv=10,
                      n_jobs=-1, error_score='raise')
svm_gs = GridSearchCV(svm_pipe, svm_param, scoring='f1', cv=10,
                      n_jobs=-1, error_score='raise')

# ───────────────── 5.  run the grid searches  ───────────────────────────
# Name the training data explicitly. The original fitted on Xb / yb, which are
# loop variables leaked from the quick_score cell; after those loops finish
# they hold the random-undersampled training set, so that is what is used here.
X_grid, y_grid = X_resampled_rus, y_resampled_rus

print("▶ fitting k‑NN grid …")
knn_gs.fit(X_grid, y_grid)
print("▶ fitting SVM grid …")
svm_gs.fit(X_grid, y_grid)

print("✓ finished")
print("best k‑NN:", knn_gs.best_params_, "  F1 =", knn_gs.best_score_)
print("best SVM:", svm_gs.best_params_, "  F1 =", svm_gs.best_score_)

In [None]:
best_knn = knn_gs.best_estimator_
best_svm = svm_gs.best_estimator_

print("KNN best params:", knn_gs.best_params_)
print("SVM best params:", svm_gs.best_params_)

In [None]:
import seaborn as sns, matplotlib.pyplot as plt
from sklearn.metrics import (classification_report, confusion_matrix,
                             RocCurveDisplay)

def eval_model(mdl, label, X_eval=None, y_eval=None):
    """Print a classification report and plot confusion matrix and ROC curve.

    Parameters
    ----------
    mdl
        Fitted estimator exposing ``predict``.
    label : str
        Name used in the printed report and the plot titles.
    X_eval, y_eval : optional
        Held-out features and labels to evaluate on. When omitted, the
        notebook-level ``X_test`` / ``y_test`` are used.
    """
    # The split cell only creates train/test partitions (the validation
    # split is commented out), so X_val / y_val do not exist and the
    # original body raised NameError.
    if X_eval is None:
        X_eval = X_test
    if y_eval is None:
        y_eval = y_test

    y_pred = mdl.predict(X_eval)
    # Predictions coming back as a cupy array must be copied to host memory
    if hasattr(y_pred, "get"):
        y_pred = y_pred.get()

    print(f"\n{label}\n", classification_report(y_eval, y_pred, digits=3))

    sns.heatmap(confusion_matrix(y_eval, y_pred),
                annot=True, fmt='d', cmap='Blues')
    plt.title(f'{label} – Confusion')
    plt.show()

    RocCurveDisplay.from_estimator(mdl, X_eval, y_eval)
    plt.title(f'{label} – ROC')
    plt.show()

eval_model(best_knn, "Best KNN")
eval_model(best_svm, "Best SVM")

In [None]:
from sklearn.model_selection import cross_val_score

# Cross-validate on everything except the held-out test set. With the current
# two-way split that is exactly X_train / y_train; X_val / y_val are never
# created (the validation split is commented out), so concatenating with them
# raised NameError.
full_X = X_train
full_y = y_train

for mdl, name in [(best_knn, 'KNN'), (best_svm, 'SVM')]:
    scores = cross_val_score(mdl, full_X, full_y,
                             cv=cv, scoring='f1')      # n_jobs = 1
    print(f"{name} 10‑fold F1: {scores.mean():.3f} ± {scores.std():.3f}")

In [None]:
from sklearn.metrics import f1_score

for mdl, name in [(best_knn, 'KNN'), (best_svm, 'SVM')]:
    y_hat = mdl.predict(X_test)
    if hasattr(y_hat, "get"):
        y_hat = y_hat.get()
    print(name, "TEST F1 =", f1_score(y_test, y_hat))

In [None]:
import pathlib, joblib
pathlib.Path("models").mkdir(exist_ok=True)

# detach cupy arrays so joblib can pickle (safe‑guard)
for pipe in (best_knn, best_svm):
    if hasattr(pipe, "steps"):
        for name, step in pipe.steps:
            if hasattr(step, "release_cache"):    # many cuML objects
                step.release_cache()

joblib.dump(best_knn, "models/best_knn.pkl")
joblib.dump(best_svm, "models/best_svm.pkl")

In [None]:
(pd.DataFrame(knn_gs.cv_results_)
   .groupby('param_clf__n_neighbors')['mean_test_score']
   .mean()
   .plot(marker='o'))
plt.ylabel('Mean CV F1'); plt.xlabel('k'); plt.title('KNN: k vs F1'); plt.show()

---

# Karryn Leake
* Logistic Regression
* NaiveBayes

In [None]:
## Importing in needed packages for dimensionality reduction
from sklearn.metrics import precision_score, recall_score, accuracy_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.decomposition import PCA, KernelPCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
import matplotlib.patches as mpatches

In [None]:
### Creating base log model code

def train_log_model(X_train_values, y_train_values, X_test_values, y_test_values):
    """Fit a default logistic regression and print its hold-out metrics.

    Parameters
    ----------
    X_train_values, y_train_values
        Features and labels the model is fitted on.
    X_test_values, y_test_values
        Features and labels the fitted model is scored against.

    Prints the accuracy (two decimals) followed by the full classification
    report. Nothing is returned.
    """
    # Build and fit in one step, seeded with the notebook-wide RANDOM_STATE
    classifier = LogisticRegression(random_state=RANDOM_STATE).fit(
        X_train_values, y_train_values
    )

    # Score the fitted model on the supplied hold-out rows
    predictions = classifier.predict(X_test_values)
    accuracy = accuracy_score(y_test_values, predictions)
    report = classification_report(y_test_values, predictions)

    print(f"Our current model accuracy is {accuracy:.2f}.")
    print("\n")
    print("A more detailed report of the model's overall accuracy is:")
    print(report)

In [None]:
### Creating base naive bayes code

def train_nb_model(X_train_values, y_train_values, X_test_values, y_test_values):
    """Fit a Gaussian naive Bayes model and print its hold-out metrics.

    Parameters
    ----------
    X_train_values, y_train_values
        Features and labels the model is fitted on.
    X_test_values, y_test_values
        Features and labels the fitted model is scored against.

    Prints the accuracy (two decimals) followed by the full classification
    report. Nothing is returned.
    """
    classifier = GaussianNB().fit(X_train_values, y_train_values)

    # Score the fitted model on the supplied hold-out rows
    predictions = classifier.predict(X_test_values)
    accuracy = accuracy_score(y_test_values, predictions)
    report = classification_report(y_test_values, predictions)

    print(f"Our current model accuracy is {accuracy:.2f}.")
    print("\n")
    print("A more detailed report of the model's overall accuracy is:")
    print(report)

### Initial log model on oversampled and undersampled data

In [None]:
## Training log model using oversampled data
train_log_model(X_resampled_adasyn, y_resampled_adasyn, X_test, y_test)

In [None]:
## Training log model on undersampled data
train_log_model(X_resampled_rus, y_resampled_rus, X_test, y_test)

### Initial nb model on oversampled and undersampled data

In [None]:
## Training nb model on oversampled data
train_nb_model(X_resampled_adasyn, y_resampled_adasyn, X_test, y_test)

In [None]:
## Training nb model on undersampled data
train_nb_model(X_resampled_rus, y_resampled_rus, X_test, y_test)

## Experimenting with LDA

### Completing LDA on oversampled data

In [None]:
## Completing LDA on over sampled data

# Getting LDA with n_components = 1 as n_components = number of classes - 1
lda = LinearDiscriminantAnalysis(n_components = 1)

# Performing LDA on train and test data
X_train_lda = lda.fit_transform(X_resampled_adasyn, y_resampled_adasyn)
X_test_lda = lda.transform(X_test)


### Training log and naive bayes models on oversampled data with LDA

In [None]:
train_log_model(X_train_lda, y_resampled_adasyn, X_test_lda, y_test)

In [None]:
train_nb_model(X_train_lda, y_resampled_adasyn, X_test_lda, y_test)

## Experimenting with PCA

### Setting up PCA on oversampled data


In [None]:
## Choosing 1 as number of components and testing variance
pca = PCA(n_components = 1)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on naive bayes model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Choosing 2 as number of components and testing variance
pca = PCA(n_components = 2)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)


print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Choosing 3 as number of components and testing variance
pca = PCA(n_components = 3)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Choosing 4 as number of components and testing variance
pca = PCA(n_components = 4)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Choosing 5 as number of components and testing variance
pca = PCA(n_components = 5)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 6 as a random number of components

pca = PCA(n_components = 6)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 7 as a random number of components

pca = PCA(n_components = 7)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 8 as a random number of components

pca = PCA(n_components = 8)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 9 as a random number of components

pca = PCA(n_components = 9)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 10 as a random number of components

pca = PCA(n_components = 10)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 11 as a random number of components

pca = PCA(n_components = 11)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 12 as a random number of components

pca = PCA(n_components = 12)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
# Choosing 13 as a random number of components

pca = PCA(n_components = 13)

X_train_pca = pca.fit_transform(X_resampled_adasyn)
X_test_pca = pca.transform(X_test)

print(pca.explained_variance_ratio_)
print(sum(pca.explained_variance_ratio_))

In [None]:
## Testing on log model
train_log_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_pca, y_resampled_adasyn, X_test_pca, y_test)

## Experimenting with Kernel PCA

### Testing Kernel PCA on oversampled data

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 1, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 2, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 3, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 4, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 5, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 6, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 7, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 8, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 9, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Setting up Kernel PCA on train and test data

# Getting kernel PCA started
kernel_pca = KernelPCA(n_components = 10, kernel = 'rbf')

# Performing kernel PCA on train and test data
X_train_kernel = kernel_pca.fit_transform(X_resampled_adasyn)
X_test_kernel = kernel_pca.transform(X_test)

In [None]:
## Testing on log model
train_log_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

In [None]:
## Testing on nb model
train_nb_model(X_train_kernel, y_resampled_adasyn, X_test_kernel, y_test)

### Plotting LDA with Class Distinction

In [None]:
## Getting plot of LDA 1 and classifier value for train data

# Getting columns named properly 
# First 1000 resampled training points projected onto the single LDA axis
feature_1 = X_train_lda[:1000, 0]
# Dedicated name for the labels: assigning to `y` here would overwrite the
# target Series that the train/test split cell reads.
lda_labels = np.array(y_resampled_adasyn[:1000])
# Constant zero height so every point sits on one horizontal line; sized
# from feature_1 so the two arrays always match in length.
second_y = np.zeros(len(feature_1))

class_colors = {0: 'red', 1: 'blue'}

# Plotting data with class-based colors in a single vectorised call
# (one scatter call per point is very slow for 1000 points)
plt.scatter(feature_1, second_y, c=[class_colors[cls] for cls in lda_labels])

# Creating legend elements
legend_elements = [mpatches.Patch(color=color, label=cls) for cls, color in class_colors.items()]

# Adding labels, legends and making graph nice
plt.xlabel("LDA 1")

plt.title("LDA 1 with Class Distinction")

plt.legend(handles=legend_elements)

# Showing plot
plt.show()

## Model Selection: Hyperparameter Tuning and Cross Validation

Based on the above dimensionality reduction results, we will use LDA, PCA with 10 components, and PCA with 13 components to train/tune our models. LDA produced the highest metrics for both naive Bayes and logistic regression with the greatest amount of dimensionality reduction. PCA with 13 components produced relatively similar metrics while reducing the dataset to 13 columns that account for over `95%` of the overall variance. PCA with 10 components will be used to see whether, after tuning, the results are similar to our other two dimensionality reduction methods; it is also a fun experiment to see if we can cut roughly half of the columns. Finally, based on the kernel PCA results, we will not use kernel PCA as a dimensionality reducer, as it produced similar or worse model metrics for both naive Bayes and logistic regression and was more computationally expensive.

### Experimenting with GridSearchCV for logistic regression

#### Setting up Logistic Regression model using GridSearchCV

In [None]:
## Configure a grid search for logistic regression across every solver

# GridSearchCV runs the search; time is used to report how long it takes
from sklearn.model_selection import GridSearchCV
import time

log_model = LogisticRegression(random_state=RANDOM_STATE)

# Only l2 is searched here: it is the one penalty all five solvers accept
grid_param = dict(
    solver=['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    penalty=['l2'],
    C=[0.001, 0.0001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    max_iter=[1000],
)

start_time = time.time()

# 5-fold cross-validated accuracy, using all available cores
grid_search = GridSearchCV(
    log_model,
    grid_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

##### Fitting first Logistic Regression GridSearch on un-reduced data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not idle time since the grid was configured.
start_time = time.time()
grid_search.fit(X_resampled_adasyn, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Fitting first Logistic Regression GridSearch on LDA data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_lda, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Fitting first Logistic Regression GridSearch on PCA data with n = 13

In [None]:
# 13 components: the model-selection notes above report that these retain
# over 95% of the variance.
# random_state makes the projection reproducible if PCA picks a randomized solver.
pca_13 = PCA(n_components=13, random_state=RANDOM_STATE)
pca = pca_13  # `pca` stays bound, as before, for any cell that still refers to it

X_train_pca_13 = pca_13.fit_transform(X_resampled_adasyn)

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_13, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Fitting first Logistic Regression GridSearch on PCA data with n = 10

In [None]:
# 10 components: an experiment (see the model-selection notes above) in
# roughly halving the number of columns.
# random_state makes the projection reproducible if PCA picks a randomized solver.
pca_10 = PCA(n_components=10, random_state=RANDOM_STATE)
pca = pca_10  # `pca` stays bound, as before, for any cell that still refers to it

X_train_pca_10 = pca_10.fit_transform(X_resampled_adasyn)

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_10, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

In [None]:
## Second search: drop liblinear and also try an unpenalized model
log_model = LogisticRegression(random_state=RANDOM_STATE)

# Two sub-grids: C is the inverse regularization strength, so it only matters
# when a penalty is applied. Crossing penalty=None with every C value would
# refit the same unpenalized model nine times per solver.
grid_param = [
    {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'penalty': ['l2'],
        'C': [0.001, 0.0001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        'max_iter': [1000],
    },
    {
        'solver': ['newton-cg', 'lbfgs', 'sag', 'saga'],
        'penalty': [None],
        'max_iter': [1000],
    },
]

start_time = time.time()

# 5-fold cross-validated accuracy, using all available cores
grid_search = GridSearchCV(estimator=log_model,
                           param_grid=grid_param,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1,
                           verbose=3)

##### Completing second GridSearch on unreduced data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not idle time since the grid was configured.
start_time = time.time()
grid_search.fit(X_resampled_adasyn, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing second GridSearch on LDA data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_lda, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing second GridSearch on PCA n = 13 data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_13, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing second GridSearch on PCA n = 10 data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_10, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

In [None]:
## Third search: the l1 and elasticnet penalties with the saga solver
log_model = LogisticRegression(random_state=RANDOM_STATE)

# l1_ratio only affects the elasticnet penalty, so it gets its own sub-grid;
# crossing it with l1 would refit the same model once per l1_ratio value.
grid_param = [
    {
        'solver': ['saga'],
        'penalty': ['l1'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        'max_iter': [1000],
    },
    {
        'solver': ['saga'],
        'penalty': ['elasticnet'],
        'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
        # 0 and 1 are the pure-l2 and pure-l1 ends of the elastic-net mix
        'l1_ratio': [0, 0.5, 1],
        'max_iter': [1000],
    },
]

start_time = time.time()

# 5-fold cross-validated accuracy, using all available cores
grid_search = GridSearchCV(estimator=log_model,
                           param_grid=grid_param,
                           scoring='accuracy',
                           cv=5,
                           n_jobs=-1,
                           verbose=3)

##### Completing third GridSearch on unreduced data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not idle time since the grid was configured.
start_time = time.time()
grid_search.fit(X_resampled_adasyn, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing third GridSearch on LDA data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_lda, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing third GridSearch on PCA n = 13

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_13, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing third GridSearch on PCA n = 10 


In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_10, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

#### Testing all versions of liblinear solver

In [None]:
## Search both penalties (l1 and l2) with the liblinear solver
log_model = LogisticRegression(random_state=RANDOM_STATE)

# Hyperparameter grid: one solver, two penalties, eight regularization strengths
grid_param = dict(
    solver=['liblinear'],
    penalty=['l1', 'l2'],
    C=[0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000],
    max_iter=[1000],
)

start_time = time.time()

# 5-fold cross-validated accuracy, using all available cores
grid_search = GridSearchCV(
    log_model,
    grid_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

##### Completing GridSearch on unreduced dataset

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not idle time since the grid was configured.
start_time = time.time()
grid_search.fit(X_resampled_adasyn, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing GridSearch for LDA dataset

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_lda, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing GridSearch for PCA dataset with n = 13

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_13, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

##### Completing GridSearch for PCA dataset with n = 10

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same grid_search object.
start_time = time.time()
grid_search.fit(X_train_pca_10, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = grid_search.best_score_
best_params = grid_search.best_params_

print(best_score)
print(best_params)

### Hyperparameter Tuning/Cross Validation with Naive Bayes model

#### Creating GridSearch for Naive Bayes Model

In [None]:
## Configure a grid search over Gaussian naive Bayes smoothing and class priors
nb_model = GaussianNB()

# Hyperparameter grid; priors=None lets the model estimate priors from the data
grid_param = dict(
    var_smoothing=[1e-11, 1e-10, 1e-9],
    priors=[[0.9, 0.1], [0.8, 0.2], [0.7, 0.3], [0.6, 0.4], None],
)

start_time = time.time()

# 5-fold cross-validated accuracy, using all available cores
nb_grid_search = GridSearchCV(
    nb_model,
    grid_param,
    scoring='accuracy',
    cv=5,
    n_jobs=-1,
    verbose=3,
)

##### Testing on unreduced data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not idle time since the grid was configured.
start_time = time.time()
nb_grid_search.fit(X_resampled_adasyn, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = nb_grid_search.best_score_
best_params = nb_grid_search.best_params_

print(best_score)
print(best_params)

##### Testing on LDA data

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same nb_grid_search object.
start_time = time.time()
nb_grid_search.fit(X_train_lda, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = nb_grid_search.best_score_
best_params = nb_grid_search.best_params_

print(best_score)
print(best_params)

##### Testing on PCA data where n = 13

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same nb_grid_search object.
start_time = time.time()
nb_grid_search.fit(X_train_pca_13, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = nb_grid_search.best_score_
best_params = nb_grid_search.best_params_

print(best_score)
print(best_params)

##### Testing on PCA data where n = 10

In [None]:
# Restart the timer here so the reported duration covers only this search,
# not the earlier fits that share the same nb_grid_search object.
start_time = time.time()
nb_grid_search.fit(X_train_pca_10, y_resampled_adasyn)

elapsed_time = time.time() - start_time
print(f"The total elapsed time is: {elapsed_time}")

# Mean cross-validated score of the best candidate and its hyperparameters
best_score = nb_grid_search.best_score_
best_params = nb_grid_search.best_params_

print(best_score)
print(best_params)

---

# Cristian Zendejas
* RandomForestClassifier
* Decision Tree

In [None]:
from sklearn.decomposition import PCA
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.decomposition import KernelPCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.model_selection import GridSearchCV, cross_val_score
from sklearn.pipeline import Pipeline

### Performing PCA here with the 2 different resampled datasets. Both will also be used to train separate models for RandomForestClassifier & Decision Tree. The experiment for this section is to see which form of analysis (PCA, LDA, or Kernel PCA) produces the best model.

In [None]:
# look into a chart to display the current data we have?

In [None]:
# keep an eye on the accuracy for the base models and after the dimensionality reduction, Karryn said her accuracy did not change much

In [None]:
# did we decide to drop random under sampler since it didn't provide much value when using it in the models?

## Establishing some base models to compare later models to.

In [None]:
# Baseline random forest trained on the unreduced ADASYN training data.
# random_state fixes the forest's randomness so the baseline is reproducible.
rfc_base_model = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rfc_base_model.fit(X_resampled_adasyn, y_resampled_adasyn)

rfc_base_y_hat = rfc_base_model.predict(X_test)
print(rfc_base_y_hat)

# Macro-averaged metrics on the test set; accuracy comes from the estimator's score()
rfc_base_precision = precision_score(y_test, rfc_base_y_hat, average='macro', zero_division=0.0)
rfc_base_recall = recall_score(y_test, rfc_base_y_hat, average='macro', zero_division=0.0)
rfc_base_f1 = f1_score(y_test, rfc_base_y_hat, average='macro', zero_division=0.0)
rfc_base_accuracy = rfc_base_model.score(X_test, y_test)

print(f"RFC Precision: {rfc_base_precision:.2f}")
print(f"RFC Recall: {rfc_base_recall:.2f}")
print(f"RFC F1 Score: {rfc_base_f1:.2f}")
print(f"RFC Accuracy: {rfc_base_accuracy:.2f}")

In [None]:
# Baseline decision tree trained on the unreduced ADASYN training data.
# random_state fixes the tree's randomness so the baseline is reproducible.
dtc_base_model = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
dtc_base_model.fit(X_resampled_adasyn, y_resampled_adasyn)

dtc_base_y_hat = dtc_base_model.predict(X_test)

# Macro-averaged metrics on the test set; accuracy comes from the estimator's score()
dtc_base_precision = precision_score(y_test, dtc_base_y_hat, average='macro', zero_division=0.0)
dtc_base_recall = recall_score(y_test, dtc_base_y_hat, average='macro', zero_division=0.0)
dtc_base_f1 = f1_score(y_test, dtc_base_y_hat, average='macro', zero_division=0.0)
dtc_base_accuracy = dtc_base_model.score(X_test, y_test)

print(f"DTC Precision: {dtc_base_precision:.2f}")
print(f"DTC Recall: {dtc_base_recall:.2f}")
print(f"DTC F1 Score: {dtc_base_f1:.2f}")
print(f"DTC Accuracy: {dtc_base_accuracy:.2f}")

### I'm expecting the RandomForestClassifier to do better than the DecisionTreeClassifier, since a random forest is an ensemble of decision trees.

## Dimensionality Reduction

In [None]:
def find_best_explained_variance(model_class, X_train, y_train,
                                 max_components=14, variance_threshold=None):
    """Search component counts for a dimensionality reducer by explained variance.

    Fits ``model_class`` with ``n_components = 1, 2, ...`` and scores each fit.
    PCA and LDA are scored by the sum of ``explained_variance_ratio_``.
    KernelPCA (RBF kernel) is scored by the total variance of the transformed
    columns divided by the total variance of the input columns.

    Parameters
    ----------
    model_class : type
        ``PCA``, ``LinearDiscriminantAnalysis`` or ``KernelPCA`` (the class
        itself, not an instance).
    X_train : array-like of shape (n_samples, n_features)
        Training features.
    y_train : array-like of shape (n_samples,)
        Class labels; only used when ``model_class`` is LDA.
    max_components : int, default=14
        Largest component count to try. It is additionally capped at what the
        reducer can produce for this data, so small inputs no longer fail.
    variance_threshold : float or None, default=None
        If given, return the smallest component count whose score reaches the
        threshold (e.g. ``0.95``). If no count reaches it, or when ``None``,
        the count with the highest score is returned.

    Returns
    -------
    tuple
        ``(best_explained_variance, best_n_components)``. This is
        ``(-inf, None)`` when no component count could be tried.

    Raises
    ------
    ValueError
        If ``model_class`` is unsupported, ``max_components`` is below 1, or
        ``variance_threshold`` is outside ``(0, 1]``.
    """
    if max_components < 1:
        raise ValueError("max_components must be at least 1")
    if variance_threshold is not None and not 0 < variance_threshold <= 1:
        raise ValueError("variance_threshold must be in the interval (0, 1]")

    print(model_class)
    if model_class not in (PCA, LinearDiscriminantAnalysis, KernelPCA):
        raise ValueError("Unsupported model class")

    # Largest component count each reducer can be fitted with on this data
    n_samples, n_features = X_train.shape
    if model_class is PCA:
        component_cap = min(n_samples, n_features)
    elif model_class is LinearDiscriminantAnalysis:
        component_cap = min(n_features, len(np.unique(y_train)) - 1)
    else:
        component_cap = n_samples

    # The KernelPCA score's denominator does not depend on n, so compute it once
    if model_class is KernelPCA:
        input_variance = np.sum(np.var(X_train, axis=0))

    best_explained_variance = -np.inf  # negative infinity ensures the first score is kept
    best_n_components = None

    for n in range(1, min(max_components, component_cap) + 1):
        if model_class is PCA:
            model = PCA(n_components=n).fit(X_train)
            explained_variance = np.sum(model.explained_variance_ratio_)
        elif model_class is LinearDiscriminantAnalysis:
            model = LinearDiscriminantAnalysis(n_components=n).fit(X_train, y_train)
            explained_variance = np.sum(model.explained_variance_ratio_)
        else:
            X_train_transformed = KernelPCA(n_components=n, kernel='rbf').fit_transform(X_train)
            explained_variance = np.sum(np.var(X_train_transformed, axis=0)) / input_variance

        # Strict comparison: on a tie the smaller component count is kept
        if explained_variance > best_explained_variance:
            best_explained_variance = explained_variance
            best_n_components = n

        # Stop at the first count that is good enough instead of always running to the maximum
        if variance_threshold is not None and explained_variance >= variance_threshold:
            return explained_variance, n

    return best_explained_variance, best_n_components

In [None]:
# Score PCA on the ADASYN training data for each component count the helper tries
best_explained_variance, best_n_components = find_best_explained_variance(PCA, X_resampled_adasyn, y_resampled_adasyn)
print(f"Best explained variance: {best_explained_variance}")
print(f"Optimal number of components: {best_n_components}")

In [None]:
# Fit PCA on the ADASYN training data with the selected component count.
# random_state makes the projection reproducible if PCA picks a randomized solver.
pca_1 = PCA(n_components=best_n_components, random_state=RANDOM_STATE)
X_train_pca_adasyn = pca_1.fit_transform(X_resampled_adasyn)
print(X_train_pca_adasyn.shape)
# did not actually use adasyn for test set, just using the naming to keep track which variables belong with their respective algorithms
X_test_pca_adasyn = pca_1.transform(X_test)
print(X_test_pca_adasyn.shape)

In [None]:
# Variance explained by each retained component of the ADASYN PCA fit
pca_1.explained_variance_

In [None]:
# Share of the total variance explained by each retained component
pca_1.explained_variance_ratio_

In [None]:
# Total share of variance kept by the retained components
np.sum(pca_1.explained_variance_ratio_)

In [None]:
# Inspect the PCA-transformed ADASYN training data
print(X_train_pca_adasyn)

### 14 components seems to get me the closest to .95 or greater variance.

In [None]:
# Same PCA component search, this time on the X_resampled_rus / y_resampled_rus training data
best_explained_variance, best_n_components = find_best_explained_variance(PCA, X_resampled_rus, y_resampled_rus)
print(f"Best explained variance: {best_explained_variance}")
print(f"Optimal number of components: {best_n_components}")

In [None]:
# Fit PCA on the X_resampled_rus training data with the selected component count.
# random_state makes the projection reproducible if PCA picks a randomized solver.
pca_2 = PCA(n_components=best_n_components, random_state=RANDOM_STATE)
X_train_pca_rus = pca_2.fit_transform(X_resampled_rus)
print(X_train_pca_rus.shape)
# did not actually use rus for test set, just using the naming to keep track which variables belong with their respective algorithms
X_test_pca_rus = pca_2.transform(X_test)
print(X_test_pca_rus.shape)

In [None]:
# Variance explained by each retained component of the pca_2 fit
pca_2.explained_variance_

In [None]:
# Share of the total variance explained by each retained component
pca_2.explained_variance_ratio_

In [None]:
# Total share of variance kept by the retained components
np.sum(pca_2.explained_variance_ratio_)

In [None]:
# Inspect the PCA-transformed X_resampled_rus training data
print(X_train_pca_rus)

### RandomForestClassifier models with one using the ADASYN training data and the other using the RandomUnderSampler data

In [None]:
# 'entropy' (information gain) is a valid split criterion; scikit-learn's default is 'gini'.
# random_state fixes the forest's randomness so the run is reproducible.
rfc_pca_1_model = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rfc_pca_1_model.fit(X_train_pca_adasyn, y_resampled_adasyn)

In [None]:
# Predict test labels from the test set projected with pca_1
rfc_pca_1_y_hat = rfc_pca_1_model.predict(X_test_pca_adasyn)
print(rfc_pca_1_y_hat)

In [None]:
# Macro-averaged test metrics for the random forest on the ADASYN PCA features
rfc_pca_1_precision = precision_score(y_test, rfc_pca_1_y_hat, average='macro', zero_division=0.0)
rfc_pca_1_recall = recall_score(y_test, rfc_pca_1_y_hat, average='macro', zero_division=0.0)
rfc_pca_1_f1 = f1_score(y_test, rfc_pca_1_y_hat, average='macro', zero_division=0.0)
rfc_pca_1_accuracy = rfc_pca_1_model.score(X_test_pca_adasyn, y_test)

print(f"RFC Precision: {rfc_pca_1_precision:.2f}")
print(f"RFC Recall: {rfc_pca_1_recall:.2f}")
print(f"RFC F1 Score: {rfc_pca_1_f1:.2f}")
print(f"RFC Accuracy: {rfc_pca_1_accuracy:.2f}")

In [None]:
# 'entropy' (information gain) is a valid split criterion; scikit-learn's default is 'gini'.
# random_state fixes the forest's randomness so the run is reproducible.
rfc_pca_2_model = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rfc_pca_2_model.fit(X_train_pca_rus, y_resampled_rus)

In [None]:
# Predict test labels from the test set projected with pca_2
rfc_pca_2_y_hat = rfc_pca_2_model.predict(X_test_pca_rus)

In [None]:
# Macro-averaged test metrics for the random forest on the pca_2 (rus) features
rfc_pca_2_precision = precision_score(y_test, rfc_pca_2_y_hat, average='macro', zero_division=0.0)
rfc_pca_2_recall = recall_score(y_test, rfc_pca_2_y_hat, average='macro', zero_division=0.0)
rfc_pca_2_f1 = f1_score(y_test, rfc_pca_2_y_hat, average='macro', zero_division=0.0)
rfc_pca_2_accuracy = rfc_pca_2_model.score(X_test_pca_rus, y_test)

print(f"RFC Precision: {rfc_pca_2_precision:.2f}")
print(f"RFC Recall: {rfc_pca_2_recall:.2f}")
print(f"RFC F1 Score: {rfc_pca_2_f1:.2f}")
print(f"RFC Accuracy: {rfc_pca_2_accuracy:.2f}")

### Looking at the metrics, there isn't much of a difference between using RandomUnderSampling and ADASYN. I may decide not to use RandomUnderSampling for future models.

### Let's check with the decision tree models to see if there are any improvements. I'll use the ADASYN and RandomUnderSampling data sets again.

In [None]:
# random_state fixes the tree's randomness so the run is reproducible
dtc_pca_1_model = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
dtc_pca_1_model.fit(X_train_pca_adasyn, y_resampled_adasyn)

In [None]:
# Predict test labels from the test set projected with pca_1
dtc_pca_1_y_hat = dtc_pca_1_model.predict(X_test_pca_adasyn)

In [None]:
# Macro-averaged test metrics for the decision tree on the ADASYN PCA features
dtc_pca_1_precision = precision_score(y_test, dtc_pca_1_y_hat, average='macro', zero_division=0.0)
dtc_pca_1_recall = recall_score(y_test, dtc_pca_1_y_hat, average='macro', zero_division=0.0)
dtc_pca_1_f1 = f1_score(y_test, dtc_pca_1_y_hat, average='macro', zero_division=0.0)
dtc_pca_1_accuracy = dtc_pca_1_model.score(X_test_pca_adasyn, y_test)

print(f"DTC Precision: {dtc_pca_1_precision:.2f}")
print(f"DTC Recall: {dtc_pca_1_recall:.2f}")
print(f"DTC F1 Score: {dtc_pca_1_f1:.2f}")
print(f"DTC Accuracy: {dtc_pca_1_accuracy:.2f}")

In [None]:
# random_state fixes the tree's randomness so the run is reproducible
dtc_pca_2_model = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
dtc_pca_2_model.fit(X_train_pca_rus, y_resampled_rus)

In [None]:
# Predict test labels from the test set projected with pca_2
dtc_pca_2_y_hat = dtc_pca_2_model.predict(X_test_pca_rus)

In [None]:
# Macro-averaged test metrics for the decision tree on the pca_2 (rus) features
dtc_pca_2_precision = precision_score(y_test, dtc_pca_2_y_hat, average='macro', zero_division=0.0)
dtc_pca_2_recall = recall_score(y_test, dtc_pca_2_y_hat, average='macro', zero_division=0.0)
dtc_pca_2_f1 = f1_score(y_test, dtc_pca_2_y_hat, average='macro', zero_division=0.0)
dtc_pca_2_accuracy = dtc_pca_2_model.score(X_test_pca_rus, y_test)

print(f"DTC Precision: {dtc_pca_2_precision:.2f}")
print(f"DTC Recall: {dtc_pca_2_recall:.2f}")
print(f"DTC F1 Score: {dtc_pca_2_f1:.2f}")
print(f"DTC Accuracy: {dtc_pca_2_accuracy:.2f}")

### Using the RandomUnderSampling technique again didn't provide much benefit here.

---

In [None]:
# Component search for LDA; the helper skips any count above min(n_features, n_classes - 1)
best_explained_variance, best_n_components = find_best_explained_variance(LinearDiscriminantAnalysis, X_resampled_adasyn, y_resampled_adasyn)
print(f"Best explained variance: {best_explained_variance}")
print(f"Optimal number of components: {best_n_components}")

In [None]:
# LDA reducer configured with the component count found by the search above
lda_1 = LinearDiscriminantAnalysis(n_components=best_n_components)

In [None]:
# Fit LDA on the ADASYN training data (it is supervised, so the labels are passed),
# then apply the same fitted projection to the test set
X_train_lda_adasyn = lda_1.fit_transform(X_resampled_adasyn, y_resampled_adasyn)
X_test_lda_adasyn = lda_1.transform(X_test)
print(X_train_lda_adasyn.shape)
print(X_test_lda_adasyn.shape)

In [None]:
# need to update these
# plt.figure(figsize=(10, 8))
# plt.scatter(X_train_lda_adasyn, y_resampled_adasyn, c=y_resampled_adasyn, cmap='tab20')

# plt.xlabel('LDA Component')
# plt.ylabel('Class')
# plt.title('LDA Component 1 vs Class')
# plt.colorbar(label='Class')
# plt.show()

In [None]:
# random_state fixes the forest's randomness so the run is reproducible
rfc_lda_1_model = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rfc_lda_1_model.fit(X_train_lda_adasyn, y_resampled_adasyn)

In [None]:
# Predict test labels from the LDA-projected test set
rfc_lda_1_y_hat = rfc_lda_1_model.predict(X_test_lda_adasyn)
print(rfc_lda_1_y_hat)

In [None]:
# Macro-averaged test metrics for the random forest on the LDA features
rfc_lda_1_precision = precision_score(y_test, rfc_lda_1_y_hat, average='macro', zero_division=0.0)
rfc_lda_1_recall = recall_score(y_test, rfc_lda_1_y_hat, average='macro', zero_division=0.0)
rfc_lda_1_f1 = f1_score(y_test, rfc_lda_1_y_hat, average='macro', zero_division=0.0)
rfc_lda_1_accuracy = rfc_lda_1_model.score(X_test_lda_adasyn, y_test)

print(f"RFC Precision: {rfc_lda_1_precision:.2f}")
print(f"RFC Recall: {rfc_lda_1_recall:.2f}")
print(f"RFC F1 Score: {rfc_lda_1_f1:.2f}")
print(f"RFC Accuracy: {rfc_lda_1_accuracy:.2f}")

In [None]:
# random_state fixes the tree's randomness so the run is reproducible
dtc_lda_1_model = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
dtc_lda_1_model.fit(X_train_lda_adasyn, y_resampled_adasyn)

In [None]:
# Predict test labels from the LDA-projected test set
dtc_lda_1_y_hat = dtc_lda_1_model.predict(X_test_lda_adasyn)

In [None]:
# Macro-averaged test metrics for the decision tree on the LDA features
dtc_lda_1_precision = precision_score(y_test, dtc_lda_1_y_hat, average='macro', zero_division=0.0)
dtc_lda_1_recall = recall_score(y_test, dtc_lda_1_y_hat, average='macro', zero_division=0.0)
dtc_lda_1_f1 = f1_score(y_test, dtc_lda_1_y_hat, average='macro', zero_division=0.0)
dtc_lda_1_accuracy = dtc_lda_1_model.score(X_test_lda_adasyn, y_test)

print(f"DTC Precision: {dtc_lda_1_precision:.2f}")
print(f"DTC Recall: {dtc_lda_1_recall:.2f}")
print(f"DTC F1 Score: {dtc_lda_1_f1:.2f}")
print(f"DTC Accuracy: {dtc_lda_1_accuracy:.2f}")

### With LDA proving to have lower metrics for both the RandomForestClassifier and DecisionTreeClassifiers, I do not believe using LinearDiscriminantAnalysis with the RandomUnderSampling technique will be much better so I'll go straight to Kernel PCA.

In [None]:
# a chart here to display the metrics for both?

---

In [None]:
# pipeline = Pipeline([
#     ('kpca', KernelPCA(kernel='rbf')),
#     ('rfc', RandomForestClassifier(criterion='entropy'))
# ])

# n = np.arange(1,11)
# param_grid = {
#     'kpca__n_components': n,
# }

# grid_search = GridSearchCV(pipeline, param_grid)
# grid_search.fit(X_resampled_adasyn, y_resampled_adasyn)

# best_params = grid_search.best_params_
# print(f"Optimal parameters: {best_params}")

# kernel_pca_optimal = KernelPCA(n_components=best_params['kpca__n_components'], kernel='rbf')
# X_train_kpca_adasyn = kernel_pca_optimal.fit_transform(X_resampled_adasyn)
# X_test_kpca_adasyn = kernel_pca_optimal.transform(X_test)

# rfc_kpca_model = RandomForestClassifier(criterion='entropy')
# rfc_kpca_model.fit(X_train_kpca_adasyn, y_resampled_adasyn)

# cv_scores = cross_val_score(rfc_kpca_model, X_train_kpca_adasyn, y_resampled_adasyn)
# print(f"Cross-validation scores: {cv_scores}")
# print(f"Mean cross-validation score: {cv_scores.mean()}")

# test_score = rfc_kpca_model.score(X_test_kpca_adasyn, y_test)
# print(f"Test set score: {test_score}")


In [None]:
# kernel_pca_1 = KernelPCA(n_components=2, kernel='rbf')
# X_train_kpca_adasyn = kernel_pca_1.fit_transform(X_resampled_adasyn)
# X_test_kpca_adasyn = kernel_pca_1.transform(X_test)
# print(X_train_kpca_adasyn.shape)
# print(X_test_kpca_adasyn.shape)

In [None]:
# Component search for RBF KernelPCA; the helper scores each count by the variance of the
# transformed columns relative to the variance of the input columns
best_explained_variance, best_n_components = find_best_explained_variance(KernelPCA, X_resampled_adasyn, y_resampled_adasyn)
print(f"Best explained variance: {best_explained_variance}")
print(f"Optimal number of components: {best_n_components}")

In [None]:
# plt.figure(figsize=(10, 8))
# plt.scatter(X_train_kpca_adasyn, X_train_kpca_adasyn, c=y_resampled_adasyn, cmap='tab20')

# plt.xlabel('Kernel PCA Component 1')
# plt.ylabel('Kernel PCA Component 2')
# plt.title('Kernel PCA Component vs Class')
# plt.colorbar(label='Class')
# plt.show()

In [None]:
# Fit RBF KernelPCA on the ADASYN training data with the selected component count, then
# apply the same fitted projection to the test set.
# random_state keeps the result reproducible when an iterative eigensolver is chosen.
optimal_kpca = KernelPCA(n_components=best_n_components, kernel='rbf', random_state=RANDOM_STATE)
X_train_kpca_adasyn = optimal_kpca.fit_transform(X_resampled_adasyn)
X_test_kpca_adasyn = optimal_kpca.transform(X_test)
print(X_train_kpca_adasyn.shape)
print(X_test_kpca_adasyn.shape)

In [None]:
# random_state fixes the forest's randomness so the run is reproducible
rfc_kpca_model = RandomForestClassifier(criterion='entropy', random_state=RANDOM_STATE)
rfc_kpca_model.fit(X_train_kpca_adasyn, y_resampled_adasyn)

In [None]:
# Predict test labels from the KernelPCA-projected test set
rfc_kpca_y_hat = rfc_kpca_model.predict(X_test_kpca_adasyn)
print(rfc_kpca_y_hat)

In [None]:
# Macro-averaged test metrics for the random forest on the KernelPCA features
rfc_kpca_precision = precision_score(y_test, rfc_kpca_y_hat, average='macro', zero_division=0.0)
rfc_kpca_recall = recall_score(y_test, rfc_kpca_y_hat, average='macro', zero_division=0.0)
rfc_kpca_f1 = f1_score(y_test, rfc_kpca_y_hat, average='macro', zero_division=0.0)
rfc_kpca_accuracy = rfc_kpca_model.score(X_test_kpca_adasyn, y_test)

print(f"RFC Precision: {rfc_kpca_precision:.2f}")
print(f"RFC Recall: {rfc_kpca_recall:.2f}")
print(f"RFC F1 Score: {rfc_kpca_f1:.2f}")
print(f"RFC Accuracy: {rfc_kpca_accuracy:.2f}")

In [None]:
# random_state fixes the tree's randomness so the run is reproducible
dtc_kpca_model = DecisionTreeClassifier(criterion='entropy', random_state=RANDOM_STATE)
dtc_kpca_model.fit(X_train_kpca_adasyn, y_resampled_adasyn)

In [None]:
# Predict test labels from the KernelPCA-projected test set
dtc_kpca_y_hat = dtc_kpca_model.predict(X_test_kpca_adasyn)

In [None]:
# Macro-averaged test metrics for the decision tree on the KernelPCA features
dtc_kpca_precision = precision_score(y_test, dtc_kpca_y_hat, average='macro', zero_division=0.0)
dtc_kpca_recall = recall_score(y_test, dtc_kpca_y_hat, average='macro', zero_division=0.0)
dtc_kpca_f1 = f1_score(y_test, dtc_kpca_y_hat, average='macro', zero_division=0.0)
dtc_kpca_accuracy = dtc_kpca_model.score(X_test_kpca_adasyn, y_test)

print(f"DTC Precision: {dtc_kpca_precision:.2f}")
print(f"DTC Recall: {dtc_kpca_recall:.2f}")
print(f"DTC F1 Score: {dtc_kpca_f1:.2f}")
print(f"DTC Accuracy: {dtc_kpca_accuracy:.2f}")

### Both models did not do well with KPCA, which means I will want to stick with just PCA outcomes.

### Could use this code to compare all the models accuracies.

In [None]:
# need to update it still
# Accuracy per model, in the order the points are drawn on the chart.
# NOTE(review): the lr_accuracy* names are not defined in this part of the
# notebook - confirm they exist before running this cell.
accuracies = {
    'Logistic Regression': lr_accuracy,
    'PCA 1': lr_accuracy_2,
    'PCA 2': lr_accuracy_3,
    'LDA': lr_accuracy_4,
    'KPCA': lr_accuracy_5
}
model_names = list(accuracies)
accuracy_values = list(accuracies.values())

plt.figure(figsize=(12, 6))
plt.plot(model_names, accuracy_values, marker='o', linestyle='-', linewidth=2, markersize=8)

plt.title('Model Accuracy Comparison', fontsize=14)
plt.xlabel('Models', fontsize=12)
plt.ylabel('Accuracy Score', fontsize=12)
plt.grid(True, linestyle='--', alpha=0.7)

# Write each accuracy just above its marker
for position, value in enumerate(accuracy_values):
    plt.text(position, value, f'{value:.3f}', ha='center', va='bottom', fontsize=10)

plt.ylim(0, 1.0)  # Assuming accuracy values are between 0 and 1
plt.xticks(rotation=45)
plt.legend(['Accuracy'])
plt.tight_layout()
plt.show()