In [None]:
!pip install catboost

In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import warnings
import nltk
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, ConfusionMatrixDisplay, RocCurveDisplay
from sklearn.preprocessing import OneHotEncoder
from catboost import CatBoostClassifier
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.metrics import roc_auc_score

In [2]:
# NLTK's built-in English stop-word list.
# NOTE(review): custom_stopwords is not referenced by any other visible cell;
# the vectorizer only receives stopword_list. Confirm whether the two lists
# were meant to be combined.
custom_stopwords = nltk.corpus.stopwords.words('english')

# Hand-picked tokens removed from the free-text columns before TF-IDF.
stopword_list = [
    'moderna', 'flu', 'mrna', 'vaccine', 'the', 'patient', 'pfizer',
    'biontech', 'nan', 'none', 'mg', 'medical', 'history', 'allergy',
    'year', 'old', 'uk', 'nkda', 'nka', 'known', 'reported', 'listed',
]

In [3]:
# TF-IDF over word bigrams only (ngram_range=(2, 2)), capped at 5000 features.
# The token pattern also keeps single-character tokens, and only the custom
# stopword_list is filtered out.
text_vectorizer = TfidfVectorizer(
    lowercase=True,
    token_pattern=r"(?u)\b\w+\b",
    stop_words=stopword_list,
    ngram_range=(2, 2),
    max_features=5000,
)
textTransformer_0 = Pipeline(steps=[('text_bow', text_vectorizer)])

In [4]:
# Column-wise preprocessing: the same TF-IDF recipe for each free-text column
# and standard scaling for the two numeric columns.
# NOTE(review): remainder='passthrough' forwards every other column unchanged
# to the classifier (per the printed feature list this includes DIED,
# L_THREAT, HOSPITAL, DISABLE, RECOVD and BIRTH_DEFECT) - confirm that none of
# them leaks the SERIOUS target.
_text_columns = ['SYMPTOM_TEXT', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY', 'ALLERGIES']
_text_steps = [
    (f'text{position}', textTransformer_0, column)
    for position, column in enumerate(_text_columns, start=1)
]
_numeric_step = ('num', StandardScaler(), ['AGE_YRS', 'HOSPDAYS'])

tfidfprocess = ColumnTransformer(
    transformers=_text_steps + [_numeric_step],
    remainder='passthrough',
)

In [9]:
# Load varzos_ML.csv into a DataFrame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
varzos_csv_path = r'C:/Users/chand/Downloads/606/varzos_ML.csv'
varzos_clean = pd.read_csv(varzos_csv_path)

In [10]:
# One-hot encode VAX_MANU (vaccine manufacturer column) and drop the original.
# get_dummies(columns=...) appends the VAX_MANU_* indicator columns after the
# remaining columns, which is what the previous merge-on-index + drop produced.
# The guard makes the cell safe to re-run: once VAX_MANU has been replaced by
# its dummies there is nothing left to encode.
if 'VAX_MANU' in varzos_clean.columns:
    varzos_clean = pd.get_dummies(varzos_clean, columns=['VAX_MANU'], prefix='VAX_MANU')

In [13]:
from sklearn.model_selection import train_test_split

# Target is SERIOUS; the features are every other column except SYMPTOM1-5.
y = varzos_clean['SERIOUS']
X = varzos_clean.drop(
    columns=['SERIOUS', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
)

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=200,
    stratify=y,
)

print(f'Training examples: {X_train.shape[0]:,}')
print(f'Test examples: {X_test.shape[0]:,}')

Training examples: 40,589
Test examples: 17,396


In [15]:
# Show which feature columns reach the preprocessing step.
feature_columns = X_train.columns
print(feature_columns)

Index(['AGE_YRS', 'SEX', 'SYMPTOM_TEXT', 'DIED', 'L_THREAT', 'HOSPITAL',
       'HOSPDAYS', 'DISABLE', 'RECOVD', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY',
       'BIRTH_DEFECT', 'ALLERGIES', 'VAX_MANU_GLAXOSMITHKLINE BIOLOGICALS',
       'VAX_MANU_MERCK & CO. INC.'],
      dtype='object')


In [21]:
from sklearn.utils.class_weight import compute_class_weight

# Function for threshold adjustment
def adjust_threshold(probabilities, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 labels with a custom cut-off.

    Parameters
    ----------
    probabilities : array-like
        Either a 2-D array as returned by ``predict_proba`` (column 1 is read
        as the positive-class probability) or a 1-D sequence that already
        holds the positive-class probabilities.
    threshold : float, default 0.5
        A row is labelled 1 when its positive-class probability is greater
        than or equal to this value.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one per input row.
    """
    # np.asarray lets lists and other array-likes be indexed like an ndarray.
    proba = np.asarray(probabilities)
    positive_proba = proba if proba.ndim == 1 else proba[:, 1]
    return (positive_proba >= threshold).astype(int)

# Compute "balanced" class weights for the training labels.
# NOTE(review): class_weight / class_weight_dict are not passed to any model in
# this cell; they are kept only in case other cells read them.
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weight))

# Calculate scale_pos_weight for CatBoost: the negative-to-positive ratio of
# the training labels.
num_positive = np.sum(y_train == 1)
num_negative = np.sum(y_train == 0)
scale_pos_weight = num_negative / num_positive

# CatBoost pipeline including preprocessing
pipeline_catboost = Pipeline(steps=[
    ('tfidf', tfidfprocess),
    ('classifier', CatBoostClassifier(random_state=200, scale_pos_weight=scale_pos_weight, verbose=0))
])

# Fit the CatBoost pipeline
pipeline_catboost.fit(X_train, y_train)

# Predict with the standard threshold
y_pred_catboost = pipeline_catboost.predict(X_test)

# Evaluate the CatBoost model
print("\nCatBoost Model (Default Threshold):")
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))

# Threshold adjustment: reuse the preprocessing and classifier that were just
# fitted inside the pipeline instead of re-fitting tfidfprocess and training a
# second, identically configured CatBoost model. This avoids doubling the
# training time and guarantees the thresholded metrics describe the very model
# that is saved later. The module-level names below are kept for other cells.
fitted_preprocessor = pipeline_catboost.named_steps['tfidf']
X_train_transformed = fitted_preprocessor.transform(X_train)
X_test_transformed = fitted_preprocessor.transform(X_test)
catboost_model = pipeline_catboost.named_steps['classifier']

# Predict probabilities and adjust the threshold
# NOTE(review): 0.3 is evaluated directly on the test split; if it was tuned
# there, the metrics below are optimistic - consider a validation split.
threshold = 0.3  # Custom threshold
y_pred_threshold_catboost = adjust_threshold(catboost_model.predict_proba(X_test_transformed), threshold)

# Evaluate the CatBoost model with the adjusted threshold
print("\nCatBoost Model with Threshold Adjustment:")
print("Accuracy:", accuracy_score(y_test, y_pred_threshold_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_threshold_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_threshold_catboost))


CatBoost Model (Default Threshold):
Accuracy: 0.9552770751896987
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.98     15843
           1       0.73      0.78      0.76      1553

    accuracy                           0.96     17396
   macro avg       0.86      0.88      0.87     17396
weighted avg       0.96      0.96      0.96     17396

Confusion Matrix:
 [[15402   441]
 [  337  1216]]

CatBoost Model with Threshold Adjustment:
Accuracy: 0.9137157967348816
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.92      0.95     15843
           1       0.51      0.84      0.63      1553

    accuracy                           0.91     17396
   macro avg       0.75      0.88      0.79     17396
weighted avg       0.94      0.91      0.92     17396

Confusion Matrix:
 [[14592  1251]
 [  250  1303]]


In [22]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) to disk.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
varzos_model_path = 'C:/Users/chand/Downloads/606/varzos.sav'
with open(varzos_model_path, mode='wb') as model_file:
    pickle.dump(pipeline_catboost, model_file)

In [9]:
# Load covid_ML.csv into a DataFrame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
covid_csv_path = r'C:/Users/chand/Downloads/606/covid_ML.csv'
covid_clean = pd.read_csv(covid_csv_path)

In [10]:
# One-hot encode VAX_MANU (vaccine manufacturer column) and drop the original.
# get_dummies(columns=...) appends the VAX_MANU_* indicator columns after the
# remaining columns, which is what the previous merge-on-index + drop produced.
# The guard makes the cell safe to re-run: once VAX_MANU has been replaced by
# its dummies there is nothing left to encode.
if 'VAX_MANU' in covid_clean.columns:
    covid_clean = pd.get_dummies(covid_clean, columns=['VAX_MANU'], prefix='VAX_MANU')

In [11]:
from sklearn.model_selection import train_test_split

# Target is SERIOUS; the features are every other column except SYMPTOM1-5.
y = covid_clean['SERIOUS']
X = covid_clean.drop(
    columns=['SERIOUS', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
)

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=200,
    stratify=y,
)

print(f'Training examples: {X_train.shape[0]:,}')
print(f'Test examples: {X_test.shape[0]:,}')

Training examples: 620,627
Test examples: 265,983


In [12]:
# Show which feature columns reach the preprocessing step.
feature_columns = X_train.columns
print(feature_columns)

Index(['AGE_YRS', 'SEX', 'SYMPTOM_TEXT', 'DIED', 'L_THREAT', 'HOSPITAL',
       'HOSPDAYS', 'DISABLE', 'RECOVD', 'OTHER_MEDS', 'CUR_ILL', 'HISTORY',
       'BIRTH_DEFECT', 'ALLERGIES', 'VAX_MANU_JANSSEN', 'VAX_MANU_MODERNA',
       'VAX_MANU_NOVAVAX', 'VAX_MANU_PFIZER\BIONTECH'],
      dtype='object')


In [13]:
from sklearn.utils.class_weight import compute_class_weight

# Function for threshold adjustment
def adjust_threshold(probabilities, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 labels with a custom cut-off.

    Parameters
    ----------
    probabilities : array-like
        Either a 2-D array as returned by ``predict_proba`` (column 1 is read
        as the positive-class probability) or a 1-D sequence that already
        holds the positive-class probabilities.
    threshold : float, default 0.5
        A row is labelled 1 when its positive-class probability is greater
        than or equal to this value.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one per input row.
    """
    # np.asarray lets lists and other array-likes be indexed like an ndarray.
    proba = np.asarray(probabilities)
    positive_proba = proba if proba.ndim == 1 else proba[:, 1]
    return (positive_proba >= threshold).astype(int)

# "Balanced" class weights for the training labels, keyed by class label.
# NOTE(review): these weights are not passed to the model in this cell;
# confirm whether they are still needed.
class_weight = compute_class_weight(class_weight='balanced', classes=np.unique(y_train), y=y_train)
class_weight_dict = dict(zip(np.unique(y_train), class_weight))

# Negative-to-positive ratio of the training labels, handed to CatBoost as
# scale_pos_weight.
num_positive = np.sum(y_train == 1)
num_negative = np.sum(y_train == 0)
scale_pos_weight = num_negative / num_positive

# Preprocessing and CatBoost chained into a single estimator, then fitted on
# the training split.
pipeline_catboost = Pipeline(
    steps=[
        ('tfidf', tfidfprocess),
        (
            'classifier',
            CatBoostClassifier(random_state=200, scale_pos_weight=scale_pos_weight, verbose=0),
        ),
    ]
)
pipeline_catboost.fit(X_train, y_train)

# Hard labels from the pipeline's own predict() (no custom threshold).
y_pred_catboost = pipeline_catboost.predict(X_test)

# Report test-set metrics for the default-threshold predictions.
print("\nCatBoost Model (Default Threshold):")
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))




CatBoost Model (Default Threshold):
Accuracy: 0.9350935962072764
Classification Report:
               precision    recall  f1-score   support

           0       0.95      0.97      0.96    203584
           1       0.88      0.84      0.86     62399

    accuracy                           0.94    265983
   macro avg       0.92      0.90      0.91    265983
weighted avg       0.93      0.94      0.93    265983

Confusion Matrix:
 [[196517   7067]
 [ 10197  52202]]


In [18]:
# Threshold adjustment: reuse the preprocessing and classifier already fitted
# inside pipeline_catboost instead of re-fitting tfidfprocess and training a
# second, identically configured CatBoost model. This avoids a second full
# training run and guarantees the thresholded metrics describe the very model
# that is saved later. The module-level names below are kept for other cells.
fitted_preprocessor = pipeline_catboost.named_steps['tfidf']
X_train_transformed = fitted_preprocessor.transform(X_train)
X_test_transformed = fitted_preprocessor.transform(X_test)
catboost_model = pipeline_catboost.named_steps['classifier']

# Predict probabilities and adjust the threshold
# NOTE(review): 0.3 is evaluated directly on the test split; if it was tuned
# there, the metrics below are optimistic - consider a validation split.
threshold = 0.3  # Custom threshold
y_pred_threshold_catboost = adjust_threshold(catboost_model.predict_proba(X_test_transformed), threshold)

# Evaluate the CatBoost model with the adjusted threshold
print("\nCatBoost Model with Threshold Adjustment:")
print("Accuracy:", accuracy_score(y_test, y_pred_threshold_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_threshold_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_threshold_catboost))


CatBoost Model with Threshold Adjustment:
Accuracy: 0.8945233341980503
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.89      0.93    203584
           1       0.72      0.90      0.80     62399

    accuracy                           0.89    265983
   macro avg       0.84      0.89      0.86    265983
weighted avg       0.91      0.89      0.90    265983

Confusion Matrix:
 [[182044  21540]
 [  6515  55884]]


In [19]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) to disk.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
covid_model_path = 'C:/Users/chand/Downloads/606/covid.sav'
with open(covid_model_path, mode='wb') as model_file:
    pickle.dump(pipeline_catboost, model_file)

In [23]:
# Load ppv_ML.csv into a DataFrame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
ppv_csv_path = r'C:/Users/chand/Downloads/606/ppv_ML.csv'
ppv_clean = pd.read_csv(ppv_csv_path)

In [24]:
# One-hot encode VAX_MANU (vaccine manufacturer column) and drop the original.
# get_dummies(columns=...) appends the VAX_MANU_* indicator columns after the
# remaining columns, which is what the previous merge-on-index + drop produced.
# The guard makes the cell safe to re-run: once VAX_MANU has been replaced by
# its dummies there is nothing left to encode.
if 'VAX_MANU' in ppv_clean.columns:
    ppv_clean = pd.get_dummies(ppv_clean, columns=['VAX_MANU'], prefix='VAX_MANU')

In [25]:
from sklearn.model_selection import train_test_split

# Target is SERIOUS; the features are every other column except SYMPTOM1-5.
y = ppv_clean['SERIOUS']
X = ppv_clean.drop(
    columns=['SERIOUS', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
)

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=200,
    stratify=y,
)

print(f'Training examples: {X_train.shape[0]:,}')
print(f'Test examples: {X_test.shape[0]:,}')

Training examples: 10,119
Test examples: 4,337


In [26]:
from sklearn.utils.class_weight import compute_class_weight

# Function for threshold adjustment
def adjust_threshold(probabilities, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 labels with a custom cut-off.

    Parameters
    ----------
    probabilities : array-like
        Either a 2-D array as returned by ``predict_proba`` (column 1 is read
        as the positive-class probability) or a 1-D sequence that already
        holds the positive-class probabilities.
    threshold : float, default 0.5
        A row is labelled 1 when its positive-class probability is greater
        than or equal to this value.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one per input row.
    """
    # np.asarray lets lists and other array-likes be indexed like an ndarray.
    proba = np.asarray(probabilities)
    positive_proba = proba if proba.ndim == 1 else proba[:, 1]
    return (positive_proba >= threshold).astype(int)

# Compute "balanced" class weights for the training labels.
# NOTE(review): class_weight / class_weight_dict are not passed to any model in
# this cell; they are kept only in case other cells read them.
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weight))

# Calculate scale_pos_weight for CatBoost: the negative-to-positive ratio of
# the training labels.
num_positive = np.sum(y_train == 1)
num_negative = np.sum(y_train == 0)
scale_pos_weight = num_negative / num_positive

# CatBoost pipeline including preprocessing
pipeline_catboost = Pipeline(steps=[
    ('tfidf', tfidfprocess),
    ('classifier', CatBoostClassifier(random_state=200, scale_pos_weight=scale_pos_weight, verbose=0))
])

# Fit the CatBoost pipeline
pipeline_catboost.fit(X_train, y_train)

# Predict with the standard threshold
y_pred_catboost = pipeline_catboost.predict(X_test)

# Evaluate the CatBoost model
print("\nCatBoost Model (Default Threshold):")
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))

# Threshold adjustment: reuse the preprocessing and classifier that were just
# fitted inside the pipeline instead of re-fitting tfidfprocess and training a
# second, identically configured CatBoost model. This avoids doubling the
# training time and guarantees the thresholded metrics describe the very model
# that is saved later. The module-level names below are kept for other cells.
fitted_preprocessor = pipeline_catboost.named_steps['tfidf']
X_train_transformed = fitted_preprocessor.transform(X_train)
X_test_transformed = fitted_preprocessor.transform(X_test)
catboost_model = pipeline_catboost.named_steps['classifier']

# Predict probabilities and adjust the threshold
# NOTE(review): 0.3 is evaluated directly on the test split; if it was tuned
# there, the metrics below are optimistic - consider a validation split.
threshold = 0.3  # Custom threshold
y_pred_threshold_catboost = adjust_threshold(catboost_model.predict_proba(X_test_transformed), threshold)

# Evaluate the CatBoost model with the adjusted threshold
print("\nCatBoost Model with Threshold Adjustment:")
print("Accuracy:", accuracy_score(y_test, y_pred_threshold_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_threshold_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_threshold_catboost))


CatBoost Model (Default Threshold):
Accuracy: 0.9644915840442703
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.98      0.98      3923
           1       0.82      0.80      0.81       414

    accuracy                           0.96      4337
   macro avg       0.90      0.89      0.90      4337
weighted avg       0.96      0.96      0.96      4337

Confusion Matrix:
 [[3850   73]
 [  81  333]]

CatBoost Model with Threshold Adjustment:
Accuracy: 0.9513488586580585
Classification Report:
               precision    recall  f1-score   support

           0       0.98      0.97      0.97      3923
           1       0.71      0.82      0.76       414

    accuracy                           0.95      4337
   macro avg       0.85      0.89      0.87      4337
weighted avg       0.96      0.95      0.95      4337

Confusion Matrix:
 [[3787  136]
 [  75  339]]


In [32]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) to disk.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
ppv_model_path = 'C:/Users/chand/Downloads/606/ppv.sav'
with open(ppv_model_path, mode='wb') as model_file:
    pickle.dump(pipeline_catboost, model_file)

In [33]:
# Load flu_ML.csv into a DataFrame.
# NOTE(review): absolute, machine-specific path - consider a configurable data directory.
flu_csv_path = r'C:/Users/chand/Downloads/606/flu_ML.csv'
flu_clean = pd.read_csv(flu_csv_path)

In [34]:
# One-hot encode VAX_MANU (vaccine manufacturer column) and drop the original.
# get_dummies(columns=...) appends the VAX_MANU_* indicator columns after the
# remaining columns, which is what the previous merge-on-index + drop produced.
# The guard makes the cell safe to re-run: once VAX_MANU has been replaced by
# its dummies there is nothing left to encode.
if 'VAX_MANU' in flu_clean.columns:
    flu_clean = pd.get_dummies(flu_clean, columns=['VAX_MANU'], prefix='VAX_MANU')

In [35]:
from sklearn.model_selection import train_test_split

# Target is SERIOUS; the features are every other column except SYMPTOM1-5.
y = flu_clean['SERIOUS']
X = flu_clean.drop(
    columns=['SERIOUS', 'SYMPTOM1', 'SYMPTOM2', 'SYMPTOM3', 'SYMPTOM4', 'SYMPTOM5']
)

# 70/30 split, stratified on the target, with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=.3,
    random_state=200,
    stratify=y,
)

print(f'Training examples: {X_train.shape[0]:,}')
print(f'Test examples: {X_test.shape[0]:,}')

Training examples: 40,375
Test examples: 17,304


In [36]:
from sklearn.utils.class_weight import compute_class_weight

# Function for threshold adjustment
def adjust_threshold(probabilities, threshold=0.5):
    """Convert predicted probabilities into hard 0/1 labels with a custom cut-off.

    Parameters
    ----------
    probabilities : array-like
        Either a 2-D array as returned by ``predict_proba`` (column 1 is read
        as the positive-class probability) or a 1-D sequence that already
        holds the positive-class probabilities.
    threshold : float, default 0.5
        A row is labelled 1 when its positive-class probability is greater
        than or equal to this value.

    Returns
    -------
    numpy.ndarray
        Integer array of 0/1 labels, one per input row.
    """
    # np.asarray lets lists and other array-likes be indexed like an ndarray.
    proba = np.asarray(probabilities)
    positive_proba = proba if proba.ndim == 1 else proba[:, 1]
    return (positive_proba >= threshold).astype(int)

# Compute "balanced" class weights for the training labels.
# NOTE(review): class_weight / class_weight_dict are not passed to any model in
# this cell; they are kept only in case other cells read them.
class_weight = compute_class_weight(
    class_weight='balanced',
    classes=np.unique(y_train),
    y=y_train
)
class_weight_dict = dict(zip(np.unique(y_train), class_weight))

# Calculate scale_pos_weight for CatBoost: the negative-to-positive ratio of
# the training labels.
num_positive = np.sum(y_train == 1)
num_negative = np.sum(y_train == 0)
scale_pos_weight = num_negative / num_positive

# CatBoost pipeline including preprocessing
pipeline_catboost = Pipeline(steps=[
    ('tfidf', tfidfprocess),
    ('classifier', CatBoostClassifier(random_state=200, scale_pos_weight=scale_pos_weight, verbose=0))
])

# Fit the CatBoost pipeline
pipeline_catboost.fit(X_train, y_train)

# Predict with the standard threshold
y_pred_catboost = pipeline_catboost.predict(X_test)

# Evaluate the CatBoost model
print("\nCatBoost Model (Default Threshold):")
print("Accuracy:", accuracy_score(y_test, y_pred_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_catboost))

# Threshold adjustment: reuse the preprocessing and classifier that were just
# fitted inside the pipeline instead of re-fitting tfidfprocess and training a
# second, identically configured CatBoost model. This avoids doubling the
# training time and guarantees the thresholded metrics describe the very model
# that is saved later. The module-level names below are kept for other cells.
fitted_preprocessor = pipeline_catboost.named_steps['tfidf']
X_train_transformed = fitted_preprocessor.transform(X_train)
X_test_transformed = fitted_preprocessor.transform(X_test)
catboost_model = pipeline_catboost.named_steps['classifier']

# Predict probabilities and adjust the threshold
# NOTE(review): 0.3 is evaluated directly on the test split; if it was tuned
# there, the metrics below are optimistic - consider a validation split.
threshold = 0.3  # Custom threshold
y_pred_threshold_catboost = adjust_threshold(catboost_model.predict_proba(X_test_transformed), threshold)

# Evaluate the CatBoost model with the adjusted threshold
print("\nCatBoost Model with Threshold Adjustment:")
print("Accuracy:", accuracy_score(y_test, y_pred_threshold_catboost))
print("Classification Report:\n", classification_report(y_test, y_pred_threshold_catboost))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred_threshold_catboost))


CatBoost Model (Default Threshold):
Accuracy: 0.9407651410078595
Classification Report:
               precision    recall  f1-score   support

           0       0.96      0.97      0.96     14137
           1       0.85      0.83      0.84      3167

    accuracy                           0.94     17304
   macro avg       0.90      0.90      0.90     17304
weighted avg       0.94      0.94      0.94     17304

Confusion Matrix:
 [[13658   479]
 [  546  2621]]

CatBoost Model with Threshold Adjustment:
Accuracy: 0.9006588072122053
Classification Report:
               precision    recall  f1-score   support

           0       0.97      0.90      0.94     14137
           1       0.67      0.89      0.77      3167

    accuracy                           0.90     17304
   macro avg       0.82      0.90      0.85     17304
weighted avg       0.92      0.90      0.91     17304

Confusion Matrix:
 [[12760  1377]
 [  342  2825]]


In [38]:
import pickle

# Persist the fitted pipeline (preprocessing + classifier) to disk.
# NOTE(review): absolute, machine-specific path - consider a configurable output directory.
flu_model_path = 'C:/Users/chand/Downloads/606/flu.sav'
with open(flu_model_path, mode='wb') as model_file:
    pickle.dump(pipeline_catboost, model_file)

In [42]:
# Pair the true test labels with the default-threshold predictions and export them.
# NOTE(review): y_test and y_pred_catboost hold whatever the most recently run
# training cell produced (the flu dataset when the notebook runs top to
# bottom), yet the file name does not say which dataset it belongs to.
# index=False also drops the row index, so rows cannot be joined back to the
# source records.
results_columns = {
    'True Labels': y_test,
    'Predictions': y_pred_catboost,
}
results_df = pd.DataFrame(results_columns)

predictions_csv_path = 'C:/Users/chand/Downloads/606/predictions.csv'
results_df.to_csv(predictions_csv_path, index=False)