<a href="https://colab.research.google.com/github/Anjali-DA/Symposium-source-code/blob/main/hai_prediction_model.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [21]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
import string
# Standardization 
from sklearn.preprocessing import StandardScaler
import nltk
# remove stopwords
from nltk.corpus import stopwords
# lemmatization & pos tagging
from nltk import pos_tag
from nltk.corpus import wordnet
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

from sklearn.feature_extraction.text import TfidfVectorizer

from datetime import datetime


In [22]:
df= pd.read_csv('/content/drive/MyDrive/Colab Notebooks/dataset/HAI_dataset.csv')

In [23]:
df.dropna()
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Data columns (total 13 columns):
 #   Column                Non-Null Count  Dtype 
---  ------                --------------  ----- 
 0   Patient ID            199 non-null    int64 
 1   Age                   199 non-null    int64 
 2   Sex                   199 non-null    object
 3   Medical History       199 non-null    object
 4   Hospitalization Data  199 non-null    object
 5   Laboratory Data       199 non-null    object
 6   Imaging Data          199 non-null    object
 7   Microbiology Data     199 non-null    object
 8   Risk Factors          199 non-null    object
 9   Symptoms              199 non-null    object
 10  Signs                 198 non-null    object
 11  Treatment             198 non-null    object
 12  HAI Name              198 non-null    object
dtypes: int64(2), object(11)
memory usage: 20.3+ KB


#finding Y

In [24]:
df['HAI Name'].unique()
df['HAI Name']= df['HAI Name'].replace(['Migraine', 'Allergic Rhinitis',
       'NAFLD (Non-Alcoholic Fatty Liver Disease)', 'Type 2 Diabetes',
       'Osteoporotic Fracture', 'Hypertensive Heart Disease', 'Asthma',
       'Cardiovascular Disease', 'Hypothyroidism',
       'Breast Cancer', 'Allergic Dermatitis', 'Hypertensive Diabetes',
       'Diabetic Hypertension', 'Fracture', 'Hypertensive Kidney Disease',
     'Diabetic Infection', 'Bone Infection',
       'Cardiovascular Infection', 'Hypertensive Infection','Myocardial Infarction', 'Osteoarthritis', 'COPD', 'Hypertension',
       'Fatty Liver Disease', 'Coronary Artery Disease',
       'Asthma exacerbation','Hypertension, Non-alcoholic fatty liver disease', 'Heart failure',
       'Hypertensive cerebrovascular disease'],'other')

df['HAI Name']= df['HAI Name'].map({'other':0,'Pneumonia':1,'Influenza':2,'Unspecified Infection':3,'COVID-19':4,'Respiratory Infection':5})
df['HAI Name'] = df['HAI Name'].replace('None', -1)
df['HAI Name'] = df['HAI Name'].fillna(-1)
df['HAI Name'].dropna()
print(len(df['HAI Name']))

199


#data preprocessing

In [25]:
class Processing:
    def __init__(self, df):
        df1 = df.drop(['Patient ID'], axis=1)
        df2 = df1
        for index, row in df2.iterrows():
            try:
                start_date_str, end_date_str = row['Hospitalization Data'].split(' to ')
                start_date = datetime.strptime(start_date_str, '%Y-%m-%d')
                end_date = datetime.strptime(end_date_str, '%Y-%m-%d')
                days = (end_date - start_date).days
                df2.at[index, 'Hospitalization Data'] = days
            except ValueError:
                print(f"Invalid date range at index {index}: {row['Hospitalization Data']}")
        
        df2['Sex'] = df2['Sex'].replace(['Female'], 'F')
        df2['Sex'] = df2['Sex'].replace(['Male'], 'M')
        df2['Sex'] = df2['Sex'].map({'F': 1, 'M': 0}).astype(int)
        df2 = df2.replace('None', -1)
    
        nc = df2.select_dtypes(include=['int', 'float']).columns
        tc = df2.select_dtypes(include='object').columns
        nd = df2[nc]
        td = df2[tc]
        scaler = StandardScaler()
        std_nd = scaler.fit_transform(nd)
        std_nd = pd.DataFrame(std_nd, columns=nd.columns)
        
        # Text preprocessing
        # 1. Lowercase
        # for column in td.columns:
        #     if df2[column].dtype in [np.int64, np.float64]:
        #         continue
        #     else:
        #         td[column] = td[column].str.lower()
        
        # 2. Remove punctuation
        for column in td.columns:
            td[column] = td[column].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
        
        # 3. Remove stopwords
        for column in td.columns:
            stop_words = set(stopwords.words('english'))
            td[column] = td[column].apply(lambda x: ' '.join([word for word in str(x).split() if word.lower() not in stop_words]))
        
        # 4. Lemmatizer
        lemmatizer = WordNetLemmatizer()
        
        def lemmatize_word(word, pos_tag):
            if pos_tag.startswith('J'):
                pos = wordnet.ADJ
            elif pos_tag.startswith('V'):
                pos = wordnet.VERB
            elif pos_tag.startswith('N'):
                pos = wordnet.NOUN
            elif pos_tag.startswith('R'):
                pos = wordnet.ADV
            else:
                pos = wordnet.NOUN
            return lemmatizer.lemmatize(word, pos)

        def lemmatize_column(column):
            if column == 'NaN':
                return column
            elif isinstance(column, str):
                tokens = word_tokenize(column)
                pos_tags = nltk.pos_tag(tokens)
                lemmatized_words = [lemmatize_word(word, pos_tag) for word, pos_tag in pos_tags]
                return ' '.join(lemmatized_words)
            else:
                return column
        
        for column in td.columns:
            td.loc[:, column] = td[column].apply(lemmatize_column)
        
        # 5. Vectorizer
        text_columns = ['Medical History', 'Laboratory Data', 'Imaging Data', 'Microbiology Data',
                        'Risk Factors', 'Symptoms', 'Signs', 'Treatment']

        td['Combined Text'] = td[text_columns].apply(lambda x: ' '.join(x.dropna().astype(str)), axis=1)

        vectorizer = TfidfVectorizer()
        vectorized_data = vectorizer.fit_transform(td['Combined Text'])

        self.vectorized_df = pd.DataFrame(vectorized_data.toarray(), columns=vectorizer.get_feature_names_out())
        self.vectorized_text_data = pd.concat([td, self.vectorized_df], axis=1)
        
        # 6. Combine data
        self.combined_data = pd.concat([nd, self.vectorized_df], axis=1)
        self.combined_data = self.combined_data.fillna(-1)
        # self.combined_data = self.combined_data.dropna()
    
    def process_data(self):
        return self.combined_data


In [26]:
# nltk.download('stopwords')
# nltk.download('punkt')
# nltk.download('averaged_perceptron_tagger')
# nltk.download('wordnet')

In [27]:
processing = Processing(df)
combined_data = processing.process_data()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  td[column] = td[column].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  td[column] = td[column].apply(lambda x: ' '.join([word for word in str(x).split() if word.lower() not in stop_words]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [28]:
combined_data.head()
# combined_data.info()

Unnamed: 0,Age,Sex,Hospitalization Data,HAI Name,abdomen,abdominal,abnormal,abnormality,alcohol,allergens,...,urine,visual,wall,wbc,weak,weakness,weight,wheezing,white,xray
0,45,0,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163416
1,32,1,8,2.0,0.0,0.0,0.167199,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60,0,7,0.0,0.0,0.0,0.0,0.469022,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176446
3,28,1,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.349131,0.0
4,50,0,8,0.0,0.0,0.272157,0.0,0.0,0.314017,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


#model training

In [29]:
from sklearn.model_selection import train_test_split
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import GridSearchCV

In [30]:
combined_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Columns: 226 entries, Age to xray
dtypes: float64(223), int64(3)
memory usage: 351.5 KB


In [31]:
combined_data.head()

Unnamed: 0,Age,Sex,Hospitalization Data,HAI Name,abdomen,abdominal,abnormal,abnormality,alcohol,allergens,...,urine,visual,wall,wbc,weak,weakness,weight,wheezing,white,xray
0,45,0,7,1.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.163416
1,32,1,8,2.0,0.0,0.0,0.167199,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,60,0,7,0.0,0.0,0.0,0.0,0.469022,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.176446
3,28,1,6,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.349131,0.0
4,50,0,8,0.0,0.0,0.272157,0.0,0.0,0.314017,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [32]:
df['HAI Name'].info()

<class 'pandas.core.series.Series'>
RangeIndex: 199 entries, 0 to 198
Series name: HAI Name
Non-Null Count  Dtype  
--------------  -----  
199 non-null    float64
dtypes: float64(1)
memory usage: 1.7 KB


In [33]:
X = combined_data.drop('HAI Name', axis=1)

y = combined_data['HAI Name']
X.info()
print(len(X))
print(len(y))
# len(X) == len(y)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 199 entries, 0 to 198
Columns: 225 entries, Age to xray
dtypes: float64(222), int64(3)
memory usage: 349.9 KB
199
199


In [34]:
# # svm
svm = SVC()
svm.fit(X_train, y_train)

y_pred = svm.predict(X_test)
svm_accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy:", svm_accuracy, "%")

svm_score = cross_val_score(svm, X_train, y_train, cv=6)
print('svm cross validation score',round(svm_score.mean()*100,2).astype(str),'%' )



Accuracy: 32.5 %
svm cross validation score 35.21 %




In [35]:
# Random Forest Classifer
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train, y_train)

y_pred = rf.predict(X_test)
rf_accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy:", rf_accuracy, "%")

rf_score = cross_val_score(rf, X_train, y_train, cv=6)
print('rfc cross validation score',round(rf_score.mean()*100,2).astype(str),'%' )

Accuracy: 87.5 %




rfc cross validation score 90.69 %


In [36]:
#  Naive bayes
nb = GaussianNB()
nb.fit(X_train, y_train)

y_pred = nb.predict(X_test)
nb_accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy:", nb_accuracy, "%")

nb_score= cross_val_score(nb,X_train,y_train,cv=6)
print('naive bayes cross validation score',round(nb_score.mean()*100,2).astype(str),'%' )

Accuracy: 90.0 %
naive bayes cross validation score 85.02 %




In [37]:
# GridSearchCV
svc_params = {'C': [0.5, 0.7, 0.9, 1], 'kernel': ['rbf', 'poly', 'sigmoid', 'linear']}
grid_svc = GridSearchCV(SVC(), svc_params)
grid_svc.fit(X_train, y_train)
svc = grid_svc.best_estimator_

y_pred = grid_svc.predict(X_test)
svc_accuracy = accuracy_score(y_test, y_pred)*100
print("Accuracy:", nb_accuracy, "%")

svc_score= cross_val_score(svc,X_train,y_train,cv=6)
print('svc cross validation score',round(svc_score.mean()*100,2).astype(str),'%' )



Accuracy: 90.0 %




svc cross validation score 87.54 %


#output predicting

In [38]:
data_list = [1, 45, 'M', 'White', 'Non-Hispanic', 'Asthma, Hypertension', '2023-01-05 to 2023-01-12', 'Blood Test: Normal, X-ray: Clear', 'MRI: No abnormalities', 'NonNasal swab: Positive for influenzae', 'Obesity, Smoking', 'Cough, Fever', 'Elevated heart rate', 'Antibiotics', 'Recovered','None']
columns = ['Patient ID','Age', 'Sex', 'Race', 'Ethnicity', 'Medical History', 'Hospitalization Data', 'Laboratory Data', 'Imaging Data', 'Microbiology Data', 'Risk Factors', 'Symptoms', 'Signs', 'Treatment', 'Outcomes','Unnamed: 16']
df_new = pd.DataFrame([data_list], columns=columns)
test_df = Processing(df_new)
X_data = test_df.process_data()
X_data.head()

# Step 1: Get feature names used during training
feature_names_training = X_train.columns.tolist()

# Step 2: Align feature names in test data
feature_names_test = X_data.columns.tolist()

# Step 3: Identify unseen features in test data
unseen_features = set(feature_names_test) - set(feature_names_training)

# Step 4: Drop unseen features from test data
X_data = X_data.drop(unseen_features, axis=1)

# Step 5: Reorder features in test data to match the order in training data
X_data = X_data.reindex(columns=feature_names_training, fill_value=0)

# # Step 6: Make predictions using the Random Forest classifier
# predict = rf.predict(X_data)
# print(predict)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  td[column] = td[column].apply(lambda x: x.translate(str.maketrans('', '', string.punctuation)) if isinstance(x, str) else x)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  td[column] = td[column].apply(lambda x: ' '.join([word for word in str(x).split() if word.lower() not in stop_words]))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing

In [39]:
# predict = grid_svc.predict(X_data)
predict = rf.predict(X_data)
print(predict)

[3.]


In [40]:
# Define a dictionary to map the numeric labels to their corresponding strings
label_mapping = {-1: 'None', 0: 'Other', 1: 'Pneumonia', 2: 'Influenza', 3: 'Unspecified Infection', 4: 'COVID-19', 5: 'Respiratory Infection'}

# Map the predicted values to their corresponding strings
predict_strings = [label_mapping[label] for label in predict]

# Print the predicted strings
for prediction in predict_strings:
    print(prediction)

Unspecified Infection


#saving the model

In [41]:
# !pip install joblib

In [42]:
import pickle

filename = 'hai_pred_model.sav'
pickle.dump(rf, open(filename, 'wb'))



In [43]:
# loading the model
model = pickle.load(open('hai_pred_model.sav', 'rb'))

predict = model.predict(X_data)
print(predict)

[3.]
