In [1]:
import pandas as pd
from sklearn.preprocessing import StandardScaler
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.impute import SimpleImputer

In [2]:
df = pd.read_csv("/content/Typhoid_dataset.csv")

In [3]:
df = df.drop(columns=['Age', 'Gastrointestinal Symptoms','Neurological Symptoms', 'Complications', 'Ongoing Infection in Society', 'Gender','Widal Test'])

In [4]:
df.to_csv('pp_Typhoid_data.csv', index=False)
from google.colab import files
files.download('pp_Typhoid_data.csv')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [5]:
X = df.drop('Typhoid Status', axis=1)
y = df['Typhoid Status']

In [6]:
le = LabelEncoder()
y_encoded = le.fit_transform(y)


In [7]:
categorical_cols = ['Previous History of Typhoid','Typhoid Vaccination Status','Blood Culture Result',
                   'Skin Manifestations','Sanitation Facilities','Hand Hygiene','Consumption of Street Food',
                   'Location', 'Socioeconomic Status', 'Water Source Type', 'Typhidot Test','Weather Condition']
numerical_cols = ['Fever Duration (Days)', 'White Blood Cell Count', 'Platelet Count']



In [8]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', Pipeline([
            ('imputer', SimpleImputer(strategy='mean')),
            ('scaler', StandardScaler())
        ]), numerical_cols),

        ('cat', Pipeline([
            ('imputer', SimpleImputer(strategy='most_frequent')),
            ('encoder', OneHotEncoder(handle_unknown='ignore'))
        ]), categorical_cols)
    ]
)

In [9]:
pipeline= Pipeline([
    ('preprocessing', preprocessor),
    ('classifier', LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter=1000))
])

In [None]:
joblib.dump(pipeline, 'typhoid_model.pkl')

In [10]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 31087 entries, 0 to 31086
Data columns (total 16 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   Location                     31087 non-null  object
 1   Socioeconomic Status         31087 non-null  object
 2   Water Source Type            31087 non-null  object
 3   Sanitation Facilities        31087 non-null  object
 4   Hand Hygiene                 31087 non-null  object
 5   Consumption of Street Food   31087 non-null  object
 6   Fever Duration (Days)        31087 non-null  int64 
 7   Skin Manifestations          31087 non-null  object
 8   White Blood Cell Count       31087 non-null  int64 
 9   Platelet Count               31087 non-null  int64 
 10  Blood Culture Result         31087 non-null  object
 11  Typhidot Test                31087 non-null  object
 12  Typhoid Vaccination Status   31087 non-null  object
 13  Previous History of Typhoid  31

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y_encoded, test_size=0.2, random_state=42)


pipeline.fit(X_train, y_train)


y_pred = pipeline.predict(X_test)



In [12]:
print("Accuracy:", accuracy_score(y_test, y_pred))
print("Confusion Matrix:\n", confusion_matrix(y_test, y_pred))
print("Classification Report:\n", classification_report(y_test, y_pred))

Accuracy: 0.9197491154712126
Confusion Matrix:
 [[1085    0    0    0]
 [   0  253    0    0]
 [   0    0 4381    0]
 [ 499    0    0    0]]
Classification Report:
               precision    recall  f1-score   support

           0       0.68      1.00      0.81      1085
           1       1.00      1.00      1.00       253
           2       1.00      1.00      1.00      4381
           3       0.00      0.00      0.00       499

    accuracy                           0.92      6218
   macro avg       0.67      0.75      0.70      6218
weighted avg       0.86      0.92      0.89      6218



  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
