In [6]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [7]:
raw_train = pd.read_csv("train.csv").reset_index(drop=True)

In [8]:
raw_train.head()

Unnamed: 0,id,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lyme_disease
1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tungiasis
2,2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,Lyme_disease
3,3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zika
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,Rift_Valley_fever


In [9]:
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 66 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     707 non-null    int64  
 1   sudden_fever           707 non-null    float64
 2   headache               707 non-null    float64
 3   mouth_bleed            707 non-null    float64
 4   nose_bleed             707 non-null    float64
 5   muscle_pain            707 non-null    float64
 6   joint_pain             707 non-null    float64
 7   vomiting               707 non-null    float64
 8   rash                   707 non-null    float64
 9   diarrhea               707 non-null    float64
 10  hypotension            707 non-null    float64
 11  pleural_effusion       707 non-null    float64
 12  ascites                707 non-null    float64
 13  gastro_bleeding        707 non-null    float64
 14  swelling               707 non-null    float64
 15  nausea

In [10]:
raw_train['prognosis'].unique()

array(['Lyme_disease', 'Tungiasis', 'Zika', 'Rift_Valley_fever',
       'West_Nile_fever', 'Malaria', 'Chikungunya', 'Plague', 'Dengue',
       'Yellow_Fever', 'Japanese_encephalitis'], dtype=object)

In [11]:
raw_train['prognosis'].value_counts()

West_Nile_fever          85
Japanese_encephalitis    81
Tungiasis                70
Rift_Valley_fever        70
Chikungunya              66
Dengue                   63
Yellow_Fever             61
Zika                     58
Plague                   53
Lyme_disease             52
Malaria                  48
Name: prognosis, dtype: int64

In [12]:
raw_test = pd.read_csv('test.csv')

In [13]:
raw_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 65 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     303 non-null    int64  
 1   sudden_fever           303 non-null    float64
 2   headache               303 non-null    float64
 3   mouth_bleed            303 non-null    float64
 4   nose_bleed             303 non-null    float64
 5   muscle_pain            303 non-null    float64
 6   joint_pain             303 non-null    float64
 7   vomiting               303 non-null    float64
 8   rash                   303 non-null    float64
 9   diarrhea               303 non-null    float64
 10  hypotension            303 non-null    float64
 11  pleural_effusion       303 non-null    float64
 12  ascites                303 non-null    float64
 13  gastro_bleeding        303 non-null    float64
 14  swelling               303 non-null    float64
 15  nausea

## Pre-processing

In [163]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

encoder = preprocessing.LabelEncoder().fit(raw_train['prognosis'])

In [164]:
labels = encoder.transform(raw_train['prognosis'])

In [165]:
features = raw_train.loc[:, ~raw_train.columns.isin(['id', 'prognosis'])]

## Feature selection

In [166]:
from sklearn.feature_selection import SelectKBest ,chi2, mutual_info_classif

In [167]:
k_best = SelectKBest(chi2, k=20)

In [168]:
k_best_fit = k_best.fit(features, labels)

In [169]:
best_features = k_best_fit.get_feature_names_out()
best_features

array(['weakness', 'jaundice', 'inflammation', 'loss_of_appetite',
       'urination_loss', 'slow_heart_rate', 'abdominal_pain',
       'light_sensitivity', 'yellow_skin', 'yellow_eyes', 'microcephaly',
       'bitter_tongue', 'convulsion', 'prostraction',
       'breathing_restriction', 'finger_inflammation', 'lips_irritation',
       'itchiness', 'ulcers', 'toenail_loss'], dtype=object)

In [170]:
train_X = pd.DataFrame(features, columns=best_features)

In [171]:
X_train, X_val, y_train, y_val = train_test_split(train_X, encoded_labels, test_size=0.33, random_state=42, stratify=encoded_labels)

## Logistic Regression

In [208]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import accuracy_score

In [209]:
lr = LogisticRegression(max_iter=500)

In [210]:
lr.fit(X_train, y_train)

In [211]:
preds = lr.predict(X_val)
score = accuracy_score(preds, y_val)
print("accuracy", score)

accuracy 0.2905982905982906


In [212]:
parameters = { 'penalty' : ['l1','l2'],
              'solver': ['saga', 'newton-cg', 'sag', 'saga', 'lbfgs'], 
              'multi_class': ['ovr','multinomial'], 'max_iter': [1000, 1500, 2000, 2500] }

In [213]:
clf = RandomizedSearchCV(lr,
                         param_distributions=parameters,
                         n_iter=50,
                         scoring='accuracy',
                         n_jobs=4,
                         verbose=4,
                        random_state=42,
                         cv=10)

In [214]:
clf.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


120 fits failed out of a total of 500.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "/Users/adityapimpalkar/miniconda3/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "/Users/adityapimpalkar/miniconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 1162, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "/Users/adityapimpalkar/miniconda3/lib/python3.10/site-packages/sklearn/linear_model/_logistic.py", line 54, in _check_solver
    raise ValueError(
ValueError: Solver sag supports only 'l2' or 'non

In [215]:
print("Tuned Hyperparameters :", clf.best_params_)

Tuned Hyperparameters : {'solver': 'saga', 'penalty': 'l2', 'multi_class': 'multinomial', 'max_iter': 1000}


In [242]:
lr = LogisticRegression(max_iter=1000, multi_class='multinomial', penalty='l2', solver= 'saga')

In [243]:
lr.fit(X_train, y_train)
lr_preds = lr.predict(X_val)
lr_score = accuracy_score(lr_preds, y_val)
print("Logistic accuracy", lr_score)

Logistic accuracy 0.2905982905982906


## DecisionTreeClassifier

In [220]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score

In [221]:
dsc_clf = DecisionTreeClassifier(random_state=32)

In [222]:
dsc_clf.fit(X_train, y_train)

In [223]:
dsc_preds = dsc_clf.predict(X_val)

In [224]:
dsc_preds = accuracy_score(dsc_preds, y_val)
print("accuracy", dsc_preds)

accuracy 0.2692307692307692


In [225]:
desc_parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': [10,20,30],
    'max_depth': [2,4,6,8,10,12]
}

In [226]:
dsc_clf = DecisionTreeClassifier(random_state=32)

In [227]:
clf = RandomizedSearchCV(dsc_clf,
                         param_distributions=desc_parameters,
                         n_iter=50,
                         scoring='accuracy',
                         n_jobs=4,
                         verbose=4,
                        random_state=42,
                         cv=10)

In [228]:
clf.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [229]:
print("Tuned Hyperparameters :", clf.best_params_)

Tuned Hyperparameters : {'max_features': 10, 'max_depth': 6, 'criterion': 'entropy'}


In [230]:
dsc_clf = DecisionTreeClassifier(random_state=32, criterion='entropy', max_depth=6, max_features=10)

In [231]:
dsc_clf.fit(X_train, y_train)
dsc_clf_preds = dsc_clf.predict(X_val)
dsc_clf_score = accuracy_score(dsc_clf_preds, y_val)
print("Decision tree accuracy", dsc_clf_score)

Decision tree accuracy 0.2564102564102564


## Random Forest Classifier

In [232]:
from sklearn.ensemble import RandomForestClassifier

In [233]:
rfc_clf = RandomForestClassifier(random_state=32)

In [234]:
rfc_clf.fit(X_train, y_train)

In [235]:
rfc_preds = rfc_clf.predict(X_val)

In [236]:
rfc_score = accuracy_score(rfc_preds, y_val)
print("accuracy", rfc_score)

accuracy 0.29914529914529914


In [237]:
rfc_parameters = {
    'criterion': ['gini', 'entropy', 'log_loss'],
    'max_features': [10,20,30],
    'max_depth': [2,4,6,8,10,12]
}

In [238]:
rfc_clf = RandomForestClassifier(random_state=32)

In [239]:
clf = RandomizedSearchCV(rfc_clf,
                         param_distributions=rfc_parameters,
                         n_iter=50,
                         scoring='accuracy',
                         n_jobs=4,
                         verbose=4,
                        random_state=42,
                         cv=10)

In [240]:
clf.fit(X_train, y_train)

Fitting 10 folds for each of 50 candidates, totalling 500 fits


In [241]:
print("Tuned Hyperparameters :", clf.best_params_)

Tuned Hyperparameters : {'max_features': 10, 'max_depth': 8, 'criterion': 'gini'}


In [244]:
rfc_clf = RandomForestClassifier(random_state=32, criterion='gini', max_depth=8, max_features=10)

In [245]:
rfc_clf.fit(X_train, y_train)
rfc_preds = rfc_clf.predict(X_val)
rfc_score = accuracy_score(rfc_preds, y_val)
print("test accuracy", rfc_score)

test accuracy 0.2905982905982906


## XGBoost

In [246]:

import xgboost
classifier = xgboost.XGBClassifier()

In [247]:
params = {
    'n_estimators': [100, 500, 900, 1100, 1500],
    'booster':['gbtree','gblinear'],
 'learning_rate' : [0.05,0.10,0.15,0.20,0.25,0.30],
 'max_depth' : [ 3, 4, 5, 6, 8, 10, 12, 15],
 'min_child_weight' : [ 1, 3, 5, 7 ],
 'gamma': [ 0.0, 0.1, 0.2 , 0.3, 0.4 ],
 'colsample_bytree' : [ 0.3, 0.4, 0.5 , 0.7 ]
}

In [248]:
rs_model=RandomizedSearchCV(classifier,
                            param_distributions=params,
                            n_iter=50,
                            scoring='accuracy',
                            n_jobs=4,
                            cv=5,
                            return_train_score=True,
                            verbose=4,
                            random_state=42)

In [249]:
rs_model.fit(X_train, y_train)

Fitting 5 folds for each of 50 candidates, totalling 250 fits


In [250]:
rs_model.best_estimator_

In [252]:
rs_model.best_params_

{'n_estimators': 100,
 'min_child_weight': 7,
 'max_depth': 10,
 'learning_rate': 0.15,
 'gamma': 0.1,
 'colsample_bytree': 0.7,
 'booster': 'gbtree'}

In [261]:
classifier1 = xgboost.XGBClassifier(n_estimators=100,
                                   min_child_weight=7, 
                                   max_depth=10, 
                                   learning_rate=0.15,
                                   gamma=0.1,
                                   colsample_bytree=0.7,
                                   booster='gbtree',
                                   tree_method='exact',
                                   random_state=42)

In [262]:
classifier1.fit(X_train, y_train)

In [263]:
xgb_preds = classifier1.predict(X_val)
xgb_score = accuracy_score(xgb_preds, y_val)
print("xgb_score", xgb_score)

xgb_score 0.2905982905982906


## submissions

In [267]:
test_df = pd.DataFrame(raw_test, columns=best_features)

In [268]:
test_preds = classifier1.predict(test_df)

array(['Rift_Valley_fever', 'Chikungunya', 'Japanese_encephalitis',
       'Japanese_encephalitis', 'Japanese_encephalitis', 'Yellow_Fever',
       'Plague', 'Chikungunya', 'Yellow_Fever', 'Lyme_disease',
       'Yellow_Fever', 'Japanese_encephalitis', 'Dengue', 'Chikungunya',
       'West_Nile_fever', 'Yellow_Fever', 'Plague', 'Tungiasis',
       'Chikungunya', 'Zika', 'Malaria', 'Zika', 'Japanese_encephalitis',
       'Lyme_disease', 'Tungiasis', 'Chikungunya', 'West_Nile_fever',
       'Malaria', 'Japanese_encephalitis', 'West_Nile_fever', 'Malaria',
       'Tungiasis', 'Lyme_disease', 'Plague', 'Lyme_disease',
       'West_Nile_fever', 'Zika', 'Zika', 'Japanese_encephalitis',
       'West_Nile_fever', 'West_Nile_fever', 'Chikungunya', 'Zika',
       'Yellow_Fever', 'Chikungunya', 'Rift_Valley_fever', 'Chikungunya',
       'Plague', 'Yellow_Fever', 'Tungiasis', 'Tungiasis',
       'Rift_Valley_fever', 'Lyme_disease', 'Zika', 'Rift_Valley_fever',
       'Lyme_disease', 'Chikungunya',

In [271]:
submit_df = pd.DataFrame({ 'id': raw_test['id'], 'prognosis': encoder.inverse_transform(test_preds) })

In [273]:
submit_df.to_csv('submission_fs1.csv')

In [274]:
submit_df

Unnamed: 0,id,prognosis
0,707,Rift_Valley_fever
1,708,Chikungunya
2,709,Japanese_encephalitis
3,710,Japanese_encephalitis
4,711,Japanese_encephalitis
...,...,...
298,1005,Zika
299,1006,Lyme_disease
300,1007,Malaria
301,1008,Rift_Valley_fever
