In [22]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [62]:
# Load the labelled training set. read_csv already assigns a fresh 0..n-1
# RangeIndex when no index column is requested, so no reset_index is needed.
raw_train = pd.read_csv("train.csv")

In [63]:
# Preview the first rows to sanity-check how the CSV was parsed.
raw_train.head()

Unnamed: 0,id,sudden_fever,headache,mouth_bleed,nose_bleed,muscle_pain,joint_pain,vomiting,rash,diarrhea,...,breathing_restriction,toe_inflammation,finger_inflammation,lips_irritation,itchiness,ulcers,toenail_loss,speech_problem,bullseye_rash,prognosis
0,0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Lyme_disease
1,1,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Tungiasis
2,2,0.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,1.0,...,1.0,1.0,1.0,1.0,1.0,0.0,1.0,1.0,1.0,Lyme_disease
3,3,0.0,0.0,1.0,1.0,1.0,1.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,Zika
4,4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,...,0.0,1.0,0.0,0.0,1.0,1.0,1.0,0.0,0.0,Rift_Valley_fever


In [64]:
# Summarise column dtypes and non-null counts of the training frame.
raw_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 707 entries, 0 to 706
Data columns (total 66 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     707 non-null    int64  
 1   sudden_fever           707 non-null    float64
 2   headache               707 non-null    float64
 3   mouth_bleed            707 non-null    float64
 4   nose_bleed             707 non-null    float64
 5   muscle_pain            707 non-null    float64
 6   joint_pain             707 non-null    float64
 7   vomiting               707 non-null    float64
 8   rash                   707 non-null    float64
 9   diarrhea               707 non-null    float64
 10  hypotension            707 non-null    float64
 11  pleural_effusion       707 non-null    float64
 12  ascites                707 non-null    float64
 13  gastro_bleeding        707 non-null    float64
 14  swelling               707 non-null    float64
 15  nausea

In [65]:
# List the distinct target classes present in the training labels.
raw_train['prognosis'].unique()

array(['Lyme_disease', 'Tungiasis', 'Zika', 'Rift_Valley_fever',
       'West_Nile_fever', 'Malaria', 'Chikungunya', 'Plague', 'Dengue',
       'Yellow_Fever', 'Japanese_encephalitis'], dtype=object)

In [66]:
# Count rows per class to check how balanced the target is.
raw_train['prognosis'].value_counts()

West_Nile_fever          85
Japanese_encephalitis    81
Tungiasis                70
Rift_Valley_fever        70
Chikungunya              66
Dengue                   63
Yellow_Fever             61
Zika                     58
Plague                   53
Lyme_disease             52
Malaria                  48
Name: prognosis, dtype: int64

In [67]:
# Load the test set from the working directory.
raw_test = pd.read_csv("test.csv")

In [68]:
# Summarise column dtypes and non-null counts of the test frame.
raw_test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 303 entries, 0 to 302
Data columns (total 65 columns):
 #   Column                 Non-Null Count  Dtype  
---  ------                 --------------  -----  
 0   id                     303 non-null    int64  
 1   sudden_fever           303 non-null    float64
 2   headache               303 non-null    float64
 3   mouth_bleed            303 non-null    float64
 4   nose_bleed             303 non-null    float64
 5   muscle_pain            303 non-null    float64
 6   joint_pain             303 non-null    float64
 7   vomiting               303 non-null    float64
 8   rash                   303 non-null    float64
 9   diarrhea               303 non-null    float64
 10  hypotension            303 non-null    float64
 11  pleural_effusion       303 non-null    float64
 12  ascites                303 non-null    float64
 13  gastro_bleeding        303 non-null    float64
 14  swelling               303 non-null    float64
 15  nausea

## Pre-processing

In [69]:
from sklearn import preprocessing
from sklearn.model_selection import train_test_split

# Build the label encoder, then fit it on the prognosis column so each class
# name is mapped to an integer id. fit() returns the encoder itself.
encoder = preprocessing.LabelEncoder()
encoder = encoder.fit(raw_train['prognosis'])

In [70]:
# Integer-encode the training labels with the fitted encoder.
# NOTE(review): the visible cells build train_y from the raw 'prognosis'
# strings and never reference encoded_labels — confirm whether it is still needed.
encoded_labels = encoder.transform(raw_train['prognosis'])

In [71]:
# Sanity check: there should be one encoded label per training row.
len(encoded_labels)

707

In [75]:
# Features: every column except the row identifier and the target.
# errors='ignore' mirrors the original mask, which tolerated absent columns.
train_X = raw_train.drop(columns=['id', 'prognosis'], errors='ignore')
# Target: kept as a single-column DataFrame (not a Series), as before.
train_y = raw_train.filter(items=['prognosis'])

In [81]:
# Hold out 33% of the rows for validation; the fixed seed keeps the split
# reproducible across runs.
X_train, X_val, y_train, y_val = train_test_split(
    train_X,
    train_y,
    test_size=0.33,
    random_state=42,
)

## Model building

In [82]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV

In [83]:
# Base estimator for the grid search. The 'sag' and 'saga' solvers shuffle the
# data, so a fixed random_state is needed for reproducible fits (42 matches the
# seed used for the train/validation split).
lr = LogisticRegression(random_state=42)

# Hyper-parameter grid, given as a list of sub-grids so that every candidate is
# a valid combination:
#   * 'newton-cg', 'sag' and 'lbfgs' support only L2 (or no) regularisation;
#   * 'saga' additionally supports L1.
# The estimator parameter is named 'max_iter' (not 'max_iters'); several
# budgets are searched because 'sag'/'saga' may not converge in the default 100.
# 'multi_class' is intentionally omitted: with more than two classes and a
# non-liblinear solver the default already fits a multinomial model, and the
# parameter is deprecated in recent scikit-learn releases.
parameters = [
    {
        'penalty': ['l2'],
        'solver': ['newton-cg', 'sag', 'saga', 'lbfgs'],
        'max_iter': [100, 500, 1000],
    },
    {
        'penalty': ['l1'],
        'solver': ['saga'],
        'max_iter': [100, 500, 1000],
    },
]

In [7]:
# Wrap the estimator in an exhaustive grid search with default CV and scoring.
# This only configures the search; no fit call is visible in this part of the
# notebook.
clf = GridSearchCV(estimator=lr, param_grid=parameters)

array(['Chikungunya', 'Dengue', 'Japanese_encephalitis', 'Lyme_disease',
       'Malaria', 'Plague', 'Rift_Valley_fever', 'Tungiasis',
       'West_Nile_fever', 'Yellow_Fever', 'Zika'], dtype=object)

id              249571
muscle_pain      366.0
sudden_fever     356.0
nose_bleed       345.0
rash             345.0
mouth_bleed      325.0
headache         318.0
joint_pain       318.0
vomiting         312.0
diarrhea         276.0
dtype: object