# Feature Engineering On Dengue Dataset

### Importing important libraries

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

### Load data into dataframe

In [4]:
# Location of the raw dengue CSV, relative to the notebook's working directory.
raw_dengue_csv = '../data/raw_data/dengue.csv'

# Read the file into the working DataFrame and preview the first rows.
df = pd.read_csv(raw_dengue_csv)
df.head()

Unnamed: 0,dengue.p_i_d,dengue.date_of_fever,dengue.residence,dengue.days,dengue.current_temp,dengue.wbc,dengue.servere_headche,dengue.pain_behind_the_eyes,dengue.joint_muscle_aches,dengue.metallic_taste_in_the_mouth,dengue.appetite_loss,dengue.addominal_pain,dengue.nausea_vomiting,dengue.diarrhoea,dengue.hemoglobin,dengue._hematocri,dengue.platelet,dengue.dengue
0,P012,13-Jan,Bangalore,10 days,100.0,5.0,yes,no,no,yes,yes,yes,no,yes,15.0,,140.0,yes
1,P011,13-Feb,New Delhi,4 days,104.0,1.0,no,yes,yes,no,no,no,no,no,9.0,22.0,80.0,no
2,P010,9-Jan,Jamica,5 days,104.0,5.0,,no,no,yes,yes,yes,yes,no,,,120.0,yes
3,P009,12-Aug,Barbados,12 months,101.0,,yes,yes,no,no,no,no,no,no,,,,no
4,P008,12-Oct,St.Martin,3 days,101.0,,yes,no,no,no,no,no,yes,no,,,,no


### Changing feature names for simplicity

In [5]:
def _strip_prefix(column_name):
    """Return the segment after the first dot of ``column_name``.

    Names that contain no dot are returned unchanged, so this cell can be
    re-run after the columns were already renamed without raising IndexError.
    """
    parts = column_name.split('.')
    return parts[1] if len(parts) > 1 else column_name


# Raw columns look like "dengue.<name>"; keep only the "<name>" part.
clean_feature_names = [_strip_prefix(feature) for feature in df.columns]
clean_feature_names

['p_i_d',
 'date_of_fever',
 'residence',
 'days',
 'current_temp',
 'wbc',
 'servere_headche',
 'pain_behind_the_eyes',
 'joint_muscle_aches',
 'metallic_taste_in_the_mouth',
 'appetite_loss',
 'addominal_pain',
 'nausea_vomiting',
 'diarrhoea',
 'hemoglobin',
 '_hematocri',
 'platelet',
 'dengue']

In [7]:
# Replace the prefixed column labels with the cleaned names computed earlier
# (clean_feature_names holds one entry per column, in column order).
df.columns = clean_feature_names
# Preview the first rows to confirm the new headers.
df.head()

Unnamed: 0,p_i_d,date_of_fever,residence,days,current_temp,wbc,servere_headche,pain_behind_the_eyes,joint_muscle_aches,metallic_taste_in_the_mouth,appetite_loss,addominal_pain,nausea_vomiting,diarrhoea,hemoglobin,_hematocri,platelet,dengue
0,P012,13-Jan,Bangalore,10 days,100.0,5.0,yes,no,no,yes,yes,yes,no,yes,15.0,,140.0,yes
1,P011,13-Feb,New Delhi,4 days,104.0,1.0,no,yes,yes,no,no,no,no,no,9.0,22.0,80.0,no
2,P010,9-Jan,Jamica,5 days,104.0,5.0,,no,no,yes,yes,yes,yes,no,,,120.0,yes
3,P009,12-Aug,Barbados,12 months,101.0,,yes,yes,no,no,no,no,no,no,,,,no
4,P008,12-Oct,St.Martin,3 days,101.0,,yes,no,no,no,no,no,yes,no,,,,no


### Handling missing values

In [23]:
### for num features
# Collect every non-object (numeric) column that has at least one missing
# value. The threshold is "> 0": with "> 1" a column containing exactly one
# NaN would be skipped and its NaN would survive into the saved file.
num_nan_features = [
    feature
    for feature in df.columns
    if df[feature].dtype != 'O' and df[feature].isnull().sum() > 0
]
num_nan_features

[]

In [11]:
# Impute missing values in each numeric column with that column's median.
for feature in num_nan_features:
    median = df[feature].median()

    # Assign the result back instead of calling fillna(inplace=True) on
    # df[feature]: the in-place call acts on an intermediate object, which
    # raises a FutureWarning in pandas 2.x and leaves df untouched when
    # Copy-on-Write is enabled.
    df[feature] = df[feature].fillna(median)

In [14]:
# Sanity check: remaining missing-value count for each imputed numeric column.
df.loc[:, num_nan_features].isna().sum()

wbc           0
hemoglobin    0
_hematocri    0
platelet      0
dtype: int64

### Drop unnecessary features

In [18]:
# Columns this notebook does not carry forward into the cooked dataset.
unnessary_features = [
    'days',
    'residence',
    'p_i_d',
    'date_of_fever',
]
# Remove them from df in place.
df.drop(labels=unnessary_features, axis='columns', inplace=True)

In [27]:
### Now for categorical features
# Collect every object-dtype column that has at least one missing value.
# The threshold is "> 0": with "> 1" a column containing exactly one NaN
# would be skipped and the NaN would later reach the label encoder.
cate_nan_features = [
    feature
    for feature in df.columns
    if df[feature].dtype == 'O' and df[feature].isnull().sum() > 0
]
cate_nan_features

['servere_headche', 'joint_muscle_aches', 'dengue']

In [34]:
# Impute missing values in each categorical column with its most frequent value.
# NOTE(review): this list can include the target column 'dengue'; confirm that
# imputing the label (rather than dropping those rows) is intended.
for feature in cate_nan_features:
    mode_values = df[feature].mode()

    # An all-NaN column has no mode; leave it untouched rather than fail.
    if mode_values.empty:
        continue

    # Series.mode() returns a Series holding every tied mode. Take the first
    # (modes come back sorted) as a scalar: passing the whole Series to
    # np.where breaks with a broadcast error as soon as two values tie.
    df[feature] = df[feature].fillna(mode_values.iloc[0])

# Sanity check: remaining missing-value count for each imputed column.
df[cate_nan_features].isnull().sum()

servere_headche       0
joint_muscle_aches    0
dengue                0
dtype: int64

### Let's encode categorical features

In [38]:
# LabelEncoder is used further down to turn the yes/no style string columns
# into integer codes.
from sklearn.preprocessing import LabelEncoder
# One shared instance; the encoding loop calls fit_transform on it once per
# categorical column.
encoder = LabelEncoder()

In [37]:
# Names of all columns whose dtype is object (the string-valued features).
cate_features = [
    column_name
    for column_name, column_dtype in df.dtypes.items()
    if column_dtype == 'O'
]
cate_features

['servere_headche',
 'pain_behind_the_eyes',
 'joint_muscle_aches',
 'metallic_taste_in_the_mouth',
 'appetite_loss',
 'addominal_pain',
 'nausea_vomiting',
 'diarrhoea',
 'dengue']

In [39]:
# Replace every categorical column with the integer codes produced by the
# shared encoder (it is fitted anew on each column).
for feature in cate_features:
    encoded_column = encoder.fit_transform(df[feature])
    df[feature] = encoded_column

# Preview the encoded columns.
df[cate_features].head()

Unnamed: 0,servere_headche,pain_behind_the_eyes,joint_muscle_aches,metallic_taste_in_the_mouth,appetite_loss,addominal_pain,nausea_vomiting,diarrhoea,dengue
0,1,0,0,1,1,1,0,1,1
1,0,1,1,0,0,0,0,0,0
2,1,0,0,1,1,1,1,0,1
3,1,1,0,0,0,0,0,0,0
4,1,0,0,0,0,0,1,0,0


### Let's save the cooked data to a file

In [40]:
# Destination for the processed dataset, relative to the notebook's directory.
cooked_dengue_csv = '../data/cooked_data/dengue(cooked).csv'

# Write the DataFrame without the index column.
df.to_csv(cooked_dengue_csv, index=False)

All of the data is now cooked; the next step is to apply feature selection to it.