In [216]:
import numpy as np
import pandas as pd

In [217]:
# Load the dengue dataset from the CSV file under ../data (path is relative to
# the notebook's working directory) and preview the first five rows.
data = pd.read_csv('../data/dengue.csv')
data.head()

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome
0,Female,45,0,0,0,Mirpur,Undeveloped,Building,Dhaka,0
1,Male,17,0,0,1,Chawkbazar,Developed,Building,Dhaka,0
2,Female,29,0,0,0,Paltan,Undeveloped,Other,Dhaka,0
3,Female,63,1,1,0,Motijheel,Developed,Other,Dhaka,1
4,Male,22,0,0,0,Gendaria,Undeveloped,Building,Dhaka,0


In [218]:
# Show the dtype pandas inferred for every column.
data.dtypes

Gender       object
Age           int64
NS1           int64
IgG           int64
IgM           int64
Area         object
AreaType     object
HouseType    object
District     object
Outcome       int64
dtype: object

In [219]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
data.describe()

Unnamed: 0,Age,NS1,IgG,IgM,Outcome
count,1000.0,1000.0,1000.0,1000.0,1000.0
mean,35.924,0.519,0.533,0.475,0.533
std,16.468437,0.499889,0.499159,0.499624,0.499159
min,8.0,0.0,0.0,0.0,0.0
25%,22.0,0.0,0.0,0.0,0.0
50%,37.0,1.0,1.0,0.0,1.0
75%,50.0,1.0,1.0,1.0,1.0
max,65.0,1.0,1.0,1.0,1.0


In [220]:
# List the distinct values that occur in the Area column.
data['Area'].unique()

array(['Mirpur', 'Chawkbazar', 'Paltan', 'Motijheel', 'Gendaria',
       'Dhanmondi', 'New Market', 'Sher-e-Bangla Nagar', 'Kafrul',
       'Pallabi', 'Mohammadpur', 'Shahbagh', 'Shyampur', 'Kalabagan',
       'Bosila', 'Jatrabari', 'Adabor', 'Kamrangirchar', 'Biman Bandar',
       'Ramna', 'Badda', 'Bangshal', 'Sabujbagh', 'Hazaribagh',
       'Sutrapur', 'Lalbagh', 'Demra', 'Banasree', 'Cantonment',
       'Keraniganj', 'Tejgaon', 'Khilkhet', 'Kadamtali', 'Gulshan',
       'Rampura', 'Khilgaon'], dtype=object)

In [221]:
# List the distinct values that occur in the AreaType column.
data['AreaType'].unique()

array(['Undeveloped', 'Developed'], dtype=object)

In [222]:
# List the distinct values that occur in the HouseType column.
data['HouseType'].unique()

array(['Building', 'Other', 'Tinshed'], dtype=object)

In [223]:
# Partition the column labels by dtype: numeric columns vs. object (string) columns.
num_cols = data.select_dtypes(include=['number']).columns
cat_cols = data.select_dtypes(include=['object']).columns

# Print both indexes, one per line.
print(num_cols, cat_cols, sep='\n')

Index(['Age', 'NS1', 'IgG', 'IgM', 'Outcome'], dtype='object')
Index(['Gender', 'Area', 'AreaType', 'HouseType', 'District'], dtype='object')


In [224]:
# Bucket Age into four labelled bands. pd.cut closes each interval on the right
# by default, so the edges 18 / 30 / 45 belong to the lower band
# (e.g. age 45 is 'middle-aged', age 17 is 'child').
data['AgeCat'] = pd.cut(
    x=data['Age'],
    bins=[-np.inf, 18, 30, 45, np.inf],
    labels=['child', 'young', 'middle-aged', 'aged'],
)
data.head()

Unnamed: 0,Gender,Age,NS1,IgG,IgM,Area,AreaType,HouseType,District,Outcome,AgeCat
0,Female,45,0,0,0,Mirpur,Undeveloped,Building,Dhaka,0,middle-aged
1,Male,17,0,0,1,Chawkbazar,Developed,Building,Dhaka,0,child
2,Female,29,0,0,0,Paltan,Undeveloped,Other,Dhaka,0,young
3,Female,63,1,1,0,Motijheel,Developed,Other,Dhaka,1,aged
4,Male,22,0,0,0,Gendaria,Undeveloped,Building,Dhaka,0,young


In [225]:
# Count missing values per column (isna is the same check as isnull).
print(data.isna().sum())


Gender       0
Age          0
NS1          0
IgG          0
IgM          0
Area         0
AreaType     0
HouseType    0
District     0
Outcome      0
AgeCat       0
dtype: int64


In [226]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as the test split, with a fixed seed for
# reproducibility. The split is stratified on the age band so that both parts
# keep the same AgeCat proportions.
# NOTE(review): stratification is on AgeCat, not on the Outcome label —
# confirm that this is the intended balancing variable.
train, test = train_test_split(
    data,
    test_size=0.2,
    random_state=7,
    stratify=data['AgeCat'],
)


In [227]:
# Compare the age-band distribution (in percent) of the train and test splits
# to check that stratification kept them aligned.
print(train['AgeCat'].value_counts(normalize=True).mul(100))
print(test['AgeCat'].value_counts(normalize=True).mul(100))

AgeCat
aged           32.750
middle-aged    27.250
child          20.125
young          19.875
Name: proportion, dtype: float64
AgeCat
aged           32.5
middle-aged    27.5
young          20.0
child          20.0
Name: proportion, dtype: float64


In [228]:
# Carve a validation split (20% of the training portion) out of `train`,
# again stratified on the age band and using the same fixed seed.
train_set, val_set = train_test_split(
    train,
    test_size=0.2,
    random_state=7,
    stratify=train['AgeCat'],
)

# AgeCat is no longer needed on these two frames, so remove it. drop() is used
# without inplace=True: the frames returned by train_test_split are selections
# of `train`, and mutating them in place can raise chained-assignment warnings
# in some pandas versions. `columns=` already names the axis, so the redundant
# `axis=1` is omitted.
train_set = train_set.drop(columns=['AgeCat'])
val_set = val_set.drop(columns=['AgeCat'])

In [229]:
# Separate the feature matrix from the Outcome label for each split.
X_train, y_train = train_set.drop(columns='Outcome'), train_set['Outcome']
X_val, y_val = val_set.drop(columns='Outcome'), val_set['Outcome']

In [230]:
# Recompute the numeric / object column lists from the feature matrix, so the
# Outcome label (removed from X_train) is no longer part of num_cols.
num_cols = X_train.select_dtypes(include=['number']).columns
cat_cols = X_train.select_dtypes(include=['object']).columns

In [231]:
from sklearn.impute import SimpleImputer

# Each imputer is fitted on the training split only and then re-used on the
# validation split, so no validation statistics leak into the fill values.

# Numeric columns: fill gaps with the training-set column mean.
num_imputer = SimpleImputer(strategy='mean')
X_train[num_cols] = num_imputer.fit_transform(X_train[num_cols])
X_val[num_cols] = num_imputer.transform(X_val[num_cols])

# Categorical columns: fill gaps with the most frequent training-set value.
cat_imputer = SimpleImputer(strategy='most_frequent')
X_train[cat_cols] = cat_imputer.fit_transform(X_train[cat_cols])
X_val[cat_cols] = cat_imputer.transform(X_val[cat_cols])


In [235]:
from sklearn.preprocessing import StandardScaler, OrdinalEncoder

# Standardise numeric features and integer-encode categorical ones. Both
# transformers are fitted on the training split only and then applied to the
# validation split.
scaler = StandardScaler()

# By default OrdinalEncoder raises ValueError at transform time when it meets a
# category it did not see during fit. A high-cardinality column such as Area
# can easily have a level that is absent from the training split, so unseen
# categories are mapped to -1 instead. Categories seen during fit are encoded
# exactly as before.
# NOTE(review): ordinal codes impose an arbitrary order on nominal columns;
# consider one-hot encoding for a linear model.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)

X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
X_val[num_cols] = scaler.transform(X_val[num_cols])

X_train[cat_cols] = encoder.fit_transform(X_train[cat_cols])
X_val[cat_cols] = encoder.transform(X_val[cat_cols])

In [237]:
from sklearn.linear_model import LogisticRegression

# Baseline classifier: logistic regression with scikit-learn's default settings,
# fitted on the preprocessed training features.
log_reg = LogisticRegression()
log_reg.fit(X=X_train, y=y_train)

In [242]:
# Predict class labels for the validation features.
pred = log_reg.predict(X_val)

In [245]:
# Accuracy of the fitted model on the validation split.
# NOTE(review): the recorded result is a perfect 1.0. The features include the
# NS1 / IgG / IgM test results, and describe() reports identical mean and std
# for IgG and Outcome — check whether the label is directly derivable from a
# feature (target leakage) before trusting this score.
log_reg.score(X_val, y_val)

1.0

In [247]:
from sklearn.metrics import confusion_matrix

# Confusion matrix on the validation split; with scikit-learn's convention rows
# are the true classes and columns are the predicted classes.
confusion_matrix(y_val, pred)

array([[78,  0],
       [ 0, 82]])

In [248]:
# NOTE(review): looks like a leftover scratch/sign-off print that is not part of
# the analysis — consider removing this cell.
print('Razon Boss')

Razon Boss
