In [1]:
import pandas as pd
from pycaret.classification import *
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html

load dataset

In [11]:
df = pd.read_csv('malaria_outbreak.csv')

print the first five rows

In [12]:
df.head()

Unnamed: 0,Year,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
0,2006,0.0,20.0,28.36,67.74,66.39,135,217,no
1,2006,0.0,21.11,30.01,68.61,69.14,192,298,no
2,2006,1317.7,24.38,31.51,71.55,74.06,98,413,no
3,2006,2441.1,26.38,31.66,72.4,76.77,71,603,yes
4,2006,3743.1,27.18,32.67,74.35,77.03,108,636,yes


In [13]:
df.tail()

Unnamed: 0,Year,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
67,2011,7871.2,26.18,30.88,82.03,79.45,533,1371,yes
68,2011,2007.0,26.47,31.69,75.87,78.03,525,968,yes
69,2011,2315.6,29.73,33.17,65.26,67.42,590,537,yes
70,2011,79.2,23.08,31.68,63.26,65.42,635,425,yes
71,2011,218.5,22.14,29.33,68.0,66.61,612,328,no


In [14]:
df.shape

(72, 9)

check for data types in the dataset

In [15]:
df.dtypes

Year                              int64
Rainfall                        float64
Min-Temperature                 float64
Max-Temperature                 float64
Relative Humidity 1(0800hrs)    float64
Relative Humidity 2(1400hrs)    float64
MVP                               int64
Malarial Cases                    int64
Malaria Outbreak                 object
dtype: object

get the columns

In [16]:
df.columns

Index(['Year', 'Rainfall', 'Min-Temperature', 'Max-Temperature',
       'Relative Humidity 1(0800hrs)', 'Relative Humidity 2(1400hrs)', 'MVP',
       'Malarial Cases', 'Malaria Outbreak'],
      dtype='object')

check for missing values

In [17]:
df.isnull().sum()

Year                            0
Rainfall                        0
Min-Temperature                 0
Max-Temperature                 0
Relative Humidity 1(0800hrs)    0
Relative Humidity 2(1400hrs)    0
MVP                             0
Malarial Cases                  0
Malaria Outbreak                0
dtype: int64

drop the year column

In [18]:
df = df.drop('Year', axis =1)
df.head()

Unnamed: 0,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
0,0.0,20.0,28.36,67.74,66.39,135,217,no
1,0.0,21.11,30.01,68.61,69.14,192,298,no
2,1317.7,24.38,31.51,71.55,74.06,98,413,no
3,2441.1,26.38,31.66,72.4,76.77,71,603,yes
4,3743.1,27.18,32.67,74.35,77.03,108,636,yes


check for category parameter in the category variable

In [19]:
df['Malaria Outbreak'].unique()

array(['no', 'yes'], dtype=object)

exploratory analysis

In [20]:
df.describe()

Unnamed: 0,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,4161.486111,24.712222,31.055833,73.745833,74.468194,177.527778,604.666667
std,4163.674937,2.453733,1.996525,5.019072,5.139252,160.760865,593.732002
min,0.0,19.23,27.84,62.58,63.16,30.0,146.0
25%,410.175,23.03,29.9875,69.9525,70.45,81.0,303.25
50%,3149.75,25.555,31.085,73.21,75.095,123.0,466.0
75%,7068.05,26.385,31.7225,77.8325,78.1175,178.5,637.75
max,15850.6,29.73,42.45,84.32,84.1,710.0,4203.0


data visualization

In [21]:
fig = px.scatter(df, x="MVP", y="Malarial Cases", color="Rainfall", trendline='ols',trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

In [22]:
fig = px.scatter(df, x="Max-Temperature", y="Malarial Cases", color="MVP", trendline='ols', trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

In [23]:
fig = px.scatter(df, x="Min-Temperature", y="Malarial Cases", color="MVP", trendline='ols', trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

In [24]:
fig = px.scatter(df, x="Relative Humidity 1(0800hrs)", y="Malarial Cases", color="Rainfall", trendline='ols', trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

In [25]:
fig = px.scatter(df, x="Relative Humidity 2(1400hrs)", y="Malarial Cases", color="Rainfall", trendline='ols', trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

In [26]:
fig = px.scatter(df, x="Relative Humidity 2(1400hrs)", y="Rainfall", color="Malarial Cases", trendline='ols', trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

In [27]:
fig = px.scatter(df, x="Rainfall", y="Malarial Cases", color="Max-Temperature", trendline='ols', trendline_color_override = 'red', title = 'Malaria Outbreak Prediction')
fig.show()

Feature engineering

In [28]:
X = df.drop('Malaria Outbreak', axis =1)

In [29]:
y = df['Malaria Outbreak'].apply(lambda x: 1 if x== 'yes' else 0)

In [30]:
y.head()

0    0
1    0
2    0
3    1
4    1
Name: Malaria Outbreak, dtype: int64

split the data into train and test data

In [31]:
from sklearn.model_selection import train_test_split
from pycaret.utils import enable_colab
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

label encoding of categorical variable

In [32]:
df['Malaria Outbreak'] = LabelEncoder().fit_transform(df['Malaria Outbreak'])
df.head()

Unnamed: 0,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
0,0.0,20.0,28.36,67.74,66.39,135,217,0
1,0.0,21.11,30.01,68.61,69.14,192,298,0
2,1317.7,24.38,31.51,71.55,74.06,98,413,0
3,2441.1,26.38,31.66,72.4,76.77,71,603,1
4,3743.1,27.18,32.67,74.35,77.03,108,636,1


In [33]:
#Train, test split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [34]:
#train model on LogisticRegression

lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [35]:
#check accuracy of trained model on LogisticRegression

accuracy = lr.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


In [36]:
#train model on RandomForestClassifier

rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier()

In [37]:
#check accuracy of trained model on RandomForestClassifier

accuracy = rf.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.60


In [38]:
#train model on KNeighborsClassifier

knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [39]:
#check accuracy of trained model on KNeighborsClassifier

accuracy = knn.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.60


In [40]:
#train model on DecisionTreeClassifier

dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)


DecisionTreeClassifier()

In [41]:
#check accuracy of trained model on DecisionTreeClassifier

accuracy = dt.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.60


In [42]:
#train model on MLPClassifier

nn = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000)
nn.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000)

In [43]:
#check accuracy of trained model on MLPClassifier

accuracy = nn.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.47


In [60]:
input_data = (0,20.00	,28.36,	67.74,	66.39	,135,	217	)

input_data_as_numpy_array = np.asarray(input_data)

input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)


prediction = dt.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==1):
    print('HIGH ALERT!!!  \nMalaria Outbreak is anticipated in  + selected_county +  County.  \n\lCONTROL MEASURES:  \nUse vector control; Indoor Residual Spraying (IRS), Insecticide-treated bed nets (ITNs), case management, vaccine administration and public sensitization.  \n\nMALARIA ENDEMIC COUNTIES:  \nmalaria_endemic_counties')
#elif (prediction[0]==0 and (Malarial Cases == 0 or MVP == 0)): 
#    print('NO THREAT... \nMalaria outbreak is currently NOT anticipated in  + selected_county + County. \n\nMALARIA ENDEMIC COUNTIES:\nmalaria_endemic_counties')
else:
    print('MILD ALERT!!!  \nMild Malaria Outbreak is anticipated in  + selected_county +  County.  \nHealth officials in the county should be on alert and engage in public education and sensitization:  \n\nCONTROL MEASURES: \nUse antimalaria prophylaxis, Indoor Residual Spraying (IRS), Insecticide-treated bed nets (ITNs).  \n\nMALARIA ENDEMIC COUNTIES:\nmalaria_endemic_counties')
    

[0]
MILD ALERT!!!  
Mild Malaria Outbreak is anticipated in  + selected_county +  County.  
Health officials in the county should be on alert and engage in public education and sensitization:  

CONTROL MEASURES: 
Use antimalaria prophylaxis, Indoor Residual Spraying (IRS), Insecticide-treated bed nets (ITNs).  

MALARIA ENDEMIC COUNTIES:
malaria_endemic_counties


retrain the model on the whole dataset

In [61]:
dt = DecisionTreeClassifier()
dt.fit(X, y)

DecisionTreeClassifier()

In [62]:
accuracy = dt.score(X, y)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [63]:
malaria_pred = dt

save the model

In [64]:
import pickle

In [65]:
filename = 'malaria_model.sav'
pickle.dump(malaria_pred,open(filename, 'wb'))

load the model

In [66]:
loaded_model = pickle.load(open('malaria_model.sav', 'rb'))

In [67]:
input_data = (0,20.00	,28.36,	67.74,	66.39	,135,	217	)

input_data_as_numpy_array = np.asarray(input_data)

input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)


prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==1):
    print('HIGH ALERT!!!  \nMalaria Outbreak is anticipated in  + selected_county +  County.  \n\lCONTROL MEASURES:  \nUse vector control; Indoor Residual Spraying (IRS), Insecticide-treated bed nets (ITNs), case management, vaccine administration and public sensitization.  \n\nMALARIA ENDEMIC COUNTIES:  \nmalaria_endemic_counties')
#elif (prediction[0]==0 and (Malarial Cases == 0 or MVP == 0)): 
#    print('NO THREAT... \nMalaria outbreak is currently NOT anticipated in  + selected_county + County. \n\nMALARIA ENDEMIC COUNTIES:\nmalaria_endemic_counties')
else:
    print('MILD ALERT!!!  \nMild Malaria Outbreak is anticipated in  + selected_county +  County.  \nHealth officials in the county should be on alert and engage in public education and sensitization:  \n\nCONTROL MEASURES: \nUse antimalaria prophylaxis, Indoor Residual Spraying (IRS), Insecticide-treated bed nets (ITNs).  \n\nMALARIA ENDEMIC COUNTIES:\nmalaria_endemic_counties')
    


# if (prediction[0]==1):
#     print('outbreak; control measures: vector control, case management and vaccines')
# else:
#     print('medium threat; control measures: antimalaria, IRS, ITN')

[0]
MILD ALERT!!!  
Mild Malaria Outbreak is anticipated in  + selected_county +  County.  
Health officials in the county should be on alert and engage in public education and sensitization:  

CONTROL MEASURES: 
Use antimalaria prophylaxis, Indoor Residual Spraying (IRS), Insecticide-treated bed nets (ITNs).  

MALARIA ENDEMIC COUNTIES:
malaria_endemic_counties
