In [1]:
import pandas as pd
from pycaret.classification import *
import numpy as np
import matplotlib.pyplot as plt
import plotly.express as px
import dash
import dash_core_components as dcc
import dash_html_components as html

loading dataset

In [47]:
df = pd.read_csv('malaria_outbreak.csv')

print the first five rows

In [48]:
df.head()

Unnamed: 0,Year,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
0,2006,0.0,20.0,28.36,67.74,66.39,135,217,no
1,2006,0.0,21.11,30.01,68.61,69.14,192,298,no
2,2006,1317.7,24.38,31.51,71.55,74.06,98,413,no
3,2006,2441.1,26.38,31.66,72.4,76.77,71,603,yes
4,2006,3743.1,27.18,32.67,74.35,77.03,108,636,yes


In [49]:
df.tail()

Unnamed: 0,Year,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
67,2011,7871.2,26.18,30.88,82.03,79.45,533,1371,yes
68,2011,2007.0,26.47,31.69,75.87,78.03,525,968,yes
69,2011,2315.6,29.73,33.17,65.26,67.42,590,537,yes
70,2011,79.2,23.08,31.68,63.26,65.42,635,425,yes
71,2011,218.5,22.14,29.33,68.0,66.61,612,328,no


In [50]:
df.shape

(72, 9)

cheching for the data types in the dataset

In [6]:
df.dtypes

Year                              int64
Rainfall                        float64
Min-Temperature                 float64
Max-temperature                 float64
Relative humidity 1(0800hrs)    float64
Relative humidity 2(1400hrs)    float64
Mosquito population               int64
Cases                             int64
Outbreak                         object
dtype: object

getting the columns

In [7]:
df.columns

Index(['Year', 'Rainfall', 'Min-Temperature', 'Max-temperature',
       'Relative humidity 1(0800hrs)', 'Relative humidity 2(1400hrs)',
       'Mosquito population', 'Cases', 'Outbreak'],
      dtype='object')

check for missing values

In [8]:
df.isnull().sum()

Year                            0
Rainfall                        0
Min-Temperature                 0
Max-temperature                 0
Relative humidity 1(0800hrs)    0
Relative humidity 2(1400hrs)    0
Mosquito population             0
Cases                           0
Outbreak                        0
dtype: int64

drop the year column

In [51]:
df = df.drop('Year', axis =1)

first five rows without the year column

In [52]:
df.head()

Unnamed: 0,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
0,0.0,20.0,28.36,67.74,66.39,135,217,no
1,0.0,21.11,30.01,68.61,69.14,192,298,no
2,1317.7,24.38,31.51,71.55,74.06,98,413,no
3,2441.1,26.38,31.66,72.4,76.77,71,603,yes
4,3743.1,27.18,32.67,74.35,77.03,108,636,yes


checking for the category parameter in the category variable

In [11]:
df['Outbreak'].unique()

array(['no', 'yes'], dtype=object)

exploratory analysis

In [12]:
df.describe()

Unnamed: 0,Rainfall,Min-Temperature,Max-temperature,Relative humidity 1(0800hrs),Relative humidity 2(1400hrs),Mosquito population,Cases
count,72.0,72.0,72.0,72.0,72.0,72.0,72.0
mean,4161.486111,24.712222,31.055833,73.745833,74.468194,177.527778,604.666667
std,4163.674937,2.453733,1.996525,5.019072,5.139252,160.760865,593.732002
min,0.0,19.23,27.84,62.58,63.16,30.0,146.0
25%,410.175,23.03,29.9875,69.9525,70.45,81.0,303.25
50%,3149.75,25.555,31.085,73.21,75.095,123.0,466.0
75%,7068.05,26.385,31.7225,77.8325,78.1175,178.5,637.75
max,15850.6,29.73,42.45,84.32,84.1,710.0,4203.0


data visualization

In [53]:
fig = px.scatter(df, x="MVP", y="Malarial Cases", color="Rainfall")#''''''

In [54]:
fig.show()

In [15]:
fig = px.scatter(df, x="Cases", y="Mosquito population", color="Rainfall")

In [17]:
fig.show()

In [16]:
fig = px.scatter(df, x="Mosquito population", y="Cases", color="Max-temperature")

In [18]:
fig.show()

In [19]:
fig = px.scatter(df, x="Mosquito population", y="Cases", color="Max-temperature")

In [20]:
fig.show()

In [21]:
fig = px.scatter(df, x="Relative humidity 2(1400hrs)", y="Rainfall", color="Cases")

In [22]:
fig.show()

In [23]:
fig = px.scatter(df, x="Relative humidity 1(0800hrs)", y="Rainfall", color="Cases")

In [24]:
fig.show()

In [27]:
fig = px.scatter(df, x="Mosquito population", y="Cases", color="Rainfall")

In [28]:
fig.show()

Feature engineering

In [55]:
X = df.drop('Malaria Outbreak', axis =1)

In [56]:
y = df['Malaria Outbreak'].apply(lambda x: 1 if x== 'yes' else 0)

In [57]:
y.head()

0    0
1    0
2    0
3    1
4    1
Name: Malaria Outbreak, dtype: int64

splitting the data into training and test data

In [58]:
from sklearn.model_selection import train_test_split
from pycaret.utils import enable_colab
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder

label encoding of categorical variable

In [59]:
df['Malaria Outbreak'] = LabelEncoder().fit_transform(df['Malaria Outbreak'])
df.head()

Unnamed: 0,Rainfall,Min-Temperature,Max-Temperature,Relative Humidity 1(0800hrs),Relative Humidity 2(1400hrs),MVP,Malarial Cases,Malaria Outbreak
0,0.0,20.0,28.36,67.74,66.39,135,217,0
1,0.0,21.11,30.01,68.61,69.14,192,298,0
2,1317.7,24.38,31.51,71.55,74.06,98,413,0
3,2441.1,26.38,31.66,72.4,76.77,71,603,1
4,3743.1,27.18,32.67,74.35,77.03,108,636,1


In [60]:
X_train, X_test, y_train, y_test = train_test_split(X,y, test_size = 0.2)

In [61]:
lr = LogisticRegression()
lr.fit(X_train,y_train)

LogisticRegression()

In [62]:
accuracy = lr.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


In [63]:
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, y_train)

RandomForestClassifier()

In [64]:
accuracy = rf.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.73


In [65]:
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train, y_train)

KNeighborsClassifier()

In [66]:
accuracy = knn.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.67


In [67]:
dt = DecisionTreeClassifier()
dt.fit(X_train, y_train)


DecisionTreeClassifier()

In [68]:
accuracy = dt.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.80


In [69]:
nn = MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000)
nn.fit(X_train, y_train)

MLPClassifier(hidden_layer_sizes=(10, 10), max_iter=1000)

In [70]:
accuracy = nn.score(X_test, y_test)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 0.47


In [71]:
input_data = (0,20.00	,28.36,	67.74,	66.39	,135,	217	)

input_data_as_numpy_array = np.asarray(input_data)

input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)


prediction = dt.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==1):
    print('outbreak; control measures: vector control, case management and vaccines')
else:
    print('medium threat; control measures: antimalaria, IRS, ITN')

[0]
medium threat; control measures: antimalaria, IRS, ITN


retrain the model on the whole dataset

In [72]:
dt = DecisionTreeClassifier()
dt.fit(X, y)

DecisionTreeClassifier()

In [73]:
accuracy = dt.score(X, y)
print(f"Accuracy: {accuracy:.2f}")

Accuracy: 1.00


In [74]:
malaria_pred = dt

saving the model

In [75]:
import pickle

In [76]:
filename = 'malaria_model.sav'
pickle.dump(malaria_pred,open(filename, 'wb'))

load the model

In [77]:
loaded_model = pickle.load(open('malaria_model.sav', 'rb'))

In [78]:
input_data = (0,20.00	,28.36,	67.74,	66.39	,135,	217	)

input_data_as_numpy_array = np.asarray(input_data)

input_data_reshaped = input_data_as_numpy_array.reshape(1,-1)


prediction = loaded_model.predict(input_data_reshaped)
print(prediction)

if (prediction[0]==1):
    print('outbreak; control measures: vector control, case management and vaccines')
else:
    print('medium threat; control measures: antimalaria, IRS, ITN')

[0]
medium threat; control measures: antimalaria, IRS, ITN


In [8]:
try:
    df_County_Geodata = pd.read_csv('CountyGeodata.csv', encoding ='utf-8')

except:
    df_County_Geodata = pd.read_csv('CountyGeodata.csv', encoding ='ISO-8859-1')

In [19]:
df_County_Geodata.head()

Unnamed: 0,County_Code,County_Name,County_HQ,Latitude,Longitude,Altitude_Mtrs,Altitude_Ft
0,1,Mombasa,Mombasa,-4.04347,39.6682,4,13
1,2,Kwale,Kwale,-4.18161,39.46056,225,738
2,3,Kilifi,Kilifi,-3.51065,39.90932,100,328
3,4,Tana River,Hola,-1.49922,40.03432,64,210
4,5,Lamu,Lamu,-2.26955,40.90064,13,43


In [18]:
import pandas as pd
import plotly.express as px
import numpy as np

In [21]:
# mapbox_access_token = 'pk.eyJ1IjoiMDEyMzQ1Njc4OTA5ODc2NTQzMjEwIiwiYSI6ImNqdmdlZnZyMzAzcTQ0OHBjOGN0ZTl1ZW4ifQ.AHB-GJ3EtYeUrHIvtGBDkg'

px.set_mapbox_access_token('pk.eyJ1Ijoic3BhcmtsZW1pbCIsImEiOiJjbGR1OTIwN3MwM2l3M3BuOGNhc2p4bXk4In0.5IwWN-PxTqzZgcuA6OgQvw')
fig = px.scatter_mapbox(df_County_Geodata,
                        lat='Latitude',
                        lon='Longitude',
                        color='County_Name',
                        hover_name="County_HQ",
                        zoom=3,
                        mapbox_style='basic')
# fig.show()

ValueError: Value of 'lat' is not the name of a column in 'data_frame'. Expected one of ['County_Code', 'County_Name', 'County_HQ', ' Latitude', 'Longitude', 'Altitude_Mtrs', 'Altitude_Ft'] but received: Latitude

In [None]:
fig.show()