# importing libraries

In [None]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
import seaborn as sns
import warnings
from sklearn.preprocessing import LabelEncoder
import tkinter
from tkinter import *
from PIL import ImageTk, Image
import string

# Importing Data

In [None]:
data = pd.read_excel('OTF.xlsx') 
data.head()

In [None]:
data.columns

In [None]:
data.head()

# Data preprocessing

# Data cleaning 

In [None]:
# Check missing

data.isnull().sum()

In [None]:
# Check missing graphically

sns.heatmap(data.isnull())

# Data transformation

# Modify and rename attribute name

In [None]:
data.columns

In [None]:
# Map the raw spreadsheet headers (several contain stray spaces) to short names.
column_names = {
    "gilr_referred_to_health_post_or_ msi_outreach_team_to_take_up_a_method": "Referred",
    "location_of_smart_ start_follow_up": "Premise",
    "pregnancy_ test": "P_test",
    "region ": "Region",
    "discussion_includes_husband ": "Hus",
    "referred_ by": "Referred_by",
    "girl_age": "Age",
    "method_received": "Method",
    "number_of_children": "Children",
    "referred_by": "Referred_by",
    "session_type": "S_type",
    "used_ec_or_condoms_last_time_you_had_sex": "CEC",
    "are_you_pregnant": "Pregnant",
}
data.rename(columns=column_names, inplace=True)
data.columns

# Modify and rename attribute value

In [None]:
# Assign the result back: an in-place replace on a selected column is chained
# assignment and is not guaranteed to update `data` on recent pandas versions.
data['Pregnant'] = data['Pregnant'].replace({'N - Not pregnant': 'NP','UN - Unknown': 'UN'})

In [None]:
# Merge the two spellings of the same category; assign back instead of using a
# chained in-place replace, which may silently leave `data` unchanged.
data['Referred_by'] = data['Referred_by'].replace({'Friend/peer': 'Friend/Peer'})

# Convert Text categories in to numeric categories (Indexing)

In [None]:
# `df` keeps the renamed, still-textual frame (used for value counts and plots);
# `data` becomes an independent frame holding the integer codes used for modelling.
df=data
cols = ['Region', 'Pregnant', 'Hus', 'Age', 'Premise', 'Children', 'P_test', 'Referred_by', 'S_type', 'CEC', 'Referred', 'Method']
# .copy() so the assignments below write to a real frame, not to a slice of df
# (avoids SettingWithCopyWarning and lost writes).
data = df[cols].copy()
label_encoder = LabelEncoder()
# Age is left untouched; every other column is replaced by its label codes.
# The encoder is refit for each column, so afterwards it only remembers 'Method'.
for col in ['Region', 'Hus', 'Premise', 'Children', 'P_test', 'Referred_by',
            'S_type', 'CEC', 'Pregnant', 'Referred', 'Method']:
    data[col] = label_encoder.fit_transform(data[col])

In [None]:
data.head()

In [None]:
df.head()

# 1. girl referred to health post or msi outreach team to take up a method

# Separate dependent and independent variables

In [None]:
referred_x = data[["Region", "Hus", "Age", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant"]]
referred_y = data[["Referred"]]

# Check Class Balance

In [None]:
df['Referred'].value_counts()

In [None]:
warnings.filterwarnings('ignore')
class_count = df['Referred'].value_counts()
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=class_count.index, y=class_count.values, alpha=0.9)
plt.title('Frequency Distribution of Class')
plt.ylabel('Number of girls', fontsize=12)
plt.xlabel('Referred', fontsize=12)
plt.show()

# Apply over-sampling with smote

In [None]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic minority samples (and every score derived from
# them) are the same on each run.
# NOTE(review): oversampling before the train/test split lets synthetic points
# derived from test rows reach the training set - consider resampling the
# training part only.
smote = SMOTE(random_state=0)
referred_smote_x,referred_smote_y= smote.fit_resample(referred_x,referred_y)

In [None]:
warnings.filterwarnings('ignore')
class_count = referred_smote_y['Referred'].value_counts()
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=class_count.index, y=class_count.values, alpha=0.9)
plt.title('Frequency Distribution of Class')
plt.ylabel('Number of girls', fontsize=12)
plt.xlabel('Referred', fontsize=12)
plt.show()

# Split training and test dataset

In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(referred_x,referred_y,test_size=0.3,random_state=0)
x_train_smote,x_test_smote,y_train_smote,y_test_smote=train_test_split(referred_smote_x,referred_smote_y,test_size=0.3,random_state=0)

In [None]:
x_train.shape

In [None]:
x_train_smote.shape

# Select Algorithm

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier,export_graphviz
dt_clf = DecisionTreeClassifier()

In [None]:
#with unbalanced data
dt_clf.fit(referred_x,referred_y)
value=dt_clf.score(x_train,y_train)
value

In [None]:
#balanced class with smote
dt_clf.fit(referred_x,referred_y)
value=dt_clf.score(x_train_smote,y_train_smote)
value

# Cross Validation

In [None]:
# evaluate Models using 10-fold cross-validation
from numpy import mean
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
cv = KFold(n_splits=10, random_state=1, shuffle=True)

In [None]:
#with unbalanced data
dt_cv_accuracy = cross_val_score(dt_clf, x_train, y_train, cv=cv).mean()
print("Unbalanced Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy)

In [None]:
#balanced class with smote
dt_cv_accuracy1 = cross_val_score(dt_clf, x_train_smote, y_train_smote, cv=cv).mean()
print("Balanced Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)

# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 
rand_forest =  RandomForestClassifier()

In [None]:
#with unbalanced data
rand_forest.fit(referred_x,referred_y)
value=rand_forest.score(x_train,y_train)
value

In [None]:
#balanced class with smote
rand_forest.fit(referred_x,referred_y)
value=rand_forest.score(x_train_smote,y_train_smote)
value

# Cross Validation

In [None]:
#with unbalanced data
rand_forest_cv_accuracy = cross_val_score(rand_forest, x_train, y_train, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % rand_forest_cv_accuracy)

In [None]:
#balanced class with smote
rand_forest_cv_accuracy = cross_val_score(rand_forest, x_train_smote, y_train_smote, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % rand_forest_cv_accuracy)

# Gradient Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbMod = GradientBoostingClassifier()

In [None]:
#with unbalanced data
gbMod.fit(referred_x,referred_y)
value=gbMod.score(x_train,y_train)
value

In [None]:
#balanced class with smote
gbMod.fit(referred_x,referred_y)
value=gbMod.score(x_train_smote,y_train_smote)
value

# Cross Validation

In [None]:
#with unbalanced data
gbMod_cv_accuracy = cross_val_score(gbMod, x_train, y_train, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % gbMod_cv_accuracy)

In [None]:
#balanced class with smote
gbMod_cv_accuracy = cross_val_score(gbMod, x_train_smote, y_train_smote, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % gbMod_cv_accuracy)

# Multi-layer neural network

In [None]:
from sklearn.neural_network import MLPClassifier
mLPClassifier=MLPClassifier()

In [None]:
#with unbalanced data
mLPClassifier.fit(referred_x,referred_y)
value=mLPClassifier.score(x_train,y_train)
value

In [None]:
#balanced class with smote
mLPClassifier.fit(referred_x,referred_y)
value=mLPClassifier.score(x_train_smote,y_train_smote)
value

# Cross Validation

In [None]:
#with unbalanced data
mLPClassifier_cv_accuracy = cross_val_score(mLPClassifier, x_train, y_train, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % mLPClassifier_cv_accuracy)

In [None]:
#balanced class with smote
mLPClassifier_cv_accuracy = cross_val_score(mLPClassifier, x_train_smote, y_train_smote, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % mLPClassifier_cv_accuracy)

# Apply Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every input against the target with a chi-squared test (k="all" keeps
# all of them; the filtering is done by the thresholds below).
feature_selector = SelectKBest(chi2, k = "all")
fit = feature_selector.fit(referred_x,referred_y)

# One row per input variable: name, p-value, chi2 score.
p_values = pd.DataFrame(fit.pvalues_)
scores = pd.DataFrame(fit.scores_)
input_variable_names = pd.DataFrame(referred_x.columns)
summary_stats = pd.concat([input_variable_names, p_values, scores], axis = 1)
summary_stats.columns = ["input_variable", "p_value", "chi2_score"]
summary_stats.sort_values(by = "p_value", inplace = True)

p_value_threshold = 0.05
score_threshold = 5

selected_variables = summary_stats.loc[(summary_stats["chi2_score"] >= score_threshold) &
                                       (summary_stats["p_value"] <= p_value_threshold)]
selected_variables1 = selected_variables["input_variable"]
# Select by column NAME; indexing with the summary table itself
# (selected_variables) is not a valid column selection.
X_new = referred_x[selected_variables1.tolist()]
selected_variables1

In [None]:
# Feature set after selection (Age removed).
referred_x = data[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant"]]
referred_y = data[["Referred"]]

# Re-balance with the reduced feature set; without this the "balanced" split
# below would still be built from the earlier resample that includes Age.
referred_smote_x,referred_smote_y= smote.fit_resample(referred_x,referred_y)

x_train,x_test,y_train,y_test=train_test_split(referred_x,referred_y,test_size=0.3,random_state=0)
x_train_smote,x_test_smote,y_train_smote,y_test_smote=train_test_split(referred_smote_x,referred_smote_y,test_size=0.3,random_state=0)

# After applying feature selection

# Decision Tree

In [None]:
#with unbalanced data
dt_cv_accuracy = cross_val_score(dt_clf, x_train, y_train, cv=cv).mean()
print("Unbalanced Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy)

#balanced class with smote
dt_cv_accuracy1 = cross_val_score(dt_clf, x_train_smote, y_train_smote, cv=cv).mean()
print("Balanced Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)


# Random Forest

In [None]:
#with unbalanced data
rand_forest_cv_accuracy = cross_val_score(rand_forest, x_train, y_train, cv=cv).mean()
# label fixed: random forest, not decision tree
print("Unbalanced Cross Validation Accuracy RF: %.4f" % rand_forest_cv_accuracy)

#balanced class with smote
rand_forest_cv_accuracy = cross_val_score(rand_forest, x_train_smote, y_train_smote, cv=cv).mean()
print("Balanced Cross Validation Accuracy RF: %.4f" % rand_forest_cv_accuracy)

# Gradient Boosting

In [None]:
#with unbalanced data
gbMod_cv_accuracy = cross_val_score(gbMod, x_train, y_train, cv=cv).mean()
# label fixed: gradient boosting, not decision tree
print("Unbalanced Cross Validation Accuracy GB: %.4f" % gbMod_cv_accuracy)

#balanced class with smote
gbMod_cv_accuracy = cross_val_score(gbMod, x_train_smote, y_train_smote, cv=cv).mean()
print("Balanced Cross Validation Accuracy GB: %.4f" % gbMod_cv_accuracy)

# Multi-layer neural network

In [None]:
#with unbalanced data
mLPClassifier_cv_accuracy = cross_val_score(mLPClassifier, x_train, y_train, cv=cv).mean()
# label fixed: multi-layer perceptron, not decision tree
print("Unbalanced Cross Validation Accuracy MLP: %.4f" % mLPClassifier_cv_accuracy)

#balanced class with smote
mLPClassifier_cv_accuracy = cross_val_score(mLPClassifier, x_train_smote, y_train_smote, cv=cv).mean()
print("Balanced Cross Validation Accuracy MLP: %.4f" % mLPClassifier_cv_accuracy)

# 2. contraceptive method received

# Separate dependent and independent variables

In [None]:
method_x = data[["Region", "Hus", "Age", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant","Referred"]]
method_y = data[["Method"]]

# Check Class Balance

In [None]:
df['Method'].value_counts()

In [None]:
warnings.filterwarnings('ignore')
class_count = df['Method'].value_counts()
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=class_count.index, y=class_count.values, alpha=0.9)
plt.title('Frequency Distribution of Class')
plt.ylabel('Number of girls', fontsize=12)
# axis label fixed: this chart shows the Method classes, not Referred
plt.xlabel('Contraceptive method', fontsize=12)
plt.show()

# Apply over-sampling with smote

In [None]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE
# Fixed seed so the synthetic samples are the same on each run.
smote = SMOTE(random_state=0)
method_smote_x,method_smote_y= smote.fit_resample(method_x,method_y)

In [None]:
warnings.filterwarnings('ignore')
class_count = method_smote_y['Method'].value_counts()
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=class_count.index, y=class_count.values, alpha=0.9)
plt.title('Frequency Distribution of Class')
plt.ylabel('Number of girls', fontsize=12)
# axis label fixed: this chart shows the Method classes, not Referred
plt.xlabel('Contraceptive method', fontsize=12)
plt.show()

# Split training and test dataset

In [None]:
from sklearn.model_selection import train_test_split

x_train,x_test,y_train,y_test=train_test_split(method_x,method_y,test_size=0.3,random_state=0)
x_train_smote,x_test_smote,y_train_smote,y_test_smote=train_test_split(method_smote_x,method_smote_y,test_size=0.3,random_state=0)

# Select Algorithm

# Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier,export_graphviz
method_dt_clf = DecisionTreeClassifier()


In [None]:
#with unbalanced data
method_dt_clf.fit(method_x,method_y)
value=method_dt_clf.score(x_train,y_train)
value

In [None]:
#balanced class with smote
method_dt_clf.fit(method_x,method_y)
value=method_dt_clf.score(x_train_smote,y_train_smote)
value

# Cross Validation

In [None]:
#with unbalanced data
dt_cv_accuracy = cross_val_score(dt_clf, x_train, y_train, cv=cv).mean()
print("Unbalanced Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy)

In [None]:
#balanced class with smote
dt_cv_accuracy1 = cross_val_score(dt_clf, x_train_smote, y_train_smote, cv=cv).mean()
print("Balanced Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)

# Random Forest

In [None]:
# different contraceptive methods
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('Solarize_Light2')

# `data` was renamed and label-encoded in the cells above, but this cell and the
# ones that follow read the original spreadsheet column names (method_received,
# girl_age, ...). Reload the untouched sheet so a top-to-bottom run works.
data = pd.read_excel('OTF.xlsx')
data_n = data['method_received'].value_counts()
data_n.head(10).plot.bar(color = 'teal')

plt.title('Contraceptive Methods Recieved',fontsize = 30, color = 'black')
plt.xticks(rotation = 90)
plt.show()

In [None]:
# distribution of the reported pregnancy status (are_you_pregnant column)
warnings.filterwarnings('ignore')
plt.rcParams['figure.figsize'] = (10, 6)
plt.style.use('Solarize_Light2')

data_n = data['are_you_pregnant'].value_counts()
data_n.head(10).plot.bar(color = 'teal')
plt.title('Pregnancy Status ',fontsize = 30, color = 'black')
plt.xticks(rotation = 90)
plt.show()

# Data Cleaning

In [None]:
data.isnull().sum()

In [None]:
data.girl_age.hist()

In [None]:
# ages to keep: 15 through 24 inclusive
values = list(range(15, 25))

# keep only the rows whose girl_age is one of the listed values
data1 = data[data["girl_age"].isin(values)]

In [None]:
data1

# Referred

In [None]:
#define values
region_values = ['Sidama']

#drop rows whose region is in the list; .copy() so the in-place edits applied
#to data2 further down act on an independent frame, not on a slice of data1
data2 = data1[~data1['region '].isin(region_values)].copy()

In [None]:
data2

# Normalization

In [None]:
t = data1['are_you_pregnant'].value_counts()
table = pd.DataFrame(data=t.values, index=t.index, columns=['Pregnant Count'])

table.rename({'index': 'Division'}, axis='columns', inplace=True)
table


In [None]:
# Assign the result back instead of a chained in-place replace, which may leave
# data2 unchanged on recent pandas versions.
# NOTE(review): the original key 'N - Not pregnan' looks truncated (the earlier
# cleaning cell uses 'N - Not pregnant'); both spellings are mapped so the
# replacement works whichever one the sheet contains - confirm against the data.
data2['are_you_pregnant'] = data2['are_you_pregnant'].replace({'N - Not pregnan': 'NP',
                                                               'N - Not pregnant': 'NP',
                                                               'UN - Unknown': 'UN',
                                                               'ghvg':'UN'})

In [None]:
data2.rename(columns = {"gilr_referred_to_health_post_or_ msi_outreach_team_to_take_up_a_method":"Referred", "location_of_smart_ start_follow_up":"Premise", "pregnancy_ test":"P_test", "region ":"Region", "discussion_includes_husband ":"Hus", "referred_ by":"Referred_by", "girl_age":"Age", "method_received":"Method", "number_of_children":"Children", "referred_by":"Referred_by", "session_type":"S_type", "used_ec_or_condoms_last_time_you_had_sex":"CEC", "are_you_pregnant":"Pregnant"}, inplace= True)
data2.columns

In [None]:
r = data1['referred_ by'].value_counts()
table = pd.DataFrame(data=r.values, index=r.index, columns=['Reffered by Count'])

table.rename({'index': 'Division'}, axis='columns', inplace=True)
table

In [None]:
# Merge the two spellings of the same category; assign back rather than relying
# on a chained in-place replace.
data2['Referred_by'] = data2['Referred_by'].replace({'Friend/peer': 'Friend/Peer'})

In [None]:
r = data2['Referred_by'].value_counts()
table = pd.DataFrame(data=r.values, index=r.index, columns=['Reffered by Count'])

table.rename({'index': 'Division'}, axis='columns', inplace=True)
table

In [None]:
cols = ['Region', 'Pregnant', 'Hus', 'Age', 'Premise', 'Children', 'P_test', 'Referred_by', 'S_type', 'CEC', 'Referred', 'Method']
# .copy() because data3's columns are overwritten with label codes later on;
# without it those writes target a slice of data2 (SettingWithCopyWarning).
data3 = data2[cols].copy()
data3.head(5)

# Data Transformation

In [None]:
label_encoder = LabelEncoder()

In [None]:
# Replace each categorical column with integer label codes, one column at a time.
# The shared encoder is refit per column, so afterwards it only remembers Method.
for column in ("Region", "Hus", "Premise", "Children", "P_test", "Referred_by",
               "S_type", "CEC", "Pregnant", "Referred", "Method"):
    data3[column] = label_encoder.fit_transform(data3[column])

In [None]:
data3.head()

# 1. girl referred to health post or msi outreach team to take up a method

In [None]:
data4 = data3.drop(columns=["Method"], axis = 1)

In [None]:
x = data3[["Region", "Hus", "Age", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant"]]
y = data3[["Referred"]]

# Class Balancing

In [None]:
warnings.filterwarnings('ignore')
class_count = data3['Referred'].value_counts()
sns.set(style="darkgrid")
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=class_count.index, y=class_count.values, alpha=0.9)
plt.title('Frequency Distribution of Class')
plt.ylabel('Number of girls', fontsize=12)
plt.xlabel('Referred', fontsize=12)
plt.show()

#over-sampling with smote

In [None]:
# pip install imblearn

In [None]:
from imblearn import under_sampling, over_sampling
from imblearn.over_sampling import SMOTE


In [None]:
# Fixed seed so the oversampled set, and every score computed on it, is
# reproducible across runs.
smote = SMOTE(random_state=0)
X_re ,Y_re= smote.fit_resample(x,y)

In [None]:
len(X_re)

#under sampling with random under sampler

In [None]:
from imblearn.under_sampling import RandomUnderSampler

In [None]:
# Fixed seed so the same majority-class rows are dropped on every run.
under = RandomUnderSampler(random_state=0)
X_res ,Y_res= under.fit_resample(x, y)

In [None]:
len(X_res)

# Cross Validation

In [None]:
# evaluate Models using 10-fold cross-validation
from numpy import mean
#from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
# prepare the cross-validation procedure
cv = KFold(n_splits=10, random_state=1, shuffle=True)
#cv2 = KFold(n_splits=5, random_state=1, shuffle=True)
# create model

# Algorithm

#Decision Tree

In [None]:
from sklearn.tree import DecisionTreeClassifier,export_graphviz
dt_clf = DecisionTreeClassifier()

#with unbalanced data
dt_cv_accuracy = cross_val_score(dt_clf, x, y, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy)

#balanced class with smote
dt_cv_accuracy1 = cross_val_score(dt_clf, X_re, Y_re, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)

In [None]:
#with unbalanced data
dt_cv_accuracy = cross_val_score(dt_clf, x, y, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy)

#balanced class with smote
dt_cv_accuracy1 = cross_val_score(dt_clf, X_re, Y_re, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)

In [None]:
#balanced class with smote
dt_cv_accuracy1 = cross_val_score(dt_clf, X_re, Y_re, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)

In [None]:
#balanced data with random under sampler
dt_cv_accuracy2 = cross_val_score(dt_clf, X_res, Y_res, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy2)

#Gradiant Boosting

In [None]:
from sklearn.ensemble import GradientBoostingClassifier
gbMod = GradientBoostingClassifier()

In [None]:
#with unbalanced data
gb_accuracy = cross_val_score(gbMod, x, y, cv=cv).mean()
print("Cross Validation Accuracy GB: %.4f" % gb_accuracy)

In [None]:
#balanced data with smote
gb_accuracy1 = cross_val_score(gbMod, X_re, Y_re, cv=cv).mean()
print("Cross Validation Accuracy GB: %.4f" % gb_accuracy1)

In [None]:
#balanced data with random under sampler
gb_accuracy2 = cross_val_score(gbMod, X_res, Y_res, cv=cv).mean()
print("Cross Validation Accuracy GB: %.4f" % gb_accuracy2)

#Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier 
algorithm =  RandomForestClassifier()

In [None]:
#with unbalanced data
scores = cross_val_score(algorithm, x, y, scoring='accuracy', cv=cv)
# report performance
print('Accuracy RF: %.4f' % mean(scores))

In [None]:
#balanced data with smote
scores1 = cross_val_score(algorithm, X_re, Y_re, scoring='accuracy', cv=cv)
# report performance
print('Accuracy RF: %.4f' % mean(scores1))

In [None]:
#balanced data with random under sampler
scores2 = cross_val_score(algorithm, X_res, Y_res, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(scores2))

#Multi-layer neural network

In [None]:
# Import MLPClassifer 
from sklearn.neural_network import MLPClassifier

In [None]:
#without class balancing
model1 = MLPClassifier(hidden_layer_sizes=(5,2),#number of attritube * class values under square root 
                    random_state=1,
                    verbose=False,
                    learning_rate_init=0.01)
# evaluate model
scores = cross_val_score(model1, x, y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.4f' % mean(scores))

In [None]:
# evaluate model
scores1 = cross_val_score(model1, X_re, Y_re, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.4f' % mean(scores1))

In [None]:
# evaluate model
scores2 = cross_val_score(model1, X_res, Y_res, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.4f' % mean(scores2))

# Feature Selection

#chi2 test for feature selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every input against the target with a chi-squared test (k="all" keeps
# all of them; the filtering is done by the thresholds below).
feature_selector = SelectKBest(chi2, k = "all")
fit = feature_selector.fit(x,y)

# One row per input variable: name, p-value, chi2 score.
p_values = pd.DataFrame(fit.pvalues_)
scores = pd.DataFrame(fit.scores_)
input_variable_names = pd.DataFrame(x.columns)
summary_stats = pd.concat([input_variable_names, p_values, scores], axis = 1)
summary_stats.columns = ["input_variable", "p_value", "chi2_score"]
summary_stats.sort_values(by = "p_value", inplace = True)

p_value_threshold = 0.05
score_threshold = 5

selected_variables = summary_stats.loc[(summary_stats["chi2_score"] >= score_threshold) &
                                       (summary_stats["p_value"] <= p_value_threshold)]
selected_variables1 = selected_variables["input_variable"]
# Select by column NAME; indexing with the summary table itself
# (selected_variables) is not a valid column selection.
X_new = x[selected_variables1.tolist()]

In [None]:
selected_variables1

In [None]:
X = data3[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant"]]
Y = data3[["Referred"]]

#Decision tree

In [None]:
dt_cv_accuracy1 = cross_val_score(dt_clf, X, Y, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_cv_accuracy1)

#Gradiant boosting

In [None]:
gb_accuracy1 = cross_val_score(gbMod, X, Y, cv=cv).mean()
# label fixed: this is the gradient boosting score
print("Cross Validation Accuracy GB: %.4f" % gb_accuracy1)

#Random Forest

In [None]:
scores1 = cross_val_score(algorithm, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(scores1))

#Multi-layer neural network

In [None]:
# evaluate model
scores = cross_val_score(model1, X, Y, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.3f' % mean(scores))

# Prediction for referred

In [None]:
# final features 
# X = data3[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant"]]
# Y = data3[["Referred"]]

In [None]:
data3.head(10)

In [None]:
new_data = [[0,0,2, 2, 5, 1, 0, 0, 1]]
# 0=NO
# 1=YES

In [None]:
dt_clf.fit(X, Y)
Referred_output = dt_clf.predict(new_data)
Referred_output

In [None]:
 data3["Referred"].value_counts()

# 2. contraceptive method received

In [None]:
x1 = data3[["Region", "Hus", "Age", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant", "Referred"]]
y1 = data3[["Method"]]

In [None]:
#define values
values = ['None', 'none']

#drop rows that contain any value in the list
data6 = data2[data2.Method.isin(values) == False]

In [None]:
data6

In [None]:
data6.Region = label_encoder.fit_transform(data6['Region'])
data6.Hus = label_encoder.fit_transform(data6['Hus'])
data6.Premise = label_encoder.fit_transform(data6['Premise'])
data6.Children = label_encoder.fit_transform(data6['Children'])
data6.P_test = label_encoder.fit_transform(data6['P_test'])
data6.Referred_by = label_encoder.fit_transform(data6['Referred_by'])
data6.S_type = label_encoder.fit_transform(data6['S_type'])
data6.CEC = label_encoder.fit_transform(data6['CEC'])
data6.Pregnant = label_encoder.fit_transform(data6['Pregnant'])
data6.Referred = label_encoder.fit_transform(data6['Referred'])
data6.Method = label_encoder.fit_transform(data6['Method'])

# Class balancing

In [None]:
warnings.filterwarnings('ignore')
class_count = data6['Method'].value_counts()
sns.set(style="darkgrid")
# x/y must be passed by keyword: seaborn >= 0.12 rejects positional data vectors.
sns.barplot(x=class_count.index, y=class_count.values, alpha=0.9)
plt.title('Frequency Distribution of Class')
plt.ylabel('Number of girls', fontsize=12)
plt.xlabel('Contraceptive method', fontsize=12)
plt.show()

#over sampling

In [None]:
x_re ,y_re= smote.fit_resample(x1,y1)

#under sampling

In [None]:
x_res ,y_res= under.fit_resample(x1, y1)

#Decision Tree

In [None]:
dt_accuracy = cross_val_score(dt_clf, x1, y1, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_accuracy)

In [None]:
dt_accuracy1 = cross_val_score(dt_clf, x_re, y_re, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_accuracy1)

In [None]:
dt_accuracy2 = cross_val_score(dt_clf, x_res, y_res, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_accuracy2)

In [None]:
#Gradiant Boosting

In [None]:
gb_acc = cross_val_score(gbMod, x1, y1, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % gb_acc)

In [None]:
gb_acc1 = cross_val_score(gbMod, x_re, y_re, cv=cv, n_jobs=-1).mean()
print("Cross Validation Accuracy DT: %.4f" % gb_acc1)

In [None]:
gb_acc2 = cross_val_score(gbMod, x_res, y_res, cv=cv,  n_jobs=-1).mean()
print("Cross Validation Accuracy DT: %.4f" % gb_acc2)

In [None]:
#Random Forest

In [None]:
rf_acc = cross_val_score(algorithm, x1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(rf_acc))

In [None]:
rf_acc1 = cross_val_score(algorithm, x_re, y_re, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(rf_acc1))

In [None]:
rf_acc2 = cross_val_score(algorithm, x_res, y_res, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(rf_acc2))

In [None]:
#Multi-layer neural network

In [None]:
# evaluate model
mlnn_acc = cross_val_score(model1, x1, y1, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.4f' % mean(mlnn_acc))

In [None]:
# evaluate model
mlnn_acc1 = cross_val_score(model1, x_re, y_re, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.4f' % mean(mlnn_acc1))

In [None]:
# evaluate model on the under-sampled data; stored as mlnn_acc2 so the SMOTE
# result in mlnn_acc1 is not overwritten (matches dt_accuracy2 / gb_acc2 / rf_acc2)
mlnn_acc2 = cross_val_score(model1, x_res, y_res, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.4f' % mean(mlnn_acc2))

# Feature Selection

In [None]:
from sklearn.feature_selection import SelectKBest, chi2

# Score every input against the Method target with a chi-squared test
# (k="all" keeps all of them; the filtering is done by the thresholds below).
feature_selector = SelectKBest(chi2, k = "all")
fit = feature_selector.fit(x1,y1)

# One row per input variable: name, p-value, chi2 score.
p_values = pd.DataFrame(fit.pvalues_)
scores = pd.DataFrame(fit.scores_)
input_variable_names = pd.DataFrame(x1.columns)
summary_stats = pd.concat([input_variable_names, p_values, scores], axis = 1)
summary_stats.columns = ["input_variable", "p_value", "chi2_score"]
summary_stats.sort_values(by = "p_value", inplace = True)

p_value_threshold = 0.05
score_threshold = 5

selected_variabless = summary_stats.loc[(summary_stats["chi2_score"] >= score_threshold) &
                                       (summary_stats["p_value"] <= p_value_threshold)]
selected_variables2 = selected_variabless["input_variable"]
# Select by column NAME; indexing with the summary table itself
# (selected_variabless) is not a valid column selection.
X_new = x1[selected_variables2.tolist()]

In [None]:
selected_variables2

In [None]:
X1 = data3[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant", "Referred"]]
Y1 = data3[["Method"]]

In [None]:
x_smote ,y_smote= smote.fit_resample(X1,Y1)

In [None]:
rf_accu = cross_val_score(algorithm, X1, Y1, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(rf_accu))

In [None]:
rf_accu1 = cross_val_score(algorithm, x_smote, y_smote, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(rf_accu1))

In [None]:
dt_acc = cross_val_score(dt_clf, X1, Y1, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_acc)

In [None]:
dt_acc1 = cross_val_score(dt_clf, x_smote, y_smote, cv=cv).mean()
print("Cross Validation Accuracy DT: %.4f" % dt_acc1)

In [None]:
gb_accu = cross_val_score(gbMod, X1, Y1, cv=cv).mean()
print("Cross Validation Accuracy GB: %.4f" % gb_accu)

In [None]:
gb_accu1 = cross_val_score(gbMod, x_smote, y_smote, cv=cv, n_jobs=-1).mean()
print("Cross Validation Accuracy GB: %.4f" % gb_accu1)

In [None]:
mlnn_accu = cross_val_score(model1, X1, Y1, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.3f' % mean(mlnn_accu))

In [None]:
mlnn_accu1 = cross_val_score(model1, x_smote, y_smote, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy of MLNN: %.3f' % mean(mlnn_accu1))

# Hyper-Parameter Tuning

#using randomized search

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import randint
from sklearn.model_selection import GridSearchCV

In [None]:
# Search space for the random forest.
# - min_samples_split starts at 2: an integer value of 1 is rejected by scikit-learn.
# - bootstrap is a boolean flag, so it is sampled from [True, False] rather
#   than from an integer range.
rfHyperParams = {'max_features': randint(1,10),
                 'min_samples_split': randint(2,10),
                 'min_samples_leaf': randint(1,5),
                 'bootstrap': [True, False],
                 'n_estimators': randint(10, 500),
                 'max_depth': randint(1,10)}

In [None]:
# Method has more than two classes, so plain 'roc_auc' (binary only) cannot
# score it; use the one-vs-rest multiclass variant. random_state makes the
# sampled parameter combinations repeatable, and y is flattened to 1-D as
# scikit-learn expects.
gridSearchRF = RandomizedSearchCV(estimator=algorithm, param_distributions=rfHyperParams, n_iter=10,
                                   scoring='roc_auc_ovr', random_state=1,
                                   verbose=2).fit(x_smote, y_smote.values.ravel())

In [None]:
gridSearchRF.best_params_,gridSearchRF.best_score_

In [None]:
rfMod = RandomForestClassifier(max_depth=4, min_samples_split=6,
                               min_samples_leaf=2
                               )

In [None]:
# Evaluate the tuned forest on the same SMOTE-balanced, feature-selected data
# the hyper-parameter search was run on (x_smote/y_smote); x_re/y_re still
# contain the Age column that feature selection removed.
rf_accu = cross_val_score(rfMod, x_smote, y_smote, scoring='accuracy', cv=cv, n_jobs=-1)
# report performance
print('Accuracy RF: %.4f' % mean(rf_accu))

# confusion matrix

In [None]:
from sklearn.metrics import confusion_matrix

matrix=plot_confusion_matrix(algorithm,x_test,y_test,cmap=plt.cm.Reds)
# matrix.ax_.set_title('Confusion Matrix',color='white')
# plt.xlabel('True Label',color='white')
# plt.ylabel('True Label',color='white')
plt.gcf().axes[0].tick_params(color='white')
plt.gcf().axes[1].tick_params(color='white')
plt.gcf().set_size_inches(10,5)
plt.show()
# confusion_matrix(y_test, predict)
# confusionchart(confusion_matrix(y_true, y_pred))
# correct=27+2+27+207+6
# incorrect=2+1
# total=274 #the total is y_test length
# accuray=correct/total

In [None]:
import numpy as np
from sklearn.metrics import classification_report

print(classification_report(y_test, predict))

In [None]:
data6.columns

In [None]:
X1.columns

In [None]:
data3['Region'].unique()

In [None]:
data3.head()

# Prediction

In [None]:
# X1 = data3[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant", "Referred"]]
# Y1 = data3[["Method"]]
print(data3['Method'].value_counts().sum())
data3['Method'].count()
data3['Referred'].value_counts()

In [None]:
data2['Referred'].value_counts()

In [None]:
data.columns

In [None]:
new_data = [[0, 0, 1, 1, 0, 5, 1, 0,1,0]]
# 0=Daily Pill
# 1=IUCD        
# 2=Implant 3 year Implanon
# 3=Implant 5 year Jadelle
# 4=Injection
# 5=None

In [None]:
algorithm.fit(X1, Y1)
new_output = algorithm.predict(new_data)

In [None]:
new_output

In [None]:
data.columns

In [None]:
data6.columns

In [None]:
data['used_ec_or_condoms_last_time_you_had_sex'].unique()

In [None]:
new_data = [[0,0,2, 2, 5, 1, 0, 0, 1]]
dt_clf.fit(X, Y)
Referred_output = dt_clf.predict(new_data)
Referred_output

In [None]:
data2['Region'].value_counts()

In [None]:
data3['Region'].value_counts()

In [None]:
data2['Hus'].value_counts()

In [None]:
data3['Hus'].value_counts()

In [None]:
data2['Children'].value_counts()

In [None]:
data3['Children'].value_counts()

In [None]:
data2['S_type'].value_counts()

In [None]:
data3['S_type'].value_counts()

In [None]:
data2['Pregnant'].value_counts()

In [None]:
data3['Pregnant'].value_counts()

In [None]:
# X = data3[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant"]]
# Y = data3[["Referred"]]

# X1 = data3[["Region", "Hus", "Premise", "Children", "P_test", "Referred_by", "S_type", "CEC", "Pregnant", "Referred"]]
# Y1 = data3[["Method"]]

def pridictReferred(Region,Hus,Premise,Children,P_test,Referred_by,S_type,CEC,Pregnant):
    """Ask the module-level ``dt_clf`` model for the "Referred" prediction of one record.

    The nine arguments are the already-encoded feature values. They are packed,
    in signature order, into a single row and handed to ``dt_clf.predict`` as a
    one-row batch.

    Returns:
        Whatever ``dt_clf.predict`` returns for that one-row batch.
    """
    row = [Region, Hus, Premise, Children, P_test, Referred_by, S_type, CEC, Pregnant]
    return dt_clf.predict([row])

def pridictMethod(Region,Hus,Premise,Children,P_test,Referred_by,S_type,CEC,Pregnant,Referred):
    """Ask the module-level ``algorithm`` model for the "Method" prediction of one record.

    The ten arguments are the already-encoded feature values; ``Referred`` comes
    last. They are packed, in signature order, into a single row and handed to
    ``algorithm.predict`` as a one-row batch.

    Returns:
        Whatever ``algorithm.predict`` returns for that one-row batch.
    """
    row = [Region, Hus, Premise, Children, P_test, Referred_by, S_type, CEC, Pregnant, Referred]
    return algorithm.predict([row])
    
def dataPredictor():
    """Read the form, predict "Referred" and then "Method", and display both.

    Used as the command of the "pridict" button. Every menu selection is
    translated to the numeric code handed to the models. The predicted
    "Referred" code is passed to ``pridictMethod`` as its last feature.

    Results are written to ``ReferredlblMessege`` and ``MethodlblMessege``. A
    predicted code that has no caption leaves the corresponding label untouched.
    A menu selection that has no numeric code is reported on the result label
    and no prediction is attempted.
    """
    # NOTE(review): these list codes are hand-written and start at 1, while the
    # notebook encodes the training columns with LabelEncoder, which numbers the
    # sorted classes from 0. Confirm every code below against the frame the
    # models were actually fitted on.
    regionCodes = {"Oromiya": 1, "SNNP": 2, "Sidama": 3}
    premiseCodes = {"Health Post": 1, "Home": 2, "Meeting hall": 3}
    childrenCodes = {"0": 1, "1": 2, "2": 3, "3": 4, "4": 5}
    # The notebook merges 'Friend/peer' into 'Friend/Peer', so both spellings
    # share one code. 'Community Leader' has never had a code here; it is
    # reported as unsupported below instead of being guessed.
    referredByCodes = {'WDA': 1, 'Navigator': 2, 'HEW': 3, 'Husband': 4,
                       'Friend/peer': 5, 'Friend/Peer': 5}
    cecCodes = {'None': 1, 'Used Condoms': 2, 'Used EC': 3}

    referredTexts = (
        (0, "The Referred is NO"),
        (1, "The Referred is YES"),
    )
    methodTexts = (
        (0, "The Method is Daily Pill"),
        (1, "The Method is IUCD"),
        (2, "The Method is Implant 3 year Implanon"),
        (3, "The Method is Implant 5 year Jadelle"),
        (4, "The Method is Injection"),
        (5, "The Method is None"),
    )

    def showResult(label, code, captions):
        # Compare with == (not a dict lookup) so any scalar type returned by
        # the model is matched exactly as the original if/elif chain did.
        for known, caption in captions:
            if code == known:
                label.config(text=caption)
                break

    Region = regionCodes.get(regionmenu.get())
    Hus = 1 if discussion_includes_husbandmenu.get() == "YES" else 0
    Premise = premiseCodes.get(Follow_up_locationlblmenu.get())
    Children = childrenCodes.get(number_of_childrenlblmenu.get())
    # Read the pregnancy-test menu; this used to read the husband menu, so
    # P_test merely mirrored Hus.
    # presumably 'Tested-negative' is coded 1, like the other two-valued
    # fields whose alphabetically later option is 1 — TODO confirm
    P_test = 1 if pregnancy_testlblmenu.get() == "Tested-negative" else 0
    Referred_by = referredByCodes.get(referred_bylblmenu.get())
    # Read the session-type menu; this used to compare the husband menu with
    # "Individual Session", which could never match, so S_type was always 0.
    S_type = 1 if Sessionlblmenu.get() == "Individual Session" else 0
    CEC = cecCodes.get(CEClblmenu.get())
    Pregnant = 1 if are_you_pregnantmenu.get() == "Unknown" else 0

    features = (Region, Hus, Premise, Children, P_test, Referred_by, S_type, CEC, Pregnant)
    if any(code is None for code in features):
        # A selection without a code used to surface as UnboundLocalError
        # inside the Tk callback; tell the user instead.
        ReferredlblMessege.config(text="Selection not supported")
        MethodlblMessege.config(text=" ")
        return

    referredResult = pridictReferred(*features)
    showResult(ReferredlblMessege, referredResult[0], referredTexts)

    methodResult = pridictMethod(*features, referredResult[0])
    showResult(MethodlblMessege, methodResult[0], methodTexts)
        
def ErrorMessage(message):
    """Show ``message`` as the text of the module-level ``lblError`` widget.

    NOTE(review): no ``lblError`` is created by the GUI code in this cell;
    confirm it exists before calling this, otherwise the call raises NameError.
    """
    
    lblError.config(text=message)
window = Tk()
window.title("Welcome to refferd and method prediction system ")
window.geometry('1000x600')

# NOTE(review): font=(50), font=(12) and font=(18) are plain integers, not
# (family, size) tuples — confirm the fonts that were intended.
lbl1 = Label(window, text="Let us predict refferd and method ",font=(50),fg="green")
lbl1.grid(column=1, row=1,padx=10, pady=10)


def _addChoiceRow(row, caption, default, choices, label_padx=20, label_pady=2, menu_pad=(20, 10)):
    """Add one caption + drop-down row to ``window``.

    Args:
        row: grid row shared by the caption (column 0) and the menu (column 1).
        caption: text of the caption label.
        default: initially selected value.
        choices: values offered by the drop-down, in display order.
        label_padx, label_pady: grid padding of the caption label.
        menu_pad: ``(padx, pady)`` grid padding of the menu, or ``None`` to
            grid the menu without explicit padding.

    Returns:
        ``(label, variable, menu)`` — the caption Label, the StringVar holding
        the current selection, and the OptionMenu widget.
    """
    label = Label(window, text=caption, font=(12), fg="green", anchor="w", width=15)
    label.grid(column=0, row=row, padx=label_padx, pady=label_pady)

    variable = StringVar()
    variable.set(default)

    menu = OptionMenu(window, variable, *choices)
    if menu_pad is None:
        menu.grid(row=row, column=1)
    else:
        menu.grid(row=row, column=1, padx=menu_pad[0], pady=menu_pad[1])
    menu.config(width=25)
    menu.config(bg="White")
    return label, variable, menu


# One row per model feature. The *menu variables are read by dataPredictor().
# `drop` is rebound on every row, as before, and ends up naming the last menu.
regionlbl, regionmenu, drop = _addChoiceRow(
    4, "Region", "Oromiya", ("Oromiya", "Sidama", "SNNP"),
    label_padx=0, label_pady=10)

are_you_pregnantlbl, are_you_pregnantmenu, drop = _addChoiceRow(
    5, "Are you pregnant", "Unknown", ("Unknown", "Not pregnant"),
    label_padx=0, label_pady=5)

discussion_includes_husbandlbl, discussion_includes_husbandmenu, drop = _addChoiceRow(
    6, "Discussion includes husband", "YES", ("YES", "NO"))

Follow_up_locationlbl, Follow_up_locationlblmenu, drop = _addChoiceRow(
    7, "Follow up location", "Home", ("Home", "Health Post", "Meeting hall"))

# This menu was gridded without padding in the original layout.
number_of_childrenlbl, number_of_childrenlblmenu, drop = _addChoiceRow(
    8, "number of children", "0", ("0", "1", "2", "3", "4"),
    menu_pad=None)

pregnancy_testlbl, pregnancy_testlblmenu, drop = _addChoiceRow(
    9, "pregnancy test", "No-test given", ('No-test given', 'Tested-negative'))

# 'Friend/Peer' is no longer offered next to 'Friend/peer': the notebook merges
# the two spellings into one category, so the menu listed it twice.
referred_bylbl, referred_bylblmenu, drop = _addChoiceRow(
    10, "Referred by", "Navigator",
    ('WDA', 'Navigator', 'HEW', 'Husband', 'Friend/peer', 'Community Leader'))

Sessionlbl, Sessionlblmenu, drop = _addChoiceRow(
    11, "Session type", "Individual Session", ('Individual Session', 'Group Session'))

CEClbl, CEClblmenu, drop = _addChoiceRow(
    12, "CEC", "None", ('None', 'Used Condoms', 'Used EC'))

# Button that runs both predictions.
btn = Button(window, text="pridict",font=(12), command=dataPredictor, bg="green", fg="white",anchor="e")
btn.grid(column=1, row=13,padx=0)

# Result labels that dataPredictor() fills in.
ReferredlblMessege=Label(window, text=" ",font=(18), fg="blue", anchor="w", width=30)
ReferredlblMessege.grid(column=2, row=4,padx=20, pady=0)

MethodlblMessege=Label(window, text=" ",font=(18), fg="blue",anchor="w", width=30)
MethodlblMessege.grid(column=2, row=5,padx=20, pady=0)

# Blocks this cell until the window is closed.
window.mainloop()

