# Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2,f_classif

# Importing the dataset

The dataset "Naive-Bayes-Classification-Data.csv" contains 995 records for a binary diabetes-classification task. Each row is one record with two integer features, `glucose` and `bloodpressure`, and a binary target column, `diabetes` (0 or 1). There are no missing values.

Note: an earlier version of this description referred to the WHO tuberculosis dataset "TB_Burden_Country.csv" (country-level TB prevalence, incidence, mortality, and treatment success rates). That text does not match the file loaded below; the provenance of the CSV actually used here is not recorded in this notebook and should be added.

In [None]:
# Read the CSV from its hard-coded location and show the resulting frame
# (pandas truncates the display to the first and last rows).
csv_path = '/content/Naive-Bayes-Classification-Data.csv'
df = pd.read_csv(csv_path)
df

Unnamed: 0,glucose,bloodpressure,diabetes
0,40,85,0
1,40,92,0
2,45,63,1
3,45,80,0
4,40,73,1
...,...,...,...
990,45,87,0
991,40,83,0
992,40,83,0
993,40,60,1


In [None]:
# Summary statistics (count, mean, std, min, quartiles, max) for each numeric column.
df.describe()

Unnamed: 0,glucose,bloodpressure,diabetes
count,995.0,995.0,995.0
mean,44.306533,79.184925,0.500503
std,6.707567,9.340204,0.500251
min,20.0,50.0,0.0
25%,40.0,72.0,0.0
50%,45.0,80.0,1.0
75%,50.0,87.0,1.0
max,70.0,100.0,1.0


In [None]:
# Print the index range, per-column non-null counts and dtypes, and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 995 entries, 0 to 994
Data columns (total 3 columns):
 #   Column         Non-Null Count  Dtype
---  ------         --------------  -----
 0   glucose        995 non-null    int64
 1   bloodpressure  995 non-null    int64
 2   diabetes       995 non-null    int64
dtypes: int64(3)
memory usage: 23.4 KB


In [None]:
# Count missing values per column (isna is the canonical name; isnull is its alias).
df.isna().sum()

glucose          0
bloodpressure    0
diabetes         0
dtype: int64

# Selecting features and Target columns using iloc function

In [None]:
# Positional split: every column except the last is a feature (glucose, bloodpressure);
# the last column (diabetes) is the target.
X, y = df.iloc[:, :-1], df.iloc[:, -1]

# Splitting the dataset into the Training set and Test set

In [None]:
from sklearn.model_selection import train_test_split

# Reserve 20% of the rows as a test set; random_state pins the shuffle so the
# split is the same on every run.
split_parts = train_test_split(X, y, test_size=0.2, random_state=0)
X_train, X_test, y_train, y_test = split_parts


# Feature Scaling
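All columns in this dataset are numeric (int64), so no label encoding is needed.
Instead, the features are standardized with StandardScaler: the scaler is fitted on the training set only, and the same transformation is then applied to the test set.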


In [None]:
from sklearn.preprocessing import StandardScaler

# Learn the scaling statistics from the training split only, then apply that same
# fitted scaler to both splits so no test-set information leaks into the fit.
sc = StandardScaler().fit(X_train)
X_train, X_test = sc.transform(X_train), sc.transform(X_test)

# Training the Naive Bayes model on the Training set


In [None]:
from sklearn.naive_bayes import GaussianNB

# Gaussian Naive Bayes with default settings, fitted on the scaled training features.
classifier = GaussianNB()
classifier.fit(X=X_train, y=y_train)

# Predicting the Test set results


In [None]:
# Predict a class label for every row of the scaled test set and print the labels,
# followed by a 20-character divider line.
y_pred = classifier.predict(X_test)
divider = "~" * 20
print("Predicted Test Results : ", y_pred)
print(divider)

Predicted Test Results :  [1 1 1 0 0 0 1 1 0 1 0 0 1 1 1 1 1 0 1 1 0 0 0 0 1 1 1 0 0 0 1 1 0 0 0 1 0
 1 1 1 1 1 0 0 0 1 0 1 0 0 0 0 1 0 0 0 0 1 1 0 1 0 1 0 1 0 0 1 1 1 0 1 0 1
 0 1 1 1 0 1 0 1 1 1 0 1 0 0 1 0 1 0 0 0 1 0 1 0 0 1 0 0 1 0 0 0 0 1 1 0 0
 1 1 0 1 1 0 0 0 1 1 0 1 1 1 0 0 1 1 0 1 1 1 1 0 0 1 1 0 0 1 1 1 1 0 1 0 0
 0 1 1 0 1 1 1 0 1 1 1 1 0 1 0 0 1 0 1 1 1 0 0 0 1 0 0 0 0 1 1 1 1 0 1 1 0
 0 1 1 1 1 0 1 0 1 1 0 1 0 1]
~~~~~~~~~~~~~~~~~~~~


# Making the Confusion Matrix

In [None]:

from sklearn.metrics import confusion_matrix, accuracy_score

# Score the held-out predictions: overall accuracy (fraction correct) and the
# confusion matrix (rows are true classes, columns are predicted classes).
ac = accuracy_score(y_test, y_pred)
cm = confusion_matrix(y_test, y_pred)

# Report accuracy as a percentage, then a divider, then the matrix.
print("Model Accuracy : ", ac * 100, "%")
print("~" * 20)
print("Model Confusion Matrix : ")
print(cm)

Model Accuracy :  90.95477386934674 %
~~~~~~~~~~~~~~~~~~~~
Model Confusion Matrix : 
[[82  6]
 [12 99]]
