In [2]:
import pandas as pd
import numpy as np

In [3]:
# Read the play-tennis table from the CSV file into a DataFrame.
csv_path = 'data/play_tennis.csv'
data = pd.read_csv(csv_path)

In [4]:
# Preview the first five rows of the raw table.
data.head(n=5)

Unnamed: 0,day,outlook,temp,humidity,wind,play
0,D1,Sunny,Hot,High,Weak,No
1,D2,Sunny,Hot,High,Strong,No
2,D3,Overcast,Hot,High,Weak,Yes
3,D4,Rain,Mild,High,Weak,Yes
4,D5,Rain,Cool,Normal,Weak,Yes


In [5]:
# (rows, columns) of the raw table, before any column is dropped.
data.shape

(14, 6)

In [6]:
# Remove the 'day' column (row labels D1, D2, ...), which is not used as a feature.
# Reassigning instead of using inplace=True, together with errors='ignore',
# keeps this cell idempotent: running it again after 'day' is already gone
# no longer raises a KeyError.
data = data.drop(columns=['day'], errors='ignore')

In [7]:
# Summary of the remaining columns (count / unique / top / freq in the output below).
data.describe()

Unnamed: 0,outlook,temp,humidity,wind,play
count,14,14,14,14,14
unique,3,3,2,2,2
top,Sunny,Mild,High,Weak,Yes
freq,5,6,7,8,9


Problem statement:
Given outlook = sunny, temp = hot, humidity = high and wind = weak,
will they play tennis or not?

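We have to compare the two posterior scores below. Naive Bayes assumes the features are conditionally independent given the class, and the common denominator p(sunny,hot,high,weak) is dropped because it is the same for both classes, so each right-hand side is proportional (∝) to the posterior rather than equal to it:

p(yes | sunny,hot,high,weak) ∝ p(sunny|yes)*p(hot|yes)*p(high|yes)*p(weak|yes) * p(yes)

p(no | sunny,hot,high,weak) ∝ p(sunny|no)*p(hot|no)*p(high|no)*p(weak|no) * p(no)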

Compare the two scores and predict the class with the larger one (the maximum a posteriori rule).

The query (problem statement) may change, so the algorithm first builds a lookup table (a dictionary) that stores every conditional probability it might need.

The outlook column has 3 possible values: sunny, overcast and rain. We compute the probability of each of these 3 values given yes and given no.

We do the same for the other columns.

We also need the class priors p(yes) and p(no).

In [8]:
# Number of rows per class of the target column 'play'.
data['play'].value_counts()

play
Yes    9
No     5
Name: count, dtype: int64

In [9]:
# Class priors from the 14 rows: 9 'Yes' days and 5 'No' days.
# py = P(play = Yes), pn = P(play = No)
py, pn = 9/14, 5/14

In [10]:
# Show the prior P(play = Yes).
print(py)

0.6428571428571429


In [11]:
# Show the prior P(play = No).
print(pn)

0.35714285714285715


In [12]:
# Next come the class-conditional probabilities.
# Contingency table of outlook (rows) against play (columns).
pd.crosstab(index=data['outlook'], columns=data['play'])

play,No,Yes
outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
Overcast,0,4
Rain,2,3
Sunny,3,2


In [13]:
# Likelihoods P(outlook | play), read off the outlook/play crosstab.
# Naming: p<o|r|s><n|y> = P(outlook = Overcast|Rain|Sunny | play = No|Yes).

# play = No (5 days). pon is exactly 0 because no Overcast day has play = No,
# so any product of likelihoods that includes pon is 0 as well.
pon, prn, psn = 0, 2/5, 3/5

# play = Yes (9 days).
poy, pry, psy = 4/9, 3/9, 2/9

In [14]:
# Contingency table of temp (rows) against play (columns).
pd.crosstab(index=data['temp'], columns=data['play'])

play,No,Yes
temp,Unnamed: 1_level_1,Unnamed: 2_level_1
Cool,1,3
Hot,2,2
Mild,2,4


In [15]:

# Likelihoods P(temp | play), read off the temp/play crosstab.
# Suffix n = given play = No (5 days), suffix y = given play = Yes (9 days).
pcooln, photn, pmildn = 1/5, 2/5, 2/5
pcooly, photy, pmildy = 3/9, 2/9, 4/9

In [16]:
# Contingency table of humidity (rows) against play (columns).
pd.crosstab(index=data['humidity'], columns=data['play'])

play,No,Yes
humidity,Unnamed: 1_level_1,Unnamed: 2_level_1
High,4,3
Normal,1,6


In [17]:
# Likelihoods P(humidity | play), read off the humidity/play crosstab.
# Suffix n = given play = No (5 days), suffix y = given play = Yes (9 days).
phighn, pnormaln = 4/5, 1/5
phighy, pnormaly = 3/9, 6/9

In [18]:
# Contingency table of wind (rows) against play (columns).
pd.crosstab(index=data['wind'], columns=data['play'])

play,No,Yes
wind,Unnamed: 1_level_1,Unnamed: 2_level_1
Strong,3,3
Weak,2,6


In [19]:
# Likelihoods P(wind | play), read off the wind/play crosstab.
# Suffix n = given play = No (5 days), suffix y = given play = Yes (9 days).
pstrongn, pweakn = 3/5, 2/5
pstrongy, pweaky = 3/9, 6/9

In [20]:
# Query: outlook = sunny, temp = hot, humidity = high, wind = weak.
# Each score starts from the class prior and is multiplied by the four
# class-conditional likelihoods, in the same left-to-right order as a
# single chained product.
pyes = py
for likelihood in (psy, photy, phighy, pweaky):
    pyes *= likelihood

pno = pn
for likelihood in (psn, photn, phighn, pweakn):
    pno *= likelihood

In [21]:
# Score for play = Yes (prior times the four likelihoods; not normalised).
pyes

0.007054673721340388

In [22]:
# Score for play = No (prior times the four likelihoods; not normalised).
pno

0.02742857142857143

In [23]:
# pno (0.0274) is larger than pyes (0.0071), so the prediction is 'No' (do not play).

# Doing the same using the scikit-learn library.

In [24]:
# Encode every column of `data` (features and target) as integer codes.
from sklearn import preprocessing

features = ['outlook', 'temp', 'humidity', 'wind']

# Fit one LabelEncoder per column and keep them all. A single shared encoder
# is refit on each column in turn and ends up remembering only the classes of
# the last column, so the feature codes could not be mapped back to labels
# (or a new query such as Sunny/Hot/High/Weak encoded consistently).
encoders = {
    column: preprocessing.LabelEncoder().fit(data[column])
    for column in data.columns
}
df = data.apply(lambda column: encoders[column.name].transform(column))

# Every listed feature must be present in the encoded table.
assert set(features) <= set(df.columns), "missing expected feature columns"

# Kept for backward compatibility: as before, `label_encoder` is the encoder
# fitted on the last column processed.
label_encoder = encoders[data.columns[-1]]

In [25]:
# Display the label-encoded table.
df

Unnamed: 0,outlook,temp,humidity,wind,play
0,2,1,0,1,0
1,2,1,0,0,0
2,0,1,0,1,1
3,1,2,0,1,1
4,1,0,1,1,1
5,1,0,1,0,0
6,0,0,1,0,1
7,2,2,0,1,0
8,2,0,1,1,1
9,1,2,1,1,1


In [26]:
# Column at position 4 of the encoded table; the output below names it 'play'.
df.iloc[:,4]

0     0
1     0
2     1
3     1
4     1
5     0
6     1
7     0
8     1
9     1
10    1
11    1
12    1
13    0
Name: play, dtype: int32

In [27]:
# Split features (first four columns) and target (last column) into
# training and testing sets; 40% of the rows go to the test set and the
# shuffle is seeded so the split is reproducible.
from sklearn.model_selection import train_test_split

feature_matrix = df.iloc[:, 0:4]
target_vector = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(
    feature_matrix,
    target_vector,
    test_size=0.4,
    random_state=42,
)


In [28]:
# Preview the first five rows of the held-out features.
X_test.head(n=5)

Unnamed: 0,outlook,temp,humidity,wind
9,1,2,1,1
11,0,2,0,0
0,2,1,0,1
12,0,1,1,1
5,1,0,1,0


In [29]:
# The features are label-encoded categories (outlook and temp take three
# codes: 0, 1, 2). BernoulliNB(binarize=0.0) thresholds every code at 0, which
# merges codes 1 and 2 into a single value and discards information.
# CategoricalNB treats each integer code as its own category, which matches
# the per-category probability tables computed by hand above.
from sklearn.naive_bayes import CategoricalNB

# min_categories fixes the number of categories per feature to what the full
# encoded table contains, so a category that happens to be absent from the
# small training split can still be scored at prediction time.
model = CategoricalNB(min_categories=df.iloc[:, 0:4].nunique().to_numpy())

In [30]:
# Fit the naive Bayes model on the training split.
model.fit(X=X_train, y=y_train)

In [31]:
# Predict the encoded 'play' label for every held-out row.
y_pred = model.predict(X=X_test)

In [32]:
# Share of held-out rows whose predicted label equals the true label, as a percentage.
from sklearn import metrics

accuracy_pct = metrics.accuracy_score(y_true=y_test, y_pred=y_pred) * 100
print("model accuracy(in %):", accuracy_pct)

model accuracy(in %): 66.66666666666666
