## Implementing Naive-Bayes algorithm with laplacian smoothing 

### Importing tennis.csv as a pandas DataFrame

In [55]:
import pandas as pd
import numpy as np

In [76]:
df = pd.read_csv('tennis.csv')
df

Unnamed: 0,Outlook,Temp.,Humidity,Windy,Play
0,sunny,hot,high,False,no
1,sunny,hot,high,True,no
2,overcast,hot,high,False,yes
3,rainy,mild,high,False,yes
4,rainy,cool,normal,False,yes
5,rainy,cool,normal,True,no
6,overcast,cool,normal,True,yes
7,sunny,mild,high,False,no
8,sunny,cool,normal,False,yes
9,rainy,mild,normal,False,yes


#### spliting dataframe into Test and Training set.

In [57]:
df_train = df.iloc[:10,:].copy()
df_test = df.iloc[10:,:].copy()
print(df_train.shape)
print(df_test.shape)

(10, 5)
(4, 5)


### spliting training set into different features

In [58]:
# features are considered independent in naive-Bayes better to split them out 

df_outlook = df[['Outlook','Play']]
df_temp = df[['Temp.','Play']]
df_Humidity = df[['Humidity','Play']]
df_windy = df[['Windy','Play']]
df_windy['Windy'] = df_windy.apply(lambda x : 'Y' if x['Windy'] == 'true ' else 'N' , axis =1)
df_windy

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  import sys


Unnamed: 0,Windy,Play
0,N,no
1,Y,no
2,N,yes
3,N,yes
4,N,yes
5,Y,no
6,Y,yes
7,N,no
8,N,yes
9,N,yes


In [59]:
windy = df_windy.reset_index().groupby(['Windy','Play']).count().unstack()
outlook = df_outlook.reset_index().groupby(['Outlook','Play']).count().unstack()
temp = df_temp.reset_index().groupby(['Temp.','Play']).count().unstack()
humidity = df_Humidity.reset_index().groupby(['Humidity','Play']).count().unstack()
windy.columns = ['no' ,'yes']
outlook.columns = ['no' ,'yes']
temp.columns = ['no' ,'yes']
humidity.columns = ['no' ,'yes']
outlook.fillna(0 ,inplace=True) # filling nan value with zero
outlook

Unnamed: 0_level_0,no,yes
Outlook,Unnamed: 1_level_1,Unnamed: 2_level_1
overcast,0.0,4.0
rainy,2.0,3.0
sunny,3.0,2.0


### computing totals as a row and column

In [60]:
windy.loc['All'] = windy.sum()
windy['Total'] = windy.sum(axis=1)
windy.index = ['Y' , 'N','All']
windy

Unnamed: 0,no,yes,Total
Y,2,6,8
N,3,3,6
All,5,9,14


In [61]:
outlook.loc['All'] = outlook.sum()
outlook['Total'] = outlook.sum(axis=1)
outlook

Unnamed: 0_level_0,no,yes,Total
Outlook,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
overcast,0.0,4.0,4.0
rainy,2.0,3.0,5.0
sunny,3.0,2.0,5.0
All,5.0,9.0,14.0


In [62]:
temp.loc['All'] = temp.sum()
temp['Total'] = temp.sum(axis=1)
temp

Unnamed: 0_level_0,no,yes,Total
Temp.,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
cool,1,3,4
hot,2,2,4
mild,2,4,6
All,5,9,14


In [63]:
humidity.loc['All'] = humidity.sum()
humidity['Total'] = humidity.sum(axis=1)
humidity

Unnamed: 0_level_0,no,yes,Total
Humidity,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
high,4,3,7
normal,1,6,7
All,5,9,14


In [64]:
temp.loc['hot','yes']

2

## making prediction and adding laplacian smoothing

In [65]:
input_for_prediction = {'temp':'mild','humidity':'high'}
outcome = 'yes'

In [88]:
def prediction(features,outcome):
    length = len(features)
    likely_mul = 1
    feature_occur = 0
    for key,value in features.items():
        if (key == 'outlook'):
            df = outlook
        elif(key == 'windy'):
            df = windy
        elif (key == 'humidity'):
            df = humidity
        else :
            df = temp
            
        likely_mul *= laplacian_likelihood(df,value,outcome,length)
        feature_occur += df.loc[value,'Total']
    outcome_prob = temp.loc['All',outcome]/temp.loc['All','Total']
    #print(outcome_prob)
    #print(likely_mul)
    probab = (likely_mul *outcome_prob)/(feature_occur/(14*4)) # finding final probablity
    
    return probab

In [89]:
def laplacian_likelihood(df,value,outcome,length):
    ret = (df.loc[value,outcome] + 1)/(df.loc['All',outcome] + length + 1)
    
    return ret

In [90]:
input_for_prediction = {'temp':'cool','windy':'N','humidity':'normal','outlook':'rainy'}
outcome = 'yes'

probablity = prediction(input_for_prediction,outcome)
print(probablity)
k =round(probablity)
if k == 1:
    print('yes')
else :
    print('No')

0.01908295785846806
No
