# Imports

In [41]:
import numpy as np
import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OrdinalEncoder, LabelEncoder

# Dataset Loading

In [42]:
data = pd.read_csv("weather_data.csv")
data

Unnamed: 0,outlook,temperature,humidity,windy,play
0,sunny,85,85,False,no
1,sunny,80,90,True,no
2,overcast,83,86,False,yes
3,rainy,70,96,False,yes
4,rainy,68,80,False,yes
5,rainy,65,70,True,no
6,overcast,64,65,True,yes
7,sunny,72,95,False,no
8,sunny,69,70,False,yes
9,rainy,75,80,False,yes


In [46]:
X = data.iloc[:,:-1]
y = data.iloc[:,-1]

# Model

In [47]:
class GaussianNaiveBayes:
    def __init__(self, alpha=1e-10):
        self._alpha = alpha
        self._classes = []
        self._priors = {}
        self._mean = {}
        self._var = {}
        self._feature_probs = {}

    def fit(self, X:pd.DataFrame, y:pd.Series, continuous_features = []):
        self._cont = continuous_features
        self._cat = [col for col in X if col not in self._cont]
        X_cont = X[self._cont]
        X_cat = X[self._cat]
        n_samples, _ = X.shape
        self._classes = np.unique(y)
        
        for c in range(len(self._classes)):
            X_cont_c = X_cont[y==c]
            X_cat_c = X_cat[y==c]
            
            self._priors[c] = X_cont_c.shape[0]/n_samples #prior calc is same for both cont and cat
            
            #for continuous features calculate mean and variance
            self._mean[c] = X_cont_c.mean(axis=0)
            self._var[c] = X_cont_c.var(axis=0) + self._alpha
            
            #for categorical features get word likelihood
            word_counts = np.sum(X_cat_c, axis=0) + self._alpha
            self._feature_probs[c] = word_counts/(np.sum(word_counts) + self._alpha * X_cat_c.shape[1])
            
    def predict(self, X:pd.DataFrame) -> np.array:
        if isinstance(X, pd.Series):
            X = X.to_frame().T
        elif isinstance(X, np.ndarray):
            X = pd.DataFrame(X, columns=self._cat + self._cont)
            
        predictions = []
        for _, sample in X.iterrows():
            print(sample)
            log_probs = {}
            for c in self._classes:
                log_prob = np.log(self._priors[c])

                if self._cont:
                    cont_values = sample[self._cont].values
                    log_prob += np.sum(np.log(self._pdf(c, cont_values)))

                if self._cat:
                    cat_values = sample[self._cat].values
                    log_prob += np.sum(cat_values * (np.log(self._feature_probs[c])))
    
                log_probs[c] = log_prob
            predictions.append(max(log_probs, key = log_probs.get))
        return np.array(predictions)

    def _pdf(self, c, x):
        num = np.exp(-((x - self._mean[c])**2) / (2 * self._var[c]))
        den = np.sqrt(2 * np.pi * self._var[c])
        return num/den

# Training

In [48]:
categorical_features = ['outlook','windy']
preprocessor = ColumnTransformer(
    transformers=[
        ('cat', OrdinalEncoder(), categorical_features)
    ],
    remainder='passthrough'
)
X_transformed = preprocessor.fit_transform(X)
X_train = pd.DataFrame(
    X_transformed, 
    columns=preprocessor.get_feature_names_out()
)
X_train

Unnamed: 0,cat__outlook,cat__windy,remainder__temperature,remainder__humidity
0,2.0,0.0,85.0,85.0
1,2.0,1.0,80.0,90.0
2,0.0,0.0,83.0,86.0
3,1.0,0.0,70.0,96.0
4,1.0,0.0,68.0,80.0
5,1.0,1.0,65.0,70.0
6,0.0,1.0,64.0,65.0
7,2.0,0.0,72.0,95.0
8,2.0,0.0,69.0,70.0
9,1.0,0.0,75.0,80.0


In [49]:
y_encoder = LabelEncoder()
y_train = y_encoder.fit_transform(y)

In [50]:
model = GaussianNaiveBayes()
model.fit(X_train,y_train, continuous_features=[])

# Testing

In [51]:
test_data = {
    'outlook': ['sunny', 'rainy'],
    'temperature': [85, 65], 
    'humidity': [85, 70],
    'windy': [False, True]
}
test_df = pd.DataFrame(test_data)

print(test_df)

X_test = preprocessor.transform(test_df)

X_test

preds = model.predict(X_test)

pred_labels = y_encoder.inverse_transform(preds)
print("\nPredictions for Test Samples:")
for orig, enc, pred in zip(test_df.values, X_test, pred_labels):
    print(f"Original: {orig}, Encoded: {enc}, Predicted: {pred}")

  outlook  temperature  humidity  windy
0   sunny           85        85  False
1   rainy           65        70   True
cat__outlook               2.0
cat__windy                 0.0
remainder__temperature    85.0
remainder__humidity       85.0
Name: 0, dtype: float64
cat__outlook               1.0
cat__windy                 1.0
remainder__temperature    65.0
remainder__humidity       70.0
Name: 1, dtype: float64

Predictions for Test Samples:
Original: ['sunny' 85 85 False], Encoded: [ 2.  0. 85. 85.], Predicted: yes
Original: ['rainy' 65 70 True], Encoded: [ 1.  1. 65. 70.], Predicted: yes
