<h1>1. Dataset Definition</h1>

In [38]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
import joblib
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np

np.random.seed(42)

# Size of the synthetic dataset and the categories a record can take.
n = 3000
outlook_choices = ['sunny', 'overcast', 'rainy']

# The random draws happen in dict order (outlook, temperature, humidity,
# windy); keep that order, otherwise the seeded values change.
data = {
    'outlook': np.random.choice(outlook_choices, size=n),
    'temperature': np.random.randint(low=60, high=100, size=n),  # integers 60..99 (°F), upper bound exclusive
    'humidity': np.random.randint(low=50, high=100, size=n),     # integers 50..99 (%), upper bound exclusive
    'windy': np.random.choice([True, False], size=n),
}

df_large = pd.DataFrame(data)

# Apply rules to decide play_tennis
def decide_play(row):
    """Return the rule-based 'yes'/'no' play decision for one weather record.

    Rules:
      * overcast -> always 'yes'
      * sunny    -> 'no' when humidity is above 75, otherwise 'yes'
      * rainy    -> 'no' when windy, otherwise 'yes'

    Args:
        row: mapping-like record (dict or DataFrame row) providing 'outlook'
            and, depending on the outlook, 'humidity' or 'windy'.

    Returns:
        'yes' or 'no'; None when the outlook is none of the three known values.
    """
    outlook = row['outlook']

    if outlook == 'overcast':
        return 'yes'

    # For the remaining outlooks a single condition vetoes playing.
    if outlook == 'sunny':
        blocked = row['humidity'] > 75
    elif outlook == 'rainy':
        blocked = row['windy']
    else:
        return None

    return 'no' if blocked else 'yes'

# Label every record with the rule-based decision (one call per row).
play_labels = df_large.apply(decide_play, axis=1)
df_large['play_tennis'] = play_labels

# Preview the first rows and report the overall size.
print("Synthetic Tennis Dataset (first 10 rows):", df_large.head(10), sep="\n")
print(f"\nDataset shape: {df_large.shape}")



Synthetic Tennis Dataset (first 10 rows):
    outlook  temperature  humidity  windy play_tennis
0     rainy           89        75  False         yes
1     sunny           81        95  False          no
2     rainy           87        63   True          no
3     rainy           66        94  False         yes
4     sunny           94        93   True          no
5     sunny           87        73   True         yes
6     rainy           73        90   True          no
7  overcast           69        50  False         yes
8     rainy           75        92  False         yes
9     rainy           75        71  False         yes

Dataset shape: (3000, 5)


<h1>2. Data Preprocessing and Model Training</h1>

In [39]:
class TennisPlayPredictor:
    """Play-tennis classifier: label encoding + standard scaling + Gaussian Naive Bayes.

    The preprocessing objects live on the instance so the whole predictor can
    be saved with joblib and reused for inference.

    Attributes:
        label_encoders: dict mapping a column name ('outlook', 'windy',
            'play_tennis') to the LabelEncoder fitted for it.
        scaler: StandardScaler applied to 'temperature' and 'humidity'.
        model: the GaussianNB classifier.
        feature_columns: feature column order seen while fitting (None until
            fitted); used to align frames passed to predict().

    NOTE(review): the integer codes produced for 'outlook' and 'windy' are fed
    to GaussianNB as ordinary numeric features.
    """

    CATEGORICAL_COLS = ['outlook', 'windy']
    NUMERICAL_COLS = ['temperature', 'humidity']
    TARGET_COL = 'play_tennis'
    # Fallback column order for predictors saved before feature_columns existed.
    DEFAULT_FEATURE_COLUMNS = ['outlook', 'temperature', 'humidity', 'windy']

    def __init__(self):
        self.label_encoders = {}
        self.scaler = StandardScaler()
        self.model = GaussianNB()
        self.feature_columns = None

    # ------------------------------------------------------------------
    # Internal helpers
    # ------------------------------------------------------------------
    def _fit_label_encoders(self, df):
        """Fit one LabelEncoder per categorical/target column and record the feature order."""
        for col in self.CATEGORICAL_COLS + [self.TARGET_COL]:
            self.label_encoders[col] = LabelEncoder().fit(df[col])
        self.feature_columns = [c for c in df.columns if c != self.TARGET_COL]

    def _transform_features(self, df, fit_scaler=False):
        """Return the encoded and scaled feature frame for ``df`` (``df`` itself is not modified).

        Columns are selected by name in the order seen during fitting, so the
        caller's column order and any extra columns do not matter.

        Raises:
            ValueError: if ``df`` lacks one of the required feature columns.
        """
        columns = getattr(self, 'feature_columns', None) or list(self.DEFAULT_FEATURE_COLUMNS)
        missing = [c for c in columns if c not in df.columns]
        if missing:
            raise ValueError(f"Input data is missing required column(s): {missing}")

        X = df[columns].copy()
        for col in self.CATEGORICAL_COLS:
            X[col] = self.label_encoders[col].transform(X[col])

        if fit_scaler:
            X[self.NUMERICAL_COLS] = self.scaler.fit_transform(X[self.NUMERICAL_COLS])
        else:
            X[self.NUMERICAL_COLS] = self.scaler.transform(X[self.NUMERICAL_COLS])
        return X

    def _encode_target(self, df):
        """Return the integer-encoded target column of ``df``."""
        return self.label_encoders[self.TARGET_COL].transform(df[self.TARGET_COL])

    # ------------------------------------------------------------------
    # Public API
    # ------------------------------------------------------------------
    def preprocess_data(self, df):
        """Fit the encoders and the scaler on ``df`` and return its encoded form.

        Args:
            df: DataFrame holding the feature columns plus 'play_tennis'.

        Returns:
            (X, y_encoded): the transformed feature DataFrame and the
            integer-encoded target array.
        """
        self._fit_label_encoders(df)
        X = self._transform_features(df, fit_scaler=True)
        y_encoded = self._encode_target(df)
        return X, y_encoded

    def train(self, df, test_size=0.2, random_state=42):
        """Train the Naive Bayes model and report hold-out accuracy.

        The data is split BEFORE the scaler is fitted, so the scaling
        statistics come from the training rows only and the held-out rows do
        not leak into preprocessing.

        Args:
            df: labelled DataFrame (features plus 'play_tennis').
            test_size: fraction of rows held out for evaluation.
            random_state: seed for the train/test split.

        Returns:
            Accuracy on the held-out rows.
        """
        # Category vocabularies are taken from every row so that a label which
        # happens to land only in the test split can still be encoded.
        self._fit_label_encoders(df)

        train_df, test_df = train_test_split(
            df, test_size=test_size, random_state=random_state
        )

        X_train = self._transform_features(train_df, fit_scaler=True)
        y_train = self._encode_target(train_df)
        X_test = self._transform_features(test_df)
        y_test = self._encode_target(test_df)

        self.model.fit(X_train, y_train)
        y_pred = self.model.predict(X_test)

        accuracy = accuracy_score(y_test, y_pred)
        print(f"Model trained with accuracy: {accuracy:.2f}")

        return accuracy

    def predict_single(self, outlook, temperature, humidity, windy):
        """Predict for one record; returns a length-1 array of decoded labels."""
        input_data = pd.DataFrame({
            'outlook': [outlook],
            'temperature': [temperature],
            'humidity': [humidity],
            'windy': [windy]
        })

        return self.predict(input_data)

    def predict(self, input_data):
        """Predict decoded labels for every row of ``input_data``.

        Args:
            input_data: DataFrame containing at least the feature columns, in
                any order; additional columns are ignored.

        Returns:
            Array of decoded target labels, one per input row.

        Raises:
            ValueError: if a required feature column is missing.
        """
        X = self._transform_features(input_data)
        predictions = self.model.predict(X)
        return self.label_encoders[self.TARGET_COL].inverse_transform(predictions)

    def save_model(self, filename='tennis_predictor.joblib'):
        """Save the entire predictor object, creating the parent directory if needed."""
        import os

        # Only path-like targets have a directory to create; anything else is
        # handed to joblib unchanged.
        if isinstance(filename, (str, os.PathLike)):
            parent = os.path.dirname(os.fspath(filename))
            if parent:
                os.makedirs(parent, exist_ok=True)

        joblib.dump(self, filename)
        print(f"Model saved as {filename}")

    @staticmethod
    def load_model(filename='tennis_predictor.joblib'):
        """Load a predictor object previously written by save_model().

        SECURITY: joblib files are pickle-based, and unpickling can execute
        arbitrary code. Only load files that come from a trusted source.
        """
        return joblib.load(filename)

<h1>3. Train and Save the Model</h1>

In [40]:
import pandas as pd

import os

MODEL_PATH = 'model/tennis_play_predictor.joblib'


def _encoder_mapping_table(encoder, label_column):
    """Return (mapping, table) for a fitted label encoder.

    `mapping` is a dict from each class label to its integer code and `table`
    is the same information as a two-column DataFrame.
    """
    mapping = {label: code for code, label in enumerate(encoder.classes_)}
    table = pd.DataFrame(list(mapping.items()), columns=[label_column, "Encoded"])
    return mapping, table


# Create and train the predictor
predictor = TennisPlayPredictor()
accuracy = predictor.train(df_large)

# Save the model. The target directory has to exist before the file can be
# written, so create it first; this keeps the cell working on a fresh checkout.
os.makedirs(os.path.dirname(MODEL_PATH), exist_ok=True)
predictor.save_model(MODEL_PATH)

# Prepare encoder mappings and their display tables
outlook_mapping, df_outlook = _encoder_mapping_table(
    predictor.label_encoders['outlook'], "Outlook"
)
windy_mapping, df_windy = _encoder_mapping_table(
    predictor.label_encoders['windy'], "Windy"
)
play_mapping, df_play = _encoder_mapping_table(
    predictor.label_encoders['play_tennis'], "Play Tennis"
)

# Display neatly
print("\nLabel Encoders Mapping Tables:\n")

print("Outlook Mapping:\n", df_outlook, "\n")
print("Windy Mapping:\n", df_windy, "\n")
print("Play Tennis Mapping:\n", df_play, "\n")


Model trained with accuracy: 0.82
Model saved as model/tennis_play_predictor.joblib

Label Encoders Mapping Tables:

Outlook Mapping:
     Outlook  Encoded
0  overcast        0
1     rainy        1
2     sunny        2 

Windy Mapping:
    Windy  Encoded
0  False        0
1   True        1 

Play Tennis Mapping:
   Play Tennis  Encoded
0          no        0
1         yes        1 



<h1>4. Load Model and Make Predictions</h1>

In [44]:
# Quick look at the first five labelled records before testing the reloaded model.
df_large.head(n=5)

Unnamed: 0,outlook,temperature,humidity,windy,play_tennis
0,rainy,89,75,False,yes
1,sunny,81,95,False,no
2,rainy,87,63,True,no
3,rainy,66,94,False,yes
4,sunny,94,93,True,no


In [45]:
import pandas as pd

# Load the saved model
banner = "=" * 50
print("\n" + banner)
print("LOADING SAVED MODEL AND MAKING PREDICTIONS")
print(banner)

loaded_predictor = TennisPlayPredictor.load_model('model/tennis_play_predictor.joblib')

# Test predictions with new data.
# Each case is (outlook, temperature, humidity, windy). The trailing comment is
# the label the decide_play rule assigns to that record, i.e. what a perfect
# model would predict; the trained model may disagree.
test_cases = [
    ('rainy', 89, 75, False),     # rule says 'yes' (rainy and not windy)
    ('overcast', 68, 65, False),  # rule says 'yes' (overcast always plays)
    ('rainy', 65, 95, False),     # rule says 'yes' (rainy and not windy)
    ('sunny', 85, 85, False),     # rule says 'no'  (sunny with humidity > 75)
]

# Collect one result record per case
results = []
for i, (outlook, temp, humidity, windy) in enumerate(test_cases, 1):
    prediction = loaded_predictor.predict_single(outlook, temp, humidity, windy)
    verdict = "PLAY TENNIS" if prediction[0] == "yes" else "DONT PLAY"
    results.append({
        "Case": i,
        "Outlook": outlook,
        "Temperature (°F)": temp,
        "Humidity (%)": humidity,
        "Windy": windy,
        "Prediction": verdict,
    })

# Convert to DataFrame
df_results = pd.DataFrame(results)

# Display table
print("\nPredictions Table:\n")
print(df_results)



LOADING SAVED MODEL AND MAKING PREDICTIONS

Predictions Table:

   Case   Outlook  Temperature (°F)  Humidity (%)  Windy   Prediction
0     1     rainy                89            75  False  PLAY TENNIS
1     2  overcast                68            65  False  PLAY TENNIS
2     3     rainy                65            95  False  PLAY TENNIS
3     4     sunny                85            85  False  PLAY TENNIS


<h1>5. Complete Example with Additional Features</h1>

In [42]:
# Additional functionality: Probability predictions
def predict_with_probability(predictor, outlook, temperature, humidity, windy):
    """Return the class probabilities for a single weather record.

    Args:
        predictor: trained predictor exposing `label_encoders`, `scaler` and
            `model` (as TennisPlayPredictor does).
        outlook: outlook label known to the 'outlook' encoder.
        temperature: temperature value, scaled with the predictor's scaler.
        humidity: humidity value, scaled with the predictor's scaler.
        windy: windy flag known to the 'windy' encoder.

    Returns:
        dict mapping each decoded class label the model was trained on to its
        predicted probability.
    """
    features = pd.DataFrame({
        'outlook': [outlook],
        'temperature': [temperature],
        'humidity': [humidity],
        'windy': [windy]
    })

    # Apply the same encoding and scaling the model was trained with.
    encoders = predictor.label_encoders
    for col in ('outlook', 'windy'):
        features[col] = encoders[col].transform(features[col])

    numerical_cols = ['temperature', 'humidity']
    features[numerical_cols] = predictor.scaler.transform(features[numerical_cols])

    probabilities = predictor.model.predict_proba(features)[0]

    # The probability columns follow model.classes_ (the encoded labels the
    # model actually saw), so decode those rather than assuming they line up
    # with every class known to the target encoder.
    class_labels = encoders['play_tennis'].inverse_transform(predictor.model.classes_)

    return dict(zip(class_labels, probabilities))

# Try the probability helper on one overcast record.
print("Probability predictions:")
test_case = ('overcast', 72, 80, False)
probs = predict_with_probability(loaded_predictor, *test_case)

# The final call is the label with the highest probability.
most_likely = max(probs, key=probs.get)
verdict = 'PLAY' if most_likely == 'yes' else 'DONT PLAY'

print(f"Input: {test_case}")
print(f"Probabilities: {probs}")
print(f"Final prediction: {verdict}")

Probability predictions:
Input: ('overcast', 72, 80, False)
Probabilities: {'no': np.float64(0.005016071467347686), 'yes': np.float64(0.994983928532652)}
Final prediction: PLAY


<h1>6. Batch Prediction Example</h1>

In [43]:
# Batch prediction: score several records with a single predict() call.
separator = "=" * 50
print("\n" + separator)
print("BATCH PREDICTION EXAMPLE")
print(separator)

# Four unseen records, one list per feature column.
new_data = pd.DataFrame({
    'outlook': ['sunny', 'rainy', 'overcast', 'sunny'],
    'temperature': [85, 68, 70, 88],
    'humidity': [85, 75, 65, 92],
    'windy': [False, False, False, True],
})

print("New data for prediction:")
print(new_data)

# Predict all rows at once, then derive an upper-case YES/NO flag per row.
batch_predictions = loaded_predictor.predict(new_data)
new_data['prediction'] = batch_predictions
new_data['should_play'] = [
    'YES' if label == 'yes' else 'NO' for label in new_data['prediction']
]

print("\nPredictions:")
# Show the inputs next to the flag; the raw label column is left out.
print(new_data.drop(columns=['prediction']))


BATCH PREDICTION EXAMPLE
New data for prediction:
    outlook  temperature  humidity  windy
0     sunny           85        85  False
1     rainy           68        75  False
2  overcast           70        65  False
3     sunny           88        92   True

Predictions:
    outlook  temperature  humidity  windy should_play
0     sunny           85        85  False         YES
1     rainy           68        75  False         YES
2  overcast           70        65  False         YES
3     sunny           88        92   True          NO
