# Random Forest Classifier on Tips Dataset

This notebook builds a Random Forest classifier that predicts the `time` column of seaborn's `tips` dataset. Preprocessing pipelines built from scikit-learn's `SimpleImputer`, `StandardScaler`, `OneHotEncoder`, and `ColumnTransformer` handle missing values, feature scaling, and categorical encoding, and `RandomizedSearchCV` tunes the forest's hyperparameters.

In [None]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score


In [None]:
# Read seaborn's bundled "tips" example table into `df`.
df = sns.load_dataset(name="tips")
# Preview the first five rows as the cell's displayed output.
df.head(n=5)


In [None]:
# Separate the label from the predictors: 'time' is the column the model
# learns to predict, and every other column stays in the feature frame.
y = df["time"]
X = df.drop(columns="time")


In [None]:
# Column groups that the preprocessing step routes to separate branches:
# numeric features in one list, categorical features in the other.
num_cols = [
    "total_bill",
    "tip",
    "size",
]
cat_cols = [
    "sex",
    "smoker",
    "day",
]


In [None]:
# Categorical branch: fill missing entries with the most frequent category,
# then one-hot encode into a dense array, dropping the first level of each
# feature.
cat_steps = [
    ("imputer", SimpleImputer(strategy="most_frequent")),
    ("encoder", OneHotEncoder(drop="first", sparse_output=False)),
]
cat_pipeline = Pipeline(steps=cat_steps)

# Numerical branch: fill missing entries with the column median, then
# standardize each column.
num_steps = [
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", StandardScaler()),
]
num_pipeline = Pipeline(steps=num_steps)


In [None]:
# Send each column group through its own branch; the transformer
# concatenates the branch outputs in the order listed here.
preprocessor = ColumnTransformer(
    transformers=[
        ("num", num_pipeline, num_cols),
        ("cat", cat_pipeline, cat_cols),
    ]
)


In [None]:
# Hold out 30% of the rows for the final evaluation. `stratify=y` keeps the
# class proportions of the target the same in both partitions, so the
# held-out score is measured on the same class mix the model was trained on.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=1, stratify=y
)

# Fit the preprocessing on the training rows only, then reuse the fitted
# statistics on the test rows so nothing learned from the test set leaks in.
# The untransformed frames stay available as X_train_raw / X_test_raw instead
# of being overwritten by the transformed arrays.
X_train = preprocessor.fit_transform(X_train_raw)
X_test = preprocessor.transform(X_test_raw)


In [None]:
# Base estimator; the fixed seed makes each fitted forest reproducible.
rf = RandomForestClassifier(random_state=42)

# Search space: 6 depths x 4 forest sizes x 2 split criteria = 48 combinations.
params = {
    'max_depth': [1, 2, 3, 5, 10, None],
    'n_estimators': [30, 50, 100, 200],
    'criterion': ['gini', 'entropy']
}

# Sample 10 of those combinations and score each with 5-fold CV accuracy.
# `random_state` pins which combinations are drawn, so Restart & Run All
# repeats the same search; without it the chosen model changes between runs
# even though the forest itself is seeded.
# `verbose=1` prints only the search summary instead of one line per fit.
# NOTE(review): X_train was preprocessed before this search, so every CV fold
# shares imputer/scaler statistics fitted on the whole training set. Wrapping
# `preprocessor` and `rf` in a single Pipeline and searching over that would
# keep the folds fully independent -- consider as a follow-up.
clf = RandomizedSearchCV(
    rf,
    param_distributions=params,
    n_iter=10,
    cv=5,
    scoring='accuracy',
    random_state=42,
    verbose=1,
)
clf.fit(X_train, y_train)


In [None]:
# Score the tuned search object on the held-out rows.
y_pred = clf.predict(X_test)

# Compute both summaries first, then print them.
test_accuracy = accuracy_score(y_test, y_pred)
report_text = classification_report(y_test, y_pred)

print("Accuracy:", test_accuracy)
print("\nClassification Report:\n", report_text)
