In [1]:
import pandas as pd
import numpy as np
from scipy.stats import skew, kurtosis, ks_2samp, chisquare

from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, accuracy_score

import joblib  # or pickle, according to preference

# 1) Locating the five source CSV files (paths are relative to the working directory)
file_path1 = 'modelData/melb_data.csv'
file_path2 = 'modelData/Obesity prediction.csv'
file_path3 = 'modelData/Titanic.csv'
file_path4 = 'modelData/phone_usage_india.csv'
file_path5 = 'modelData/panic_attack_dataset.csv'

# Files are read in the order listed; each one becomes its own DataFrame.
data1, data2, data3, data4, data5 = map(
    pd.read_csv,
    (file_path1, file_path2, file_path3, file_path4, file_path5),
)

# 2) Giving the 'Gender' / 'Age' headers a per-dataset name so that they stay
#    distinguishable once the frames are placed side by side.
data2 = data2.rename(
    columns={'Gender': 'Obesity_prediction_gender', 'Age': 'Obesity_prediction_age'}
)
data3 = data3.rename(columns={'Age': 'Titanic_age'})
data4 = data4.rename(
    columns={'Gender': 'Phone_usage_india_gender', 'Age': 'Phone_usage_india_age'}
)
data5 = data5.rename(
    columns={'Gender': 'Panic_attack_gender', 'Age': 'Panic_attack_age'}
)

# 3) Placing all frames side by side (axis=1). join='inner' keeps only the row
#    labels that exist in every frame, so the result is as long as the frames'
#    common index allows; rows of unrelated datasets end up on the same line,
#    which is fine here because only per-column statistics are used later.
combined_data = pd.concat([data1, data2, data3, data4, data5], axis=1, join='inner')

print("Combined data shape:", combined_data.shape)

# 4) Column names whose meta-feature row gets the label is_categorical = 1;
#    every other column of the combined frame is labelled 0.
# NOTE(review): 'Price', 'Landsize', 'BuildingArea' and 'Distance' look like
# continuous measurements by name - confirm they are meant to be labelled
# categorical, since this list is the ground truth the classifier learns from.
categorical_features = [
    'Bedroom2', 'Bathroom', 'Car', 'Postcode',
    'Distance', 'BuildingArea', 'YearBuilt', 'Price',
    'Landsize', 'NCP', 'CH2O', 'FCVC',
    'FAF', 'TUE', 'Pclass', 'SibSp',
    'Parch', 'Survived', 'Caffeine_Intake', 'Exercise_Frequency',
    'Alcohol_Consumption', 'Panic_Score', 'Suburb', 'Type',
    'Method', 'SellerG', 'Date', 'CouncilArea',
    'Regionname', 'Obesity_prediction_gender', 'family_history', 'FAVC',
    'CAEC', 'SMOKE', 'SCC', 'CALC',
    'MTRANS', 'Obesity', 'Sex', 'Embarked',
    'Phone_usage_india_gender', 'Location', 'Phone Brand', 'OS',
    'Primary Use', 'Panic_attack_gender', 'Trigger', 'Sweating',
    'Shortness_of_Breath', 'Dizziness', 'Chest_Pain', 'Trembling',
    'Medical_History', 'Medication', 'Smoking', 'Therapy',
]

# 5) Functions that turn every column of a DataFrame into one row of
#    distribution meta-features plus the is_categorical target label.
def _finite_or_zero(value):
    """Return ``value`` as a float, or 0.0 when it is NaN or infinite."""
    value = float(value)
    return value if np.isfinite(value) else 0.0


def _distribution_features(values, bin_for_chi_square, rng):
    """Summarise one column's non-missing values as six distribution statistics.

    Parameters
    ----------
    values : pandas.Series
        Numeric values with missing entries already removed.
    bin_for_chi_square : bool
        True  -> chi-square is computed on counts over 10 equal-width bins.
        False -> chi-square is computed on the raw value counts.
    rng : numpy.random.Generator
        Source of the uniform reference sample used by the KS test.

    Returns
    -------
    dict
        Keys: skewness, kurtosis, relative_unique_ratio,
        max_repeated_value_count, ks_p_value, chi_p_value.
    """
    counts = values.value_counts()
    n_total = len(values)
    n_unique = len(counts)
    unique_ratio = (n_unique / n_total) * 100 if n_total > 0 else 0
    max_count = counts.max() if n_unique > 0 else 0

    # With fewer than two observations none of the statistics below is
    # defined; 0 is the fallback for every one of them.
    column_skewness = column_kurtosis = 0.0
    ks_p_value = chi_p_value = 0.0

    if n_total > 1:
        # Constant columns can make scipy return NaN here; fall back to 0 so
        # the feature table never carries NaN into the classifier.
        column_skewness = _finite_or_zero(skew(values))
        column_kurtosis = _finite_or_zero(kurtosis(values))

        # KS test against a Uniform(0, 1) sample of the same size. The data is
        # not rescaled first, so for values outside [0, 1] the result mostly
        # reflects the value range; kept as-is so the feature keeps its meaning.
        _, ks_p_value = ks_2samp(values, rng.uniform(size=n_total))

        # Chi-square test of "all observed groups equally frequent".
        # value_counts() lists only groups that actually occur, so empty bins
        # do not take part in the test.
        if bin_for_chi_square:
            observed = pd.cut(values, bins=10, labels=False).value_counts()
        else:
            observed = counts
        observed = observed.to_numpy(dtype=float)
        expected = np.full(len(observed), observed.sum() / len(observed))
        _, chi_p_value = chisquare(f_obs=observed, f_exp=expected)
        chi_p_value = _finite_or_zero(chi_p_value)

    return {
        'skewness': column_skewness,
        'kurtosis': column_kurtosis,
        'relative_unique_ratio': unique_ratio,
        'max_repeated_value_count': max_count,
        'ks_p_value': ks_p_value,
        'chi_p_value': chi_p_value,
    }


def prepare_combined_dataset_with_target(df, cat_cols, random_state=42):
    """Build a meta-feature table with one row per column of ``df``.

    Each column is described by its skewness, kurtosis, percentage of unique
    values, count of the most frequent value, a KS-test p-value and a
    chi-square p-value. Numeric columns are analysed on their non-missing
    values; all other columns are first replaced by integer codes from
    ``pd.factorize`` (missing entries removed).

    Parameters
    ----------
    df : pandas.DataFrame
        Data whose columns are to be described. It is not modified.
    cat_cols : iterable of str
        Column names to label ``is_categorical = 1``; all others get 0.
    random_state : int or None, default 42
        Seed for the uniform sample used by the KS test, so that repeated
        calls on the same data return identical tables. ``None`` draws a
        fresh, unpredictable sample on every call.

    Returns
    -------
    pandas.DataFrame
        Rows follow the column order of ``df``; columns are the six
        statistics followed by ``is_categorical``.
    """
    categorical_names = set(cat_cols)
    rng = np.random.default_rng(random_state)
    feature_data = []

    # Columns are fetched by position: after a side-by-side concat the same
    # label may occur more than once, and df[label] would then return a
    # DataFrame instead of a single column.
    for position, col in enumerate(df.columns):
        col_data = df.iloc[:, position]

        if pd.api.types.is_numeric_dtype(col_data):
            values = col_data.dropna()
            row = _distribution_features(values, bin_for_chi_square=True, rng=rng)
        else:
            # factorize marks missing entries with code -1; drop those.
            codes, _ = pd.factorize(col_data)
            values = pd.Series(codes[codes >= 0])
            row = _distribution_features(values, bin_for_chi_square=False, rng=rng)

        row['is_categorical'] = 1 if col in categorical_names else 0
        feature_data.append(row)

    return pd.DataFrame(feature_data)

# 6) Building the meta-feature table: one row per column of combined_data
prepared_data = prepare_combined_dataset_with_target(combined_data, categorical_features)

# 7) Separating the feature matrix from the is_categorical target
X = prepared_data.drop(columns=['is_categorical'])
Y = prepared_data['is_categorical']

# 8) Holding out 25% of the rows (i.e. of the described columns) for testing
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.25, random_state=42
)

# 9) Hyper-parameter grid for the random forest:
#    4 x 4 x 4 x 4 = 256 combinations, each evaluated with 3-fold CV.
param_grid_rf = {
    'n_estimators': [100, 200, 300, 400],
    'max_depth': [3, 5, 10, None],
    'min_samples_split': [2, 5, 7, 10],
    'min_samples_leaf': [1, 2, 3, 4]
}

# GridSearchCV and RandomForestClassifier come from the import block at the
# top of the cell; they are not imported a second time here.
rf_grid_search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid_rf,
    cv=3,
    scoring='accuracy',
    n_jobs=-1,  # use all available CPU cores
    verbose=1
)
rf_grid_search.fit(X_train, y_train)

# Estimator refitted on the whole training split with the best parameters.
best_rf_model = rf_grid_search.best_estimator_

# 10) Evaluating the selected model on the held-out test split
test_preds = best_rf_model.predict(X_test)
acc = accuracy_score(y_test, test_preds)
print("Best Params:", rf_grid_search.best_params_)
print("Accuracy:", acc)
print("Classification Report:")
print(classification_report(y_test, test_preds))

# 11) Saving the trained model to a file in the working directory
model_filename = "my_model.pkl"
joblib.dump(best_rf_model, model_filename)
print(f"Model saved to {model_filename}")

Combined data shape: (1000, 86)
Fitting 3 folds for each of 256 candidates, totalling 768 fits
Best Params: {'max_depth': 3, 'min_samples_leaf': 4, 'min_samples_split': 2, 'n_estimators': 100}
Accuracy: 0.9090909090909091
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.83      0.83         6
           1       0.94      0.94      0.94        16

    accuracy                           0.91        22
   macro avg       0.89      0.89      0.89        22
weighted avg       0.91      0.91      0.91        22

Model saved to my_model.pkl
