In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score
from imblearn.over_sampling import SMOTE

# Load the data
data = pd.read_csv('/HB1.csv')

# Display the first few rows and data info
print(data.head())
# DataFrame.info() prints its summary itself and returns None, so wrapping it
# in print() would only add a stray "None" line to the output.
data.info()

# Preprocess the data
# `le` is reserved for the target column: later code calls
# le.inverse_transform(...) and reads le.classes_ to recover behavior names,
# so it must not be re-fitted on any other column.
le = LabelEncoder()
data['behavior'] = le.fit_transform(data['behavior'])

# Feature columns get their own encoders so that fitting them does not
# overwrite the behavior classes stored in `le`.
gender_encoder = LabelEncoder()
data['gender'] = gender_encoder.fit_transform(data['gender'])

location_encoder = LabelEncoder()
data['location'] = location_encoder.fit_transform(data['location'])

# Handle age ranges
def age_to_numeric(age_range):
    """Convert an age-bracket label to the integer lower bound of the bracket.

    Strings containing ``'+'`` (e.g. ``'65+'``) have the plus sign removed and
    the remainder converted to ``int``. Other strings (e.g. ``'18-24'``) are
    cut at the first ``'-'`` and the leading part is converted to ``int``.
    Non-string values are returned unchanged.

    Raises:
        ValueError: if the extracted text is not a valid integer literal.
    """
    # Guard clause: anything that is not text is passed through untouched.
    if not isinstance(age_range, str):
        return age_range

    if '+' in age_range:
        lower_bound = age_range.replace('+', '')
    else:
        lower_bound = age_range.partition('-')[0]
    return int(lower_bound)

# Reduce each age bracket label to a single number (its lower bound).
data['age_numeric'] = data['age'].apply(age_to_numeric)

# Feature engineering
# age_numeric is the lower bound of each bracket (18, 25, 35, ...), which
# coincides with the bin edges below. pd.cut closes bins on the right by
# default, which would put 25 into (0, 25] together with 18 and shift every
# later bracket down by one group. right=False makes the bins left-closed
# ([0, 25), [25, 35), ...), so each bracket gets its own group 0..5.
data['age_group'] = pd.cut(
    data['age_numeric'],
    bins=[0, 25, 35, 45, 55, 65, 100],
    labels=[0, 1, 2, 3, 4, 5],
    right=False,
)
# log1p compresses the range of sample sizes and is defined at zero.
data['sample_size_log'] = np.log1p(data['sample_size'])

# Split features and target
feature_columns = [
    'gender',
    'age_numeric',
    'age_group',
    'location',
    'sample_size',
    'sample_size_log',
    'likelihood_percent',
]
X = data[feature_columns]
y = data['behavior']

# Get unique classes
# NOTE(review): class_names are only behavior labels if `le` is still the
# encoder that was fitted on the 'behavior' column -- confirm.
unique_classes = np.unique(y)
class_names = le.inverse_transform(unique_classes)

# Split the data (80% train / 20% test, fixed seed for reproducibility)
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

# Scale the features: statistics are learned from the training split only
# and then applied to both splits.
scaler = StandardScaler()
scaler.fit(X_train)
X_train_scaled = scaler.transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Handle class imbalance by oversampling the scaled training split only
smote = SMOTE(random_state=42)
X_train_resampled, y_train_resampled = smote.fit_resample(
    X_train_scaled,
    y_train,
)

# Define models together with their GridSearchCV parameter grids.
# Each entry is (display name, estimator, parameter grid); the two lookup
# dicts used by the training loop are derived from this single list so the
# names cannot drift apart.
_model_specs = [
    (
        'Random Forest',
        RandomForestClassifier(random_state=42),
        {'n_estimators': [50, 100, 200], 'max_depth': [None, 10, 20, 30]},
    ),
    (
        'SVM',
        SVC(random_state=42),
        {'C': [0.1, 1, 10], 'kernel': ['rbf', 'linear']},
    ),
    (
        'Logistic Regression',
        LogisticRegression(random_state=42, multi_class='ovr', max_iter=1000),
        {'C': [0.1, 1, 10], 'solver': ['lbfgs', 'liblinear']},
    ),
]

# name -> estimator
models = {name: estimator for name, estimator, _ in _model_specs}

# name -> parameter grid for GridSearchCV
param_grids = {name: grid for name, _, grid in _model_specs}

# Train and evaluate models
# For every candidate model: tune hyper-parameters with a 5-fold grid search
# on the resampled training data, then report test-set accuracy, 5-fold CV
# scores of the tuned model, and a per-class classification report.
for name, model in models.items():
    print(f"\nTraining {name}...")

    # Perform GridSearchCV
    # NOTE(review): the folds are drawn from data that was already oversampled
    # with SMOTE, so validation folds contain synthetic points derived from
    # training-fold samples. CV scores are therefore likely optimistic; the
    # held-out test metrics below are the more trustworthy numbers.
    grid_search = GridSearchCV(model, param_grids[name], cv=5, n_jobs=-1)
    grid_search.fit(X_train_resampled, y_train_resampled)

    # Get best model (refit on the full resampled training set)
    best_model = grid_search.best_estimator_

    # Make predictions on the untouched, scaled test split
    y_pred = best_model.predict(X_test_scaled)

    # Evaluate the model
    print(f"Best parameters: {grid_search.best_params_}")
    print(f"Accuracy: {accuracy_score(y_test, y_pred)}")

    # Perform cross-validation
    cv_scores = cross_val_score(best_model, X_train_resampled, y_train_resampled, cv=5)
    print(f"Cross-validation scores: {cv_scores}")
    print(f"Mean CV score: {np.mean(cv_scores)}")

    # Classification report
    # labels= pins the report to the full set of classes. Without it,
    # classification_report infers the labels from y_test/y_pred and raises a
    # ValueError when a class is absent from the small test split, because the
    # inferred labels no longer match the length of target_names.
    # zero_division=0 reports 0 for classes with no test support or no
    # predictions without emitting a warning.
    print("\nClassification Report:")
    print(
        classification_report(
            y_test,
            y_pred,
            labels=unique_classes,
            target_names=class_names,
            zero_division=0,
        )
    )

    # Feature importance (only for Random Forest)
    if name == 'Random Forest':
        feature_importance = pd.DataFrame({
            'feature': X.columns,
            'importance': best_model.feature_importances_
        }).sort_values('importance', ascending=False)
        print("\nFeature Importance:")
        print(feature_importance)

# Print unique behaviors and their encoded values
# NOTE(review): this reads the classes stored in `le`; they are behavior
# labels only if `le` was last fitted on the 'behavior' column -- confirm.
encoded_values = le.transform(le.classes_)
behavior_mapping = {
    label: encoded for label, encoded in zip(le.classes_, encoded_values)
}
print("\nBehavior Mapping:")
for behavior, code in behavior_mapping.items():
    print(f"{behavior}: {code}")

# Print number of unique classes, followed by one class name per line
print(f"\nNumber of unique classes: {len(unique_classes)}")
print("Unique classes:")
for class_name in class_names:
    print(class_name)

        behavior description  gender    age  \
0  Mental Health   excellent    Male  18-24   
1  Mental Health   excellent  Female  25-34   
2  Mental Health   excellent  Female  35-44   
3  Mental Health   excellent  Female  45-54   
4  Mental Health   excellent  Female  55-64   

                                    location  sample_size  likelihood_percent  
0                             East Melbourne          761                41.6  
1                  South Wharf and Southbank          573                50.3  
2                  Kensington and Flemington          274                35.6  
3  South Yarra, Melbourne and St Kilda Road           421                43.3  
4                                  Docklands          229                47.5  
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 80 entries, 0 to 79
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   behavior            80 non-null 