In [None]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.metrics import (
    roc_auc_score, accuracy_score, precision_score, f1_score,
    confusion_matrix, ConfusionMatrixDisplay, roc_curve
)
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import optuna
from optuna.samplers import TPESampler
from collections import Counter
import warnings
import logging


In [None]:

# NOTE(review): this silences every warning for the rest of the session,
# including deprecation notices raised by the libraries used below.
warnings.filterwarnings('ignore')

# Emit INFO-and-above records through a stream handler, each line carrying a
# timestamp and the level name.
logging.basicConfig(
    format='%(asctime)s [%(levelname)s] %(message)s',
    level=logging.INFO,
    handlers=[logging.StreamHandler()],
)
logger = logging.getLogger(__name__)

# Read train.csv and test.csv from the working directory. A missing file is
# logged together with its traceback and then re-raised, which stops the run.
try:
    train_data = pd.read_csv('train.csv')
    test_data = pd.read_csv('test.csv')
except FileNotFoundError:
    logger.exception("Error loading datasets: ")
    raise
else:
    logger.info("Datasets loaded successfully.")

# 'smoking' is the target; it and 'id' are both excluded from the features.
y = train_data['smoking']
X = train_data.drop(columns=['id', 'smoking'])
logger.info("Features and target variable defined.")

# Partition the feature columns by dtype for the two preprocessing branches.
# 'number' matches every numeric width rather than only int64/float64, so a
# column stored as e.g. int32 or float32 is no longer left out of both lists.
numerical_cols = X.select_dtypes(include='number').columns.tolist()
categorical_cols = X.select_dtypes(include=['object']).columns.tolist()
logger.info(f"Numerical columns: {numerical_cols}")
logger.info(f"Categorical columns: {categorical_cols}")

# A column that lands in neither list is discarded by the ColumnTransformer
# built from these lists (its default is remainder='drop'), so report it
# instead of losing it silently.
unassigned_cols = [
    col for col in X.columns
    if col not in numerical_cols and col not in categorical_cols
]
if unassigned_cols:
    logger.warning(
        f"Columns with unhandled dtypes will not reach the model: {unassigned_cols}"
    )



# Numeric branch: fill missing values with the column median, then standardize.
numerical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='median')),
    ('scaler', StandardScaler()),
])

# Categorical branch: fill missing values with the most frequent value, then
# one-hot encode. handle_unknown='ignore' keeps transform from raising on
# categories that were not present when the encoder was fitted.
categorical_pipeline = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# Each branch only sees the columns assigned to it above.
preprocessor = ColumnTransformer(transformers=[
    ('num', numerical_pipeline, numerical_cols),
    ('cat', categorical_pipeline, categorical_cols),
])
logger.info("Preprocessing pipelines created.")

# Label frequencies of the target, plus the ratio of label-0 to label-1 counts.
# NOTE(review): scale_pos_weight is only logged in the code visible here; the
# random forest below relies on class_weight='balanced' instead.
counter = Counter(y)
scale_pos_weight = counter[0] / counter[1]
logger.info(f"Class distribution: {counter}")
logger.info(f"Scale_pos_weight: {scale_pos_weight:.2f}")

# Hold out 20% of the rows for validation, preserving the label proportions.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=101,
)
logger.info("Data split into training and validation sets.")


In [None]:

def objective(trial):
    """Optuna objective: mean 5-fold ROC-AUC of a random-forest pipeline.

    Samples one random-forest configuration from ``trial``, combines it with
    the module-level ``preprocessor`` in a pipeline, and scores that pipeline
    with shuffled, stratified 5-fold cross-validation on ``X_train`` /
    ``y_train``.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        The mean ROC-AUC over the five folds, or NaN when anything raised
        during sampling, fitting or scoring (the error is logged with its
        traceback before NaN is returned).
    """
    try:
        rf_params = {
            'n_estimators': trial.suggest_int('n_estimators', 100, 1000),
            'max_depth': trial.suggest_int('max_depth', 5, 50),
            'min_samples_split': trial.suggest_int('min_samples_split', 2, 20),
            'min_samples_leaf': trial.suggest_int('min_samples_leaf', 1, 20),
            'max_features': trial.suggest_categorical('max_features', ['sqrt', 'log2', None]),
            'bootstrap': trial.suggest_categorical('bootstrap', [True, False]),
            'criterion': trial.suggest_categorical('criterion', ['gini', 'entropy']),
        }

        model = Pipeline([
            ('preprocessor', preprocessor),
            ('classifier', RandomForestClassifier(
                random_state=101,
                class_weight='balanced',
                n_jobs=-1,
                **rf_params,
            )),
        ])

        cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=101)

        fold_aucs = []
        for fit_idx, holdout_idx in cv.split(X_train, y_train):
            # The whole pipeline is refitted per fold, so imputation and
            # scaling statistics come from the fitting rows only.
            model.fit(X_train.iloc[fit_idx], y_train.iloc[fit_idx])
            holdout_prob = model.predict_proba(X_train.iloc[holdout_idx])[:, 1]
            fold_aucs.append(roc_auc_score(y_train.iloc[holdout_idx], holdout_prob))

        return np.mean(fold_aucs)

    except Exception as e:
        logger.error(f"An error occurred during trial: {e}", exc_info=True)
        return float('nan')  # Optuna treats NaN as a failed trial

# Seeded TPE search that maximizes the value returned by objective().
sampler = TPESampler(seed=101)
study = optuna.create_study(sampler=sampler, direction='maximize')
logger.info("Starting hyperparameter optimization with Optuna...")
# Bounded by both a trial count and a wall-clock budget in seconds; adjust
# n_trials and timeout as needed.
study.optimize(objective, n_trials=100, timeout=3600)

logger.info(f"Number of finished trials: {len(study.trials)}")
logger.info("Best trial:")
trial = study.best_trial

logger.info(f"  Value (Average ROC-AUC): {trial.value:.4f}")
logger.info("  Params:")
for key, value in trial.params.items():
    logger.info(f"    {key}: {value}")

best_params = trial.params

# Pipeline.set_params addresses the forest through the 'classifier__' prefix.
# The prefix is added to every key unless all keys already carry it.
best_params_prefixed = (
    best_params
    if all(name.startswith('classifier__') for name in best_params)
    else {f"classifier__{key}": value for key, value in best_params.items()}
)


In [None]:

# Recreate the pipeline layout used during the search, then apply the best
# trial's hyperparameters to its classifier step.
final_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier(
        n_jobs=-1,
        class_weight='balanced',
        random_state=101,
    )),
])
final_pipeline.set_params(**best_params_prefixed)
logger.info("Best hyperparameters set on the final pipeline.")

final_pipeline.fit(X_train, y_train)
logger.info("Final model trained on the training set.")

# Hard labels feed accuracy / precision / F1; column 1 of predict_proba feeds
# ROC-AUC.
y_val_pred = final_pipeline.predict(X_val)
y_val_pred_prob = final_pipeline.predict_proba(X_val)[:, 1]
logger.info("Predictions made on the validation set.")

roc_auc = roc_auc_score(y_val, y_val_pred_prob)
accuracy, precision, f1 = (
    metric(y_val, y_val_pred)
    for metric in (accuracy_score, precision_score, f1_score)
)

logger.info(f"Validation ROC-AUC: {roc_auc:.3f}")
logger.info(f"Validation Accuracy: {accuracy:.3f}")
logger.info(f"Validation Precision: {precision:.3f}")
logger.info(f"Validation F1-Score: {f1:.3f}")



In [None]:
# ROC curve on the validation split, with the chance diagonal for reference.
fpr, tpr, thresholds = roc_curve(y_val, y_val_pred_prob)
roc_fig, roc_ax = plt.subplots(figsize=(8, 6))
roc_ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc:.2f})')
roc_ax.plot([0, 1], [0, 1], 'k--', label='Random Guess')
roc_ax.set_xlabel('False Positive Rate')
roc_ax.set_ylabel('True Positive Rate')
roc_ax.set_title('Receiver Operating Characteristic (ROC) Curve')
roc_ax.legend(loc='lower right')
roc_ax.grid(True)
roc_fig.tight_layout()
plt.show()

# Confusion matrix of the hard validation predictions.
cm = confusion_matrix(y_val, y_val_pred)
disp = ConfusionMatrixDisplay(confusion_matrix=cm)
disp.plot(cmap=plt.cm.Blues)
plt.title('Confusion Matrix')
plt.show()

def get_feature_names(preprocessor):
    """Return the output feature names of the fitted preprocessor, in order.

    Numeric columns keep their original names (module-level ``numerical_cols``)
    and come first; they are followed by the one-hot encoded names produced
    for the module-level ``categorical_cols``.

    Args:
        preprocessor: Fitted ColumnTransformer whose ``'cat'`` transformer is
            a pipeline containing an ``'onehot'`` step. It is not accessed
            when there are no categorical columns.

    Returns:
        A 1-D array of feature names.
    """
    num_features = list(numerical_cols)

    # With no categorical columns the 'cat' branch is never fitted, so asking
    # its encoder for names would raise; the numeric names are the full answer.
    if not categorical_cols:
        return np.asarray(num_features, dtype=object)

    cat_pipeline = preprocessor.named_transformers_['cat']
    onehot = cat_pipeline.named_steps['onehot']
    cat_features = onehot.get_feature_names_out(categorical_cols)

    return np.concatenate([num_features, cat_features])

# Rank features by the fitted forest's impurity-based importances, then print
# and plot the top 20. Only AttributeError is handled: it is logged with its
# traceback and the report is skipped.
try:
    feature_names = get_feature_names(preprocessor)
    importances = final_pipeline.named_steps['classifier'].feature_importances_
    feature_importances_df = (
        pd.DataFrame({'Feature': feature_names, 'Importance': importances})
        .sort_values(by='Importance', ascending=False)
    )

    logger.info("Top 20 Feature Importances:")
    print(feature_importances_df.head(20))

    importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
    sns.barplot(
        x='Importance',
        y='Feature',
        data=feature_importances_df.head(20),
        ax=importance_ax,
    )
    importance_ax.set_title('Top 20 Feature Importances')
    importance_ax.set_xlabel('Importance')
    importance_ax.set_ylabel('Feature')
    importance_fig.tight_layout()
    plt.show()
except AttributeError:
    logger.exception("Error in extracting feature names: ")


In [None]:

# Refit the tuned pipeline on every labelled row before scoring the test set.
final_pipeline.fit(X, y)
logger.info("Final model retrained on the entire training dataset.")

# The test features are everything except the identifier column.
X_test = test_data.drop(columns=['id'])
logger.info("Test dataset prepared.")

# Column 1 of predict_proba — presumably the probability of label 1; confirm
# against final_pipeline.classes_.
y_test_pred_prob = final_pipeline.predict_proba(X_test)[:, 1]
logger.info("Predictions made on the test set.")

# Despite the name, the probabilities are submitted as-is: no rounding or
# thresholding is applied.
y_test_pred_prob_rounded = pd.Series(y_test_pred_prob)

submission = pd.DataFrame(
    {
        'id': test_data['id'],
        'smoking': y_test_pred_prob_rounded,
    }
)

submission.to_csv('submission_smoking_random_forest_optuna11.csv', index=False)
logger.info("Submission file saved as 'submission_smoking_random_forest_optuna11.csv'")

