In [1]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np

# Load and merge data
data_values = pd.read_csv("train_values.csv")
data_labels = pd.read_csv("train_labels.csv")
data = pd.merge(data_values, data_labels, on='building_id')

# Encode categorical variables using OneHotEncoder for better feature representation
categorical_columns = ['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 
                       'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']
data = pd.get_dummies(data, columns=categorical_columns)

# Adjust target variable to start at 0
data['damage_grade'] -= 1

# Split features and target variable
X = data.drop(columns=['building_id', 'damage_grade'])
y = data['damage_grade']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Define a basic XGBClassifier model and set up RandomizedSearchCV for hyperparameter tuning
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Randomized search for best hyperparameters
random_search = RandomizedSearchCV(xgb, param_distributions=param_grid, n_iter=20, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
random_search.fit(X_train, y_train)

# Use the best estimator for a StackingClassifier ensemble with RandomForest and LogisticRegression
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', random_search.best_estimator_),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('lr', LogisticRegression(max_iter=1000))
    ],
    final_estimator=LogisticRegression(),
    cv=3
)

# Train the stacking model
stacking_clf.fit(X_train, y_train)

# Make predictions and calculate performance metrics
y_pred = stacking_clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
error_rate = 1 - accuracy

print("Tuned Accuracy:", accuracy)
print("Tuned Precision:", precision)
print("Tuned Recall:", recall)
print("Tuned F1 Score:", f1)
print("Error Rate:", error_rate)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_res

Tuned Accuracy: 0.7401431284894764
Tuned Precision: 0.7404326187408687
Tuned Recall: 0.7401431284894764
Tuned F1 Score: 0.7341845292918336
Error Rate: 0.2598568715105236


In [2]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split, RandomizedSearchCV, StratifiedKFold
from xgboost import XGBClassifier
from sklearn.ensemble import StackingClassifier, GradientBoostingClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.pipeline import Pipeline
import numpy as np

# Load and merge data
data_values = pd.read_csv("train_values.csv")
data_labels = pd.read_csv("train_labels.csv")
data = pd.merge(data_values, data_labels, on='building_id')

# Encode categorical variables using OneHotEncoder for better feature representation
categorical_columns = ['land_surface_condition', 'foundation_type', 'roof_type', 'ground_floor_type', 
                       'other_floor_type', 'position', 'plan_configuration', 'legal_ownership_status']
data = pd.get_dummies(data, columns=categorical_columns)

# Adjust target variable to start at 0
data['damage_grade'] -= 1

# Split features and target variable
X = data.drop(columns=['building_id', 'damage_grade'])
y = data['damage_grade']

# Split data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Feature Scaling for models like Logistic Regression and others that might need it
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Define a basic XGBClassifier model and set up RandomizedSearchCV for hyperparameter tuning
xgb = XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
param_grid = {
    'n_estimators': [100, 200, 300],
    'learning_rate': [0.01, 0.05, 0.1, 0.2],
    'max_depth': [3, 5, 7, 10],
    'subsample': [0.6, 0.8, 1.0],
    'colsample_bytree': [0.6, 0.8, 1.0],
}

# Randomized search for best hyperparameters
random_search = RandomizedSearchCV(xgb, param_distributions=param_grid, n_iter=20, scoring='accuracy', cv=3, random_state=42, n_jobs=-1)
random_search.fit(X_train_scaled, y_train)

# Use the best estimator for a StackingClassifier ensemble with GradientBoosting and RandomForest
stacking_clf = StackingClassifier(
    estimators=[
        ('xgb', random_search.best_estimator_),
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
        ('ada', AdaBoostClassifier(n_estimators=100, random_state=42)),
    ],
    final_estimator=LogisticRegression(),
    cv=StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
)

# Train the stacking model
stacking_clf.fit(X_train_scaled, y_train)

# Make predictions and calculate performance metrics
y_pred = stacking_clf.predict(X_test_scaled)
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred, average='weighted')
recall = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
error_rate = 1 - accuracy

print("Improved Accuracy:", accuracy)
print("Improved Precision:", precision)
print("Improved Recall:", recall)
print("Improved F1 Score:", f1)
print("Improved Error Rate:", error_rate)


Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.

Parameters: { "use_label_encoder" } are not used.



Improved Accuracy: 0.7404692926075862
Improved Precision: 0.7405845987716988
Improved Recall: 0.7404692926075862
Improved F1 Score: 0.734654609209143
Improved Error Rate: 0.2595307073924138
