In [None]:
# Import libraries
import pandas as pd
import numpy as np
import tensorflow as tf
import tensorflow_decision_forests as tfdf
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split, KFold
from lightgbm import LGBMRegressor
from pathlib import Path

pd.set_option('display.max_columns', None)

%matplotlib inline
import warnings
warnings.filterwarnings('ignore')

In [None]:
# Directory that holds the competition CSV files. Previously this was an empty
# path joined with an absolute file path, which discards the left-hand side and
# left the constant without any effect; now it is the single place to edit when
# the data moves.
DATA_PATH = Path('/Users/aarononosala/Downloads/geoai-ground-level-no2-estimation-challenge20240612-4943-16iro0r')

# Load files
train = pd.read_csv(DATA_PATH / 'Train.csv')
test = pd.read_csv(DATA_PATH / 'Test.csv')

In [None]:
def _add_date_parts(frame):
    """Return a copy of ``frame`` with ``Date`` expanded into year/month/day.

    The ``Date`` column is parsed with ``pd.to_datetime`` and replaced by three
    integer columns (``year``, ``month``, ``day``) appended at the end. The
    identifier columns ``ID_Zindi`` and ``ID`` are removed as well. The input
    frame is not modified.
    """
    dates = pd.to_datetime(frame['Date'])
    expanded = frame.assign(
        year=dates.dt.year,
        month=dates.dt.month,
        day=dates.dt.day,
    )
    return expanded.drop(columns=['ID_Zindi', 'ID', 'Date'])


# Keep only complete rows. Building a new frame (instead of writing columns
# onto the dropna() result) avoids chained-assignment on a possible view, and
# the contiguous 0..n-1 index keeps label-based and positional row access in
# agreement for everything derived from `data`.
data = _add_date_parts(train.dropna()).reset_index(drop=True)

# Target stays a one-column DataFrame; features are everything else.
target = data[['GT_NO2']]
train_data = data.drop(columns=['GT_NO2'])

# Same date expansion / identifier removal for the test set.
test = _add_date_parts(test)

In [None]:
# Show the dimensions of the feature matrix and the target side by side.
(train_data.shape, target.shape)

In [None]:
from openfe import OpenFE, tree_to_formula, transform, get_candidate_features

# NOTE(review): the __main__ guard is presumably here because the feature
# search runs with many worker processes — confirm before removing it.
if __name__ == "__main__":
    n_jobs = 52

    # Settings handed to OpenFE through `stage2_params`.
    params = dict(
        n_estimators=1000,
        importance_type="gain",
        num_leaves=64,
        seed=1,
        n_jobs=n_jobs,
    )

    ofe1 = OpenFE()

    # Every column of the test frame is declared as a numerical feature.
    candidate_features_list = get_candidate_features(
        numerical_features=test.columns.tolist()
    )

    features1 = ofe1.fit(
        data=train_data,
        label=target,
        candidate_features_list=candidate_features_list,
        metric='rmse',
        task='regression',
        stage2_params=params,
        min_candidate_features=5000,
        n_jobs=n_jobs,
        n_data_blocks=2,
        feature_boosting=True,
    )

    # Only the first 300 entries of the fitted feature list are materialised.
    selected_features = features1[:300]
    train_ft1, test_ft1 = transform(train_data, test, selected_features, n_jobs=n_jobs)

In [None]:
# Place the generated feature frames next to the original columns.
train_final = pd.concat((train_data, train_ft1), axis="columns")
test_final = pd.concat((test, test_ft1), axis="columns")

In [None]:
# Show the dimensions of the combined train and test frames.
(train_final.shape, test_final.shape)

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error
import xgboost as xgb
import lightgbm as lgb
from catboost import CatBoostRegressor
from tqdm.auto import tqdm

def preprocess_data(X):
    """Return ``X`` with positive and negative infinity replaced by NaN.

    The input object is not modified; the replaced copy is returned.
    """
    infinite_values = [np.inf, -np.inf]
    return X.replace(infinite_values, np.nan)

def _build_importance_model(model_input, device):
    """Create the regressor used to rank features.

    A ``model_input`` containing 'xgb' selects XGBoost, one containing 'cat'
    selects CatBoost, and anything else falls back to LightGBM. ``device`` is
    forwarded to the chosen library's device setting ('cpu' or 'gpu').
    """
    if 'xgb' in model_input:
        xgb_params = {
            'n_jobs': -1,
            'objective': 'reg:squarederror',
            'tree_method': 'hist',
            'verbosity': 0,
            'random_state': 42,
        }
        if device == 'gpu':
            xgb_params['tree_method'] = 'gpu_hist'
            xgb_params['predictor'] = 'gpu_predictor'
        return xgb.XGBRegressor(**xgb_params)

    if 'cat' in model_input:
        cb_params = {
            'grow_policy': 'Depthwise',
            'bootstrap_type': 'Bayesian',
            'eval_metric': 'RMSE',
            'loss_function': 'RMSE',
            'random_state': 42,
            'task_type': device.upper(),
            'thread_count': -1,
        }
        return CatBoostRegressor(**cb_params)

    lgb_params = {
        'objective': 'regression',
        'metric': 'rmse',
        'boosting_type': 'gbdt',
        'random_state': 42,
        'device': device,
        'n_jobs': -1,
    }
    return lgb.LGBMRegressor(**lgb_params)


def _plot_top_importances(display_names, display_values, n, avg_rmse):
    """Draw a horizontal bar chart of the given features and their importances."""
    sns.set_palette("Set2")
    plt.figure(figsize=(8, 6))
    positions = range(len(display_names))
    plt.barh(positions, display_values)
    plt.yticks(positions, display_names, fontsize=12)
    plt.xlabel('Average Feature Importance', fontsize=14)
    plt.ylabel('Features', fontsize=10)
    # Report the number of bars actually drawn; it is below 10 when n < 10.
    plt.title(
        f'Top {len(display_names)} of {n} Feature Importances with RMSE score {avg_rmse:.3f}',
        fontsize=16,
    )
    plt.gca().invert_yaxis()  # Invert y-axis to have the most important feature on top
    plt.grid(axis='x', linestyle='--', alpha=0.7)
    plt.xticks(fontsize=8)
    plt.yticks(fontsize=8)

    # Add data labels on the bars
    for index, value in enumerate(display_values):
        plt.text(value + 0.005, index, f'{value:.3f}', fontsize=12, va='center')

    plt.tight_layout()
    plt.show()


def get_most_important_features(X_train, y_train, n, model_input, device='cpu'):
    """Rank features by cross-validated importance and return the top ``n`` names.

    A single regressor (chosen by ``model_input``) is refitted on every fold of
    5 repetitions of shuffled 10-fold cross-validation. The per-fold
    ``feature_importances_`` are averaged, the features are sorted by that
    average, and a bar chart of up to ten leading features (titled with the
    mean validation RMSE) is shown.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Feature matrix; rows are selected positionally with ``.iloc``.
    y_train : pandas.DataFrame or pandas.Series
        Target with the same number of rows as ``X_train``; also indexed
        positionally, so only row order (not index labels) has to match.
    n : int
        Number of feature names to return.
    model_input : str
        Contains 'xgb' for XGBoost or 'cat' for CatBoost; anything else uses
        LightGBM.
    device : str, default 'cpu'
        'cpu' or 'gpu'; passed to the selected library.

    Returns
    -------
    list
        The ``n`` column names with the highest average importance, best first.

    Raises
    ------
    ValueError
        If ``X_train`` and ``y_train`` have different numbers of rows.
    """
    if len(X_train) != len(y_train):
        raise ValueError(
            f"X_train has {len(X_train)} rows but y_train has {len(y_train)}"
        )

    model = _build_importance_model(model_input, device)
    # Choose the fit call from the estimator that was actually built. Testing
    # the name again would disagree with the LightGBM fallback above: an input
    # such as 'lightgbm' does not contain 'lgb' and would pass verbose=False to
    # LGBMRegressor.fit.
    fit_kwargs = {} if isinstance(model, lgb.LGBMRegressor) else {'verbose': False}

    # Replacing infinities is element-wise, so do it once instead of per fold.
    X_clean = preprocess_data(X_train)

    n_splits, n_repeats = 10, 5
    rmse_scores = []
    feature_importances_list = []

    for repeat in range(n_repeats):
        # A distinct seed per repetition keeps runs reproducible while still
        # giving every repetition a different shuffle.
        kfold = KFold(n_splits=n_splits, shuffle=True, random_state=42 + repeat)
        for train_idx, val_idx in tqdm(kfold.split(X_clean), total=n_splits):
            X_train_fold, X_val_fold = X_clean.iloc[train_idx], X_clean.iloc[val_idx]
            y_train_fold, y_val_fold = y_train.iloc[train_idx], y_train.iloc[val_idx]

            model.fit(X_train_fold, y_train_fold, **fit_kwargs)

            y_pred = model.predict(X_val_fold)
            rmse_scores.append(np.sqrt(mean_squared_error(y_val_fold, y_pred)))
            feature_importances_list.append(model.feature_importances_)

    avg_rmse = np.mean(rmse_scores)
    avg_feature_importances = np.mean(feature_importances_list, axis=0)

    # Pair names with importances positionally so that duplicate column labels
    # cannot break the lookup, then sort best-first (stable for ties).
    ranked = sorted(
        zip(X_train.columns, avg_feature_importances),
        key=lambda pair: pair[1],
        reverse=True,
    )

    top_ranked = ranked[:n]
    top_n_features = [name for name, _ in top_ranked]

    shown = top_ranked[:10]
    _plot_top_importances(
        [name for name, _ in shown],
        [importance for _, importance in shown],
        n,
        avg_rmse,
    )

    return top_n_features


In [None]:
# Check for duplicate column names
duplicate_columns = train_final.columns[train_final.columns.duplicated()].unique()
if len(duplicate_columns) > 0:
    print(f"Duplicate columns found: {duplicate_columns}")
    # Keep only the first occurrence of every column name.
    train_final = train_final.loc[:, ~train_final.columns.duplicated()]

# De-duplicate the test frame the same way. Without this, selecting a feature
# by name from test_final can return several columns while train_final returns
# one, leaving the two frames with different widths.
test_final = test_final.loc[:, ~test_final.columns.duplicated()]

In [None]:
# Rank features with CatBoost and keep the 10 most important ones.
n_imp_features_cat = get_most_important_features(train_final.reset_index(drop=True), target, 10, 'cat')

# Disabled alternatives (XGBoost / LightGBM rankings and their union):
#n_imp_features_xgb=get_most_important_features(train_final.reset_index(drop=True), target,30, 'xgb')
#n_imp_features_lgbm=get_most_important_features(train_final.reset_index(drop=True), target,30, 'lgbm')
#n_imp_features=[*set(n_imp_features_xgb+n_imp_features_lgbm+n_imp_features_cat)]

# Only the CatBoost ranking is active, so the message names that model rather
# than claiming three algorithms.
print(f"{len(n_imp_features_cat)} features have been selected with CatBoost for the final model")

In [None]:
# The selection comes from the CatBoost ranking only, so report it as such.
print(f"{len(n_imp_features_cat)} features have been selected with CatBoost for the final model")

In [None]:
# Assemble the modelling frame: the selected features plus the target column.
# The original expression added the target DataFrame to a Python list
# (`[frame] + target`), so pd.concat never received a list of frames; both
# frames are now passed in one list.
assert len(train_final) == len(target), "features and target must have the same number of rows"

# Both parts get a fresh 0..n-1 index so rows are matched by position.
df = pd.concat(
    [
        train_final[n_imp_features_cat].reset_index(drop=True),
        target.reset_index(drop=True),
    ],
    axis=1,
)
test1 = test_final[n_imp_features_cat]