In [1]:
from sklearn.model_selection import KFold
from xgboost import XGBRegressor
import numpy as np
%run data_preprocessing.ipynb

train = load_train_data('../data/train.csv')
test = load_test_data('../data/test.csv')

# Columns excluded from the model inputs: 'rainfall' is used as the target
# further down in this notebook and 'id' is the row identifier.
RMV = ['rainfall','id']


def _add_basic_features(frame):
    """Add the three hand-crafted columns to ``frame`` in place.

    * ``year_group``        -- ``id`` integer-divided by 365.
    * ``temperature_range`` -- ``maxtemp`` minus ``mintemp``.
    * ``seasonal_sin``      -- sine of ``day`` mapped onto a 365-day cycle,
                               tracks seasonal behavior.
    """
    frame['year_group'] = frame['id'] // 365
    frame['temperature_range'] = frame['maxtemp'] - frame['mintemp']
    frame['seasonal_sin'] = np.sin(2 * np.pi * frame['day'] / 365)


# Apply exactly the same feature engineering to both data sets.
for _frame in (train, test):
    _add_basic_features(_frame)

# Every remaining column of the training frame (in column order) is a feature.
FEATURES = [name for name in train.columns if name not in RMV]

if __name__ == "__main__":
    print("Our features are:", FEATURES, sep="\n")

Our features are:
['day', 'pressure', 'maxtemp', 'temperature', 'mintemp', 'dewpoint', 'humidity', 'cloud', 'sunshine', 'winddirection', 'windspeed', 'year_group', 'temperature_range', 'seasonal_sin']


In [3]:
# Build one product column for every unordered pair of base features, on both
# the train and the test frame. INTERACT records the new column names in
# creation order so later cells can test them one at a time.

INTERACT = []
for pos, left in enumerate(FEATURES):
    # Pair `left` only with the features that come after it, so each pair
    # is produced once and no feature is multiplied by itself.
    for right in FEATURES[pos + 1:]:
        name = f"{left}_{right}"
        for frame in (train, test):
            frame[name] = frame[left] * frame[right]
        INTERACT.append(name)

if __name__ == "__main__":
    print(f"There are {len(INTERACT)} interaction features:", INTERACT, sep="\n")

There are 91 interaction features:
['day_pressure', 'day_maxtemp', 'day_temperature', 'day_mintemp', 'day_dewpoint', 'day_humidity', 'day_cloud', 'day_sunshine', 'day_winddirection', 'day_windspeed', 'day_year_group', 'day_temperature_range', 'day_seasonal_sin', 'pressure_maxtemp', 'pressure_temperature', 'pressure_mintemp', 'pressure_dewpoint', 'pressure_humidity', 'pressure_cloud', 'pressure_sunshine', 'pressure_winddirection', 'pressure_windspeed', 'pressure_year_group', 'pressure_temperature_range', 'pressure_seasonal_sin', 'maxtemp_temperature', 'maxtemp_mintemp', 'maxtemp_dewpoint', 'maxtemp_humidity', 'maxtemp_cloud', 'maxtemp_sunshine', 'maxtemp_winddirection', 'maxtemp_windspeed', 'maxtemp_year_group', 'maxtemp_temperature_range', 'maxtemp_seasonal_sin', 'temperature_mintemp', 'temperature_dewpoint', 'temperature_humidity', 'temperature_cloud', 'temperature_sunshine', 'temperature_winddirection', 'temperature_windspeed', 'temperature_year_group', 'temperature_temperature_range

In [5]:
from sklearn.model_selection import KFold, GroupKFold
from sklearn.metrics import roc_auc_score
from sklearn.svm import SVC, LinearSVC  # Use sklearn's SVC or LinearSVC

# Testing above features for best performance (change model as needed).

if __name__ == "__main__":
    # Greedy forward feature selection.
    #
    # Starting from the base FEATURES, each interaction column in INTERACT is
    # tentatively added (in order). It is kept only if the out-of-fold ROC AUC
    # of a LinearSVC strictly improves on the best score seen so far.
    #
    # Results left in the namespace:
    #   ADD       -- interaction columns that were kept
    #   best_auc  -- best out-of-fold AUC
    #   best_oof  -- out-of-fold scores for `train` at the best step
    #   best_pred -- fold-averaged scores for `test` at the best step
    ADD = []
    best_auc = 0
    best_oof = None
    best_pred = None

    # Loop invariants: one fold per distinct year_group, so every fold holds
    # out whole groups and no group is split between train and validation.
    FOLDS = train.year_group.nunique()
    kf = GroupKFold(n_splits=FOLDS)
    true = train.rainfall.values

    # FORWARD FEATURE SELECTION
    for col in ['baseline'] + INTERACT:

        oof_svc = np.zeros(len(train))
        pred_svc = np.zeros(len(test))

        # 'baseline' is a sentinel for "no extra column"; it is never put in ADD.
        if col != 'baseline':
            ADD.append(col)
        columns = FEATURES + ADD

        train_x = train[columns]
        train_y = train["rainfall"]
        test_x = test[columns]

        # GROUP K FOLD USING YEAR AS GROUP
        for train_index, test_index in kf.split(train, groups=train.year_group):
            # TRAIN AND VALID DATA
            # kf.split yields *positional* indices, so rows are taken with
            # .iloc; label-based .loc is only equivalent for a default
            # RangeIndex and would pick wrong rows (or fail) otherwise.
            x_train = train_x.iloc[train_index].copy()
            y_train = train_y.iloc[train_index]
            x_valid = train_x.iloc[test_index].copy()
            x_test = test_x.copy()

            # SVC WANTS STANDARDIZED FEATURES
            # Statistics come from the training fold only and are then applied
            # to the validation and test rows.
            for c in columns:
                mean = x_train[c].mean()
                std = x_train[c].std()
                x_train[c] = (x_train[c] - mean) / std
                x_valid[c] = (x_valid[c] - mean) / std
                x_test[c] = (x_test[c] - mean) / std
                # After standardization 0 is the training-fold mean, so this
                # fills missing test values with that mean.
                x_test[c] = x_test[c].fillna(0)

            # TRAIN SVC MODEL
            # LinearSVC does not support `predict_proba`, so we use decision_function to get scores
            model = LinearSVC(C=0.1)
            model.fit(x_train.values, y_train.values)

            # INFER OOF
            # The logistic squashing is monotonic, so it does not change the
            # AUC ranking; it only maps the margins into (0, 1). These are
            # not calibrated probabilities.
            decision_values = model.decision_function(x_valid.values)
            oof_svc[test_index] = 1 / (1 + np.exp(-decision_values))  # Logistic transformation

            # INFER TEST
            decision_values_test = model.decision_function(x_test.values)
            pred_svc += 1 / (1 + np.exp(-decision_values_test))  # Logistic transformation

        # COMPUTE AVERAGE TEST PREDS
        pred_svc /= FOLDS

        # COMPUTE CV VALIDATION AUC SCORE
        auc = roc_auc_score(true, oof_svc)

        if auc > best_auc:
            print(f"NEW BEST with {col} at {auc}")
            best_auc = auc
            best_oof = oof_svc.copy()
            best_pred = pred_svc.copy()
        else:
            print(f"Worse with {col} at {auc}")
            # Roll back the tentative addition. The 'baseline' sentinel was
            # never appended, so removing it unconditionally would raise.
            if col in ADD:
                ADD.remove(col)

NEW BEST with baseline at 0.8918092031425365
Worse with day_pressure at 0.8918047138047138
NEW BEST with day_maxtemp at 0.8919416386083052
Worse with day_temperature at 0.8918395061728395
Worse with day_mintemp at 0.8919068462401797
Worse with day_dewpoint at 0.8918563411896745
Worse with day_humidity at 0.8918350168350168
Worse with day_cloud at 0.8918002244668912
NEW BEST with day_sunshine at 0.8920336700336701
NEW BEST with day_winddirection at 0.8928563411896744
Worse with day_windspeed at 0.8927710437710439
NEW BEST with day_year_group at 0.8931705948372614
Worse with day_temperature_range at 0.8930112233445566
Worse with day_seasonal_sin at 0.892993265993266
NEW BEST with pressure_maxtemp at 0.8931728395061728
Worse with pressure_temperature at 0.8931593714927049
NEW BEST with pressure_mintemp at 0.8931919191919191
Worse with pressure_dewpoint at 0.8931840628507295
Worse with pressure_humidity at 0.8931818181818181
Worse with pressure_cloud at 0.8931537598204264
Worse with pressu

In [7]:
# Summarise the forward-selection result: best CV AUC and the kept interactions.
if __name__ == "__main__":
    print(
        f"We achieved CV SVC AUC = {best_auc:.4f} adding {len(ADD)} interactions features:",
        ADD,
        sep="\n",
    )

We achieved CV SVC AUC = 0.8955 adding 22 interactions features:
['day_maxtemp', 'day_sunshine', 'day_winddirection', 'day_year_group', 'pressure_maxtemp', 'pressure_mintemp', 'pressure_temperature_range', 'maxtemp_temperature', 'maxtemp_dewpoint', 'maxtemp_year_group', 'maxtemp_seasonal_sin', 'temperature_dewpoint', 'temperature_cloud', 'temperature_winddirection', 'temperature_windspeed', 'temperature_year_group', 'mintemp_winddirection', 'dewpoint_winddirection', 'dewpoint_year_group', 'humidity_year_group', 'cloud_windspeed', 'windspeed_year_group']


In [9]:
# Keep only the interaction columns chosen by the forward selection and drop
# every other interaction column from both frames.
#
# The list is hard-coded (copied from the selection output) so this cell does
# not require re-running the expensive selection loop. Names must match the
# columns created for INTERACT exactly.

ADD = ['day_maxtemp', 'day_sunshine', 'day_winddirection', 'day_year_group', 'pressure_maxtemp', 'pressure_mintemp', 'pressure_temperature_range', 'maxtemp_temperature', 'maxtemp_dewpoint', 'maxtemp_year_group', 'maxtemp_seasonal_sin', 'temperature_dewpoint', 'temperature_cloud', 'temperature_winddirection', 'temperature_windspeed', 'temperature_year_group', 'mintemp_winddirection', 'dewpoint_winddirection', 'dewpoint_year_group', 'humidity_year_group', 'cloud_windspeed', 'windspeed_year_group']

# Fail loudly on a misspelled name: an unknown entry would otherwise be
# ignored by the set difference and the intended column silently dropped.
unknown = [name for name in ADD if name not in INTERACT]
if unknown:
    raise ValueError(f"Selected features not found among interaction features: {unknown}")

selected = set(ADD)
difference = [name for name in INTERACT if name not in selected]

# errors="ignore" keeps the cell re-runnable after the columns are already gone.
train = train.drop(columns=difference, errors="ignore")
test = test.drop(columns=difference, errors="ignore")
