In [12]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import OneHotEncoder
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split

train = pd.read_csv('data/train_data.csv')
test = pd.read_csv('data/test_data.csv')

# remove nans from train
train = train.dropna(axis=0, how='any')
train = train[(train["AMPS"] <= 1) & (train["AMPS"] >= 0)]
train = train.reset_index(drop=True)
# convert unknown to C or K
for i in range(len(train)):
    if train.iloc[i]['UNIT'] == '?':
        if train.iloc[i]['TEMP'] > -98 and train.iloc[i]['TEMP'] < 102:
            train.at[i, 'UNIT'] = train.at[i, 'UNIT'].replace('?', 'C')
        else:
            train.at[i, 'UNIT'] = train.at[i, 'UNIT'].replace('?', 'K')

train.loc[train["UNIT"] == "K", "TEMP"] = train.loc[train["UNIT"] == "K", "TEMP"] - 273.15
train.loc[train["UNIT"] == "K", "UNIT"] = "C"

# mean of TEMP where UNIT is C
train = train.drop(['UNIT'], axis=1)
test = test.drop(['UNIT'], axis=1)

# One hot encoding
# Train
ohe = OneHotEncoder(handle_unknown='ignore')
encoded_cols = ohe.fit_transform(train[['MODE', 'POWER']]).toarray()
encoded_df = pd.DataFrame(encoded_cols, columns=ohe.get_feature_names(['MODE', 'POWER']))
train = pd.concat([train, encoded_df], axis=1)

train = train.drop(['MODE', 'POWER'], axis=1)
# Test
encoded_cols = ohe.transform(test[['MODE', 'POWER']]).toarray()
encoded_df = pd.DataFrame(encoded_cols, columns=ohe.get_feature_names(['MODE', 'POWER']))
test = pd.concat([test, encoded_df], axis=1)
test = test.drop(['MODE', 'POWER'], axis=1)

# split train into x and y take the OUTPUT column as y
y_train = train['OUTPUT']
x_train = train.drop(['OUTPUT'], axis=1)

from sklearn.utils import shuffle

x_train, y_train = shuffle(x_train, y_train)

x_test = test

In [13]:

from sklearn.preprocessing import PolynomialFeatures
poly = PolynomialFeatures(3)
print(train.shape)
new_x = poly.fit_transform(x_train)
print(new_x.shape)

(971, 12)
(971, 364)


In [14]:
X_train, X_val, y_train, y_val = train_test_split(new_x,y_train, test_size=0.2)

In [15]:
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import mean_squared_error

RANDOM_SEED = 42

xgb_reg = XGBRegressor(objective='reg:squarederror', seed=RANDOM_SEED)

params = {
    'max_depth': range(4, 9),
    'n_estimators': [25, 50, 100, 200],
    'gamma': [i/10.0 for i in range(0,10)],
    'learning_rate': [0.05, 0.1, 0.5],
    'subsample': [0.5, 0.7, 1.0],
    'colsample_bytree': [0.5, 0.7, 1.0],
    'reg_alpha': [0, 0.5, 1],
    'reg_lambda': [1, 1.5, 2, 3, 4.5]
}

grid_search = GridSearchCV(estimator=xgb_reg, param_grid=params, cv=3, n_jobs=-1, scoring='neg_mean_squared_error')
grid_search.fit(X_train,y_train)

print("Best parameters found: ", grid_search.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_search.best_score_)))

xgb_reg = XGBRegressor(objective='reg:squarederror', seed=RANDOM_SEED, **grid_search.best_params_)
xgb_reg.fit(X_train,y_train)

predictions = xgb_reg.predict(X_val)

rmse = np.sqrt(mean_squared_error(y_val, predictions))
print("RMSE on test set: %f" % (rmse))

KeyboardInterrupt: 

In [None]:
x_test = poly.transform(x_test)

yhat = xgb_reg.predict(x_test)
# write y_hat to a txt file with one entry per line
with open('y_hat.txt', 'w') as f:
    for item in yhat:
        f.write("%s \n" % item)