# In [46]:
import pandas as pd
import xgboost as xgb
import json

# Train an XGBoost classifier on the cleaned training set and write the
# predicted label for every test row to ./data/predsGB.json as
# {"target": {"<test_idx>": <label>, ...}}.

# Read datasets
train_source = pd.read_csv("./data/train_cleaned.csv")
test_source = pd.read_csv("./data/testing_set.csv")

# Work on copies so the source frames stay untouched; test_source['test_idx']
# is still needed below to key the predictions.
train_data = train_source.copy()
test_data = test_source.copy()

# Preprocess the Time column, creating 3 new columns: day, month and year.
# The column is parsed once per frame instead of once per derived column.
for frame in (train_data, test_data):
    timestamps = pd.to_datetime(frame['Time'])
    frame['day'] = timestamps.dt.day
    frame['month'] = timestamps.dt.month
    frame['year'] = timestamps.dt.year

# Remove the columns that are not used as features: the row identifiers, the
# raw Time column and (training set only) valid, outlier and valid2.
train_data = train_data.drop(['train_idx', 'valid', 'outlier', 'valid2', "Time"], axis=1)
test_data = test_data.drop(['test_idx', "Time"], axis=1)

# Split the training dataset into features (X) and target variable (y)
X_train = train_data.drop('label', axis=1)
y_train = train_data['label']

# Create a Gradient Boosting Classifier with the library's default settings
model = xgb.XGBClassifier()

# Train the model
model.fit(X_train, y_train)

# Make predictions on the test dataset. The test features are selected by the
# training column names so they are passed in exactly the training order; a
# training feature missing from the test file raises KeyError here, and any
# extra test-only columns are not passed to the model.
test_features = test_data[X_train.columns]
predictions = model.predict(test_features)

# Create the dictionary with test_idx as keys and predicted labels as values.
# int() turns each predicted label into a plain Python int so that json can
# serialise it.
predictions_dict = {
    str(idx): int(label)
    for idx, label in zip(test_source['test_idx'], predictions)
}

# Create the final JSON structure
output = {"target": predictions_dict}

# Save the JSON to a file
with open('./data/predsGB.json', 'w', encoding='utf-8') as json_file:
    json.dump(output, json_file)
