# In [7]:
import pandas as pd
import ast
import numpy as np
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score
import matplotlib.pyplot as plt

# Read the three splits from disk.
# NOTE(review): read_csv treats the first row of each file as a header and
# the names are overwritten just below -- confirm the files really have a
# header row, otherwise the first sample of each split is silently dropped.
train_data, valid_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train.csv', 'validation.csv', 'test.csv')
)

# Give the columns fixed names; the test split carries no label column.
labelled_columns = ['features', 'label']
train_data.columns = labelled_columns
valid_data.columns = list(labelled_columns)
test_data.columns = ['features']


def _to_array(text):
    """Turn the string form of one feature vector into a numpy array."""
    return np.array(ast.literal_eval(text))


# Each 'features' cell is a string holding a Python literal; replace it
# in place with the parsed array.
train_data['features'] = train_data['features'].apply(_to_array)
valid_data['features'] = valid_data['features'].apply(_to_array)
test_data['features'] = test_data['features'].apply(_to_array)

# Stack the per-row arrays into one matrix per split (one row per sample).
X_train = np.stack(train_data['features'].to_numpy())
X_valid = np.stack(valid_data['features'].to_numpy())
X_test = np.stack(test_data['features'].to_numpy())

# Raw labels as numpy arrays.
y_train = train_data['label'].to_numpy()
y_valid = valid_data['label'].to_numpy()

# Shift the labels from -1, 0, 1 to 0, 1, 2 for the classifier.
y_train_adjusted = y_train + 1
y_valid_adjusted = y_valid + 1

# Define the XGBoost model.
# Early stopping is configured on the estimator rather than in fit():
# passing `early_stopping_rounds` to fit() has been deprecated since
# XGBoost 1.6 and is rejected by current releases, so the original call
# fails there before any training happens.
model = XGBClassifier(
    objective='multi:softmax',
    num_class=3,
    eval_metric='mlogloss',
    early_stopping_rounds=10,
)

# Train the model and record the evaluation history.
# Both splits are scored after every boosting round; when eval_set has
# several entries XGBoost uses the last one (validation) for early stopping.
eval_set = [(X_train, y_train_adjusted), (X_valid, y_valid_adjusted)]
# fit() returns the fitted estimator itself, so `history` is the same object
# as `model`; the name is kept because the plotting code below reads
# history.evals_result().
history = model.fit(X_train, y_train_adjusted, eval_set=eval_set, verbose=True)

# Evaluate the model on the shifted (0, 1, 2) labels it was trained with.
train_pred = model.predict(X_train)
valid_pred = model.predict(X_valid)
train_accuracy = accuracy_score(y_train_adjusted, train_pred)
valid_accuracy = accuracy_score(y_valid_adjusted, valid_pred)

print(f'Train accuracy: {train_accuracy:.4f}')
print(f'Validation accuracy: {valid_accuracy:.4f}')

# Bar chart of the fitted model's per-feature importance scores.
importances = model.feature_importances_
importance_fig, importance_ax = plt.subplots(figsize=(10, 8))
importance_ax.bar(range(len(importances)), importances)
importance_ax.set_xlabel('Feature Index')
importance_ax.set_ylabel('Feature Importance')
importance_ax.set_title('Feature Importance Plot')
plt.show()

# Training and validation log loss per boosting round (this is a loss
# curve, not accuracy). 'validation_0' and 'validation_1' are the keys
# XGBoost assigns to the first and second eval_set entries.
eval_history = history.evals_result()
train_loss = eval_history['validation_0']['mlogloss']
valid_loss = eval_history['validation_1']['mlogloss']
epochs = len(train_loss)
rounds = range(epochs)

loss_fig, loss_ax = plt.subplots(figsize=(10, 6))
loss_ax.plot(rounds, train_loss, label='Train')
loss_ax.plot(rounds, valid_loss, label='Validation')
loss_ax.set_xlabel('Epoch')
loss_ax.set_ylabel('Log Loss')
loss_ax.set_title('Training and Validation Log Loss')
loss_ax.legend()
plt.show()

# Predict labels for the test set
y_pred = model.predict(X_test)

# Adjust the predicted labels back from 0, 1, 2 to -1, 0, 1
y_pred_adjusted = y_pred - 1
test_data['predicted_label'] = y_pred_adjusted

# Save the predictions in the same format as train.csv.
# By this point the 'features' column holds numpy arrays. Writing them
# directly would emit numpy's str() form: space-separated, line-wrapped and
# shortened with '...' for long vectors, so the file could not be parsed back
# with ast.literal_eval and could lose data. Serialise each vector as a
# Python list literal instead, working on a copy so test_data itself keeps
# its arrays.
predictions_out = test_data[['features', 'predicted_label']].copy()
predictions_out['features'] = predictions_out['features'].apply(
    lambda vector: str(np.asarray(vector).tolist())
)
predictions_out.to_csv('test_predicted.csv', index=False)
