In [58]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Load train data
train_data = pd.read_csv("train.csv")

# Convert Date columns to YYYYMMDD format
train_data['Date'] = pd.to_datetime(train_data['Date'], format='%Y%m%d').dt.strftime('%Y%m%d')

# Encode the 'Strategy' column (target variable) using LabelEncoder
label_encoder = LabelEncoder()
train_data['Strategy_encoded'] = label_encoder.fit_transform(train_data['Strategy'])

# Select features and target variable
features = ['Open', 'Volume', 'Date']
target = 'Strategy_encoded'

# Split the data into features and target variable
X = train_data[features]
y = train_data[target]

# Split the data into training and validation sets (80% train, 20% test)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Initialize the RandomForestClassifier
classifier_model = RandomForestClassifier(random_state=42)

# Train the classifier using the training data
classifier_model.fit(X_train, y_train)

# Use the trained classifier to make predictions on the test data
test_predictions_classifier = classifier_model.predict(X_test)

# Calculate accuracy on the test set
accuracy = accuracy_score(y_test, test_predictions_classifier)
print("Accuracy Score on Test Set:", accuracy)

# Initialize the RandomForestRegressor
regressor_model = RandomForestRegressor(random_state=42)

# Train the regressor using the training data
regressor_model.fit(X_train, y_train)

# Use the trained regressor to make predictions on the test data
test_predictions_regressor = regressor_model.predict(X_test)

# Calculate mean squared error on the test set
mse = mean_squared_error(y_test, test_predictions_regressor)
print("Mean Squared Error on Test Set:", mse)


Accuracy Score on Test Set: 0.7166666666666667
Mean Squared Error on Test Set: 0.2979833333333334


In [60]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
import xgboost as xgb

from sklearn.preprocessing import LabelEncoder

# Load train data
train_data = pd.read_csv("train.csv")

# Convert Date column to numeric format (days since a reference date)
reference_date = pd.to_datetime("2000-01-01")
train_data['Date'] = (pd.to_datetime(train_data['Date'], format='%Y%m%d') - reference_date).dt.days

# Encode the 'Strategy' column (classification target variable) using LabelEncoder
label_encoder = LabelEncoder()
train_data['Strategy_encoded'] = label_encoder.fit_transform(train_data['Strategy'])

# Separate features and target variable
features = ['Open', 'Volume', 'Date']  # Include additional columns as needed
target_classification = 'Strategy_encoded'
target_regression = 'Close'

# Split the data into training (90%) and validation (10%) sets for both tasks
X_train, X_val, y_train_classification, y_val_classification, y_train_regression, y_val_regression = train_test_split(
    train_data[features],
    train_data[target_classification],
    train_data[target_regression],
    test_size=0.1,
    random_state=42
)

# Initialize XGBoost classifiers for classification and regression tasks
classifier_model = xgb.XGBClassifier(objective='multi:softmax', num_class=num_classes, random_state=42)
regressor_model = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

# Rest of the code remains the same

# Rest of the code remains the same


# Train the models
classifier_model.fit(X_train, y_train_classification)
regressor_model.fit(X_train, y_train_regression)

# Validate the models
val_predictions_classification = classifier_model.predict(X_val)
val_predictions_regression = regressor_model.predict(X_val)

# Calculate accuracy for classification
accuracy_classifier = accuracy_score(y_val_classification, val_predictions_classification)
print("Accuracy Score (Classification) on Validation Set:", accuracy_classifier)

# Calculate mean squared error for regression
mse = mean_squared_error(y_val_regression, val_predictions_regression)
print("Mean Squared Error (Regression) on Validation Set:", mse)

# Load test data
test_data = pd.read_csv("test.csv")

# Convert Date column to numeric format (days since a reference date)
test_data['Date'] = (pd.to_datetime(test_data['Date'], format='%Y-%m-%d') - reference_date).dt.days

# Handle non-numeric columns in test data (similar to the training data preprocessing)

# Make predictions for classification task
test_predictions_classification = classifier_model.predict(test_data[features])

# Make predictions for regression task
test_predictions_regression = regressor_model.predict(test_data[features])

# Create a DataFrame for the final predictions
predictions_df = pd.DataFrame({
    'id': test_data['id'],
    'Date': pd.to_datetime(test_data['Date'], format='%Y%m%d'),  # Convert back to original date format if needed
    'Close': test_predictions_regression,
    'Strategy': test_predictions_classification
})

# Store predictions in a CSV file
predictions_df.to_csv("predictions.csv", index=False)

print("Predictions saved to predictions.csv")


Accuracy Score (Classification) on Validation Set: 0.8
Mean Squared Error (Regression) on Validation Set: 9.706976783233207


ValueError: time data "7730" doesn't match format "%Y%m%d", at position 0. You might want to try:
    - passing `format` if your strings have a consistent format;
    - passing `format='ISO8601'` if your strings are all ISO8601 but not necessarily in exactly the same format;
    - passing `format='mixed'`, and the format will be inferred for each element individually. You might want to use `dayfirst` alongside this.