In [11]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor
from sklearn.metrics import accuracy_score, mean_squared_error
import numpy as np

# --- Training data: load, encode the date, separate features from targets ---
# Read the raw training table from disk.
train_data = pd.read_csv('train.csv')

# Parse 'Date' and collapse it into one number of the form
# year * 10000 + month * 100 + day so it can serve as a numeric feature.
train_dates = pd.to_datetime(train_data['Date'])
train_data['Date'] = (
    train_dates.dt.year * 10000
    + train_dates.dt.month * 100
    + train_dates.dt.day
)

# Discard every row that contains at least one missing value.
train_data = train_data.dropna()

# Targets: 'Strategy' feeds the classifier, 'Close' feeds the regressor.
y_close = train_data['Close']
y_strategy = train_data['Strategy']

# Features: every column except the two targets.
X = train_data.drop(columns=['Strategy', 'Close'])

# Encode the 'Strategy' labels as integers for the classifier; the fitted
# encoder is reused later to turn predictions back into the original labels.
label_encoder = LabelEncoder()
y_strategy_encoded = label_encoder.fit_transform(y_strategy)

# Split into training and validation sets BEFORE scaling, so that validation
# rows never contribute to the scaler's mean/variance (avoids data leakage).
# The row assignment depends only on the number of rows and random_state,
# so it is the same partition as splitting an already-scaled matrix.
(
    X_train_raw,
    X_valid_raw,
    y_strategy_train,
    y_strategy_valid,
    y_close_train,
    y_close_valid,
) = train_test_split(
    X, y_strategy_encoded, y_close, test_size=0.2, random_state=42
)

# Standardize features: fit on the training rows only, then apply the same
# transformation to the validation rows.
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train_raw)
X_valid = scaler.transform(X_valid_raw)

# Full feature matrix under the train-fitted scaler. Nothing in this script
# reads it after this point; kept so code that refers to `X_scaled` still works.
X_scaled = scaler.transform(X)

# --- RandomForestClassifier: randomized hyperparameter search ---
# Candidate values for each hyperparameter.
param_dist_classifier = dict(
    n_estimators=np.arange(100, 501, 50),
    max_depth=np.arange(10, 51, 10),
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Sample 100 configurations and score each with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel, random_state makes the sampling
# reproducible.
random_search_classifier = RandomizedSearchCV(
    RandomForestClassifier(random_state=42),
    param_dist_classifier,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search_classifier.fit(X_train, y_strategy_train)

# Keep the best estimator found by the search for later predictions.
best_classifier = random_search_classifier.best_estimator_

# --- RandomForestRegressor: randomized hyperparameter search ---
# Candidate values for each hyperparameter.
param_dist_regressor = dict(
    n_estimators=np.arange(100, 501, 50),
    max_depth=np.arange(10, 51, 10),
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
)

# Sample 100 configurations and score each with 5-fold cross-validation;
# n_jobs=-1 runs the fits in parallel, random_state makes the sampling
# reproducible.
random_search_regressor = RandomizedSearchCV(
    RandomForestRegressor(random_state=42),
    param_dist_regressor,
    n_iter=100,
    cv=5,
    n_jobs=-1,
    random_state=42,
)
random_search_regressor.fit(X_train, y_close_train)

# Keep the best estimator found by the search for later predictions.
best_regressor = random_search_regressor.best_estimator_

# --- Validation-set evaluation ---
# Classification: predict the encoded 'Strategy' classes, keep a decoded copy
# of the predictions, and score against the encoded validation labels.
strategy_pred_valid = best_classifier.predict(X_valid)
decoded_strategy_pred_valid = label_encoder.inverse_transform(strategy_pred_valid)
accuracy = accuracy_score(y_strategy_valid, strategy_pred_valid)

# Regression: predict 'Close' and measure the mean squared error.
close_pred_valid = best_regressor.predict(X_valid)
mse = mean_squared_error(y_close_valid, close_pred_valid)

# Report both metrics rounded to two decimals.
print(f'Accuracy: {accuracy:.2f}')
print(f'Mean Squared Error: {mse:.2f}')

# --- Test-set inference and export ---
# Load the test data.
test_data = pd.read_csv('test.csv')

# Encode 'Date' exactly as was done for the training data:
# year * 10000 + month * 100 + day.
test_data['Date'] = pd.to_datetime(test_data['Date'])
test_data['Date'] = (
    test_data['Date'].dt.year * 10000
    + test_data['Date'].dt.month * 100
    + test_data['Date'].dt.day
)

# Standardize the test features with the scaler fitted on the training data.
# Select exactly the training feature columns, in training order, instead of
# passing the whole frame: this keeps any extra or reordered column in
# test.csv from being scaled with another feature's statistics, and fails
# with a clear KeyError if a training feature is missing.
X_test_scaled = scaler.transform(test_data[X.columns])

# Predict 'Strategy' and decode the integer classes back to the original labels.
strategy_pred = best_classifier.predict(X_test_scaled)
decoded_strategy_pred = label_encoder.inverse_transform(strategy_pred)

# Predict 'Close'.
close_pred = best_regressor.predict(X_test_scaled)

# Assemble the submission table; the dict order fixes the CSV column order.
# NOTE(review): 'Date' is written in its numeric encoded form, not as the
# original date string from test.csv -- confirm this is the expected format.
predictions_df = pd.DataFrame({
    'id': test_data['id'],  # Include 'id' column in the predictions
    'Date': test_data['Date'],
    'Close': close_pred,
    'Strategy': decoded_strategy_pred,
})

# Save predictions to a CSV file (without the DataFrame index).
predictions_df.to_csv('sample_submission7.csv', index=False)

print("Predictions saved to 'sample_submission7.csv'")


Accuracy: 0.72
Mean Squared Error: 16.27
Predictions saved to 'sample_submission7.csv'
