In [None]:
import itertools

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LinearRegression
from sklearn.metrics import f1_score

In [None]:
# Read train.csv into `data` and test.csv into `test` (paths are relative to
# the notebook's working directory).
data, test = (pd.read_csv(csv_name) for csv_name in ('train.csv', 'test.csv'))

In [None]:
# Display pandas' default summary statistics for the `test` frame.
test.describe()

In [None]:
# Label each row 1 when the next row's close is strictly higher than this
# row's close, else 0. The final row has no successor to compare against,
# so it is dropped once the labels are computed.
# The guard keeps this cell idempotent: without it, every re-run would
# relabel the already-truncated frame and silently drop one more row.
# NOTE(review): assumes test.csv does not ship its own 'target' column and
# that rows are in chronological order — confirm.
if 'target' not in test.columns:
    next_close = test['close'].shift(-1)
    test = test.assign(target=(next_close > test['close']).astype(int)).iloc[:-1]

In [None]:
# Create the wide figure *before* drawing. Calling plt.figure() after the
# scatter opens a second, empty figure and leaves the actual plot at the
# default size.
plt.figure(figsize=(20, 6))
plt.scatter(data['timestamp'], data['high'])
plt.xlabel('Timestamp')
plt.ylabel('Highest price during minute')
plt.show()

In [None]:
# Scatter the training labels against the timestamp column to see how the
# two classes are spread over the sampled period.
ax = plt.gca()
ax.scatter(data['timestamp'], data['target'])
ax.set(xlabel='Timestamp', ylabel='1 if high > prev minute, 0 else')
plt.show()

In [None]:
# Fit a straight line of `high` against `timestamp` on the training frame,
# then evaluate that line at the test timestamps.
# The timestamp column is reshaped to a single-column 2-D array before being
# handed to the estimator.
time = data['timestamp'].to_numpy().reshape(-1, 1)
target = data['high']
model = LinearRegression().fit(time, target)
y_pred = model.predict(test['timestamp'].to_numpy().reshape(-1, 1))

In [None]:
# Mean squared error between the observed `high` values in `test` and the
# linear model's predictions.
y_actual = test['high']
squared_error = (y_actual - y_pred) ** 2
mse = squared_error.mean()
print(mse)

In [None]:
# Observed `high` values of the test frame over time, for visual comparison
# with the model output.
ax = plt.gca()
ax.scatter(test['timestamp'], test['high'])
ax.set_title('actual test scatter')
plt.show()

In [None]:
# The values stored in `y_pred`, plotted over the same test timestamps.
plt.scatter(x=test['timestamp'], y=y_pred)
plt.title(label='predicted stock high per minute vs time')
plt.show()

In [None]:
# Random forest trained on three price columns plus the taker-buy base
# volume; random_state is pinned so repeated fits give the same model.
preds = ["close", "high", "low", "taker_buy_base_volume"]
model = RandomForestClassifier(
    n_estimators=100,
    min_samples_split=100,
    random_state=1,
)
model.fit(data[preds], data["target"])

In [None]:
# Score the current classifier on the labelled test frame with
# macro-averaged F1.
# The former extra prediction on `temp_test` was removed: no earlier cell
# defines that name (NameError on a fresh kernel), and its result was
# immediately overwritten by the prediction on `test` anyway.
est = model.predict(test[preds])
f1_macro = f1_score(test["target"], est, average='macro')

print(" F1 Score:", f1_macro)

In [None]:
# Refit the forest with the same settings, restricted to the three price
# columns only.
preds = ["high", "low", "close"]
forest_settings = dict(n_estimators=100, min_samples_split=100, random_state=1)
model = RandomForestClassifier(**forest_settings)
model.fit(data[preds], data["target"])

In [None]:
# Predict with the current model and write one row per prediction to
# predictions.csv, keyed by a zero-based running row_id.
est = model.predict(test[preds])

submission_columns = {
    'row_id': range(len(est)),
    'predicted_target': est,
}
prediction_df = pd.DataFrame(submission_columns)
prediction_df.to_csv('predictions.csv', index=False)

In [None]:
# Candidate feature columns for the subset search, plus the train / test
# feature frames and label columns selected from `data` and `test`.
# (`itertools` is imported in the notebook's top import cell rather than here.)
all_features = [
    'open', 'high', 'low', 'close', 'volume',
    'quote_asset_volume', 'number_of_trades',
    'taker_buy_base_volume', 'taker_buy_quote_volume',
]
target_column = 'target'
X_train, y_train = data[all_features], data[target_column]
X_test, y_test = test[all_features], test[target_column]

def evaluate_feature_subsets(X_train, y_train, X_test, y_test, features):
    """Exhaustively score every non-empty subset of ``features``.

    For each subset a fresh ``RandomForestClassifier`` (``n_estimators=100``,
    ``min_samples_split=100``, ``random_state=1``) is fitted on the matching
    columns of ``X_train`` and scored with macro-averaged F1 on the matching
    columns of ``X_test``.

    Caveats:
        * One forest is fitted per subset, i.e. ``2 ** len(features) - 1``
          fits in total.
        * The winning subset is chosen by its score on the very split passed
          as ``X_test`` / ``y_test``, so the returned best score is
          optimistically biased for that split.

    Args:
        X_train: Training features, indexable by a list of column names.
        y_train: Training labels.
        X_test: Evaluation features, indexable by a list of column names.
        y_test: Evaluation labels.
        features: Sized sequence of candidate column names.

    Returns:
        A ``(best_subset, best_score, scores_dict)`` tuple. ``best_subset`` is
        the list of column names with the highest score (the first one
        encountered wins ties), or ``None`` when ``features`` is empty;
        ``best_score`` is its score (``0`` when ``features`` is empty);
        ``scores_dict`` maps each subset, as a tuple of names, to its score.
    """
    best_score = 0
    best_subset = None
    scores_dict = {}
    for size in range(1, len(features) + 1):
        for combo in itertools.combinations(features, size):
            columns = list(combo)
            model = RandomForestClassifier(n_estimators=100, min_samples_split=100, random_state=1)
            model.fit(X_train[columns], y_train)
            score = f1_score(y_test, model.predict(X_test[columns]), average='macro')
            scores_dict[combo] = score
            # Accept the first evaluated subset unconditionally so that a run in
            # which every subset scores exactly 0 still reports a subset
            # instead of None; afterwards only strict improvements replace it.
            if best_subset is None or score > best_score:
                best_score = score
                best_subset = columns
    return best_subset, best_score, scores_dict
# Run the subset search over every candidate column and report the winner.
best_features, best_f1, all_scores = evaluate_feature_subsets(
    X_train, y_train, X_test, y_test, features=all_features
)

print("Best subset:", best_features)
print("F1:", best_f1)


In [None]:
# Round-trip the exported predictions through pandas, discarding rows in
# which every column is empty, then print the row counts of `test` and of
# the rewritten file's frame.
df = pd.read_csv('predictions.csv').dropna(how='all')
df.to_csv('predictions.csv', index=False)
for frame in (test, df):
    print( len(frame))