In [1]:
import tensorflow as tf

# Turn on memory growth for every visible GPU, then report how many physical
# and logical GPU devices TensorFlow lists.
gpus = tf.config.list_physical_devices('GPU')
if gpus:
  try:
    # The same memory-growth setting has to be applied to each GPU
    for device in gpus:
      tf.config.experimental.set_memory_growth(device, True)
    logical_gpus = tf.config.list_logical_devices('GPU')
    print(len(gpus), "Physical GPUs,", len(logical_gpus), "Logical GPUs")
  except RuntimeError as err:
    # Raised when memory growth is set after the GPUs were already initialized;
    # the message is shown instead of aborting the cell
    print(err)

1 Physical GPUs, 1 Logical GPUs


First, we will install the numerapi library, which provides a Python interface to the Numerai API.

In [None]:
!pip install numerapi


In [1]:
# import dependencies
import pandas as pd
import numerapi
import sklearn.linear_model

Next, we will authenticate our API key.

Now, we will download the current tournament data.

In [None]:
# Download current round data
# `napi` is otherwise only bound further down the notebook, so on a fresh
# kernel this cell would fail with NameError. Create a client here when none
# exists yet; an already-configured client is left untouched.
# NOTE(review): assumes the dataset download works without API keys - confirm.
if "napi" not in globals():
    napi = numerapi.NumerAPI()

# The archive is downloaded once; the duplicated download was removed.
data_path = "numerai_dataset.zip"
napi.download_current_dataset(dest_path=data_path)


The data is in a compressed format, so we will extract it.

# Unpack every member of the downloaded archive into the working directory
import zipfile

with zipfile.ZipFile(data_path, mode="r") as archive:
    archive.extractall(path=".")


Next, we load the training and test datasets.

In [2]:
import os
from pathlib import Path

import pandas as pd

# Directory that holds traindata.csv and testdata.csv. Set the
# NUMERAI_DATA_DIR environment variable to point at another location; when it
# is unset, the original download folder is used so existing runs still work.
DATA_DIR = Path(os.environ.get("NUMERAI_DATA_DIR", r'C:\Users\Sortolng\Downloads'))

# Load data
train_df = pd.read_csv(DATA_DIR / 'traindata.csv')
test_df = pd.read_csv(DATA_DIR / 'testdata.csv')

The training data contains 501,808 rows and 314 columns, where the first column is the id, the second column is the era, and the last column is the target variable, which we will try to predict.

We will print the first few rows of the training data to get an idea of the format.

In [3]:
# Show the first few rows. Leaving the frame as the cell's last expression lets
# Jupyter render it as a table instead of the wrapped plain text of print().
train_df.head()


   Unnamed: 0                id   era data_type  feature_intelligence1  \
0           0  n000315175b67977  era1     train                   0.00   
1           1  n0014af834a96cdd  era1     train                   0.00   
2           2  n001c93979ac41d4  era1     train                   0.25   
3           3  n0034e4143f22a13  era1     train                   1.00   
4           4  n00679d1a636062f  era1     train                   0.25   

   feature_intelligence2  feature_intelligence3  feature_intelligence4  \
0                   0.50                   0.25                   0.00   
1                   0.00                   0.00                   0.25   
2                   0.50                   0.25                   0.25   
3                   0.00                   0.00                   0.50   
4                   0.25                   0.25                   0.25   

   feature_intelligence5  feature_intelligence6  ...  feature_wisdom38  \
0                    0.5            

Next, we will select the feature columns so that the features can be separated from the target variable.

In [4]:
# keep the column labels whose name begins with 'feature'
feature_cols = train_df.columns[
    [label.startswith('feature') for label in train_df.columns]
]

In [5]:
# all training rows, restricted to the feature columns
training_features = train_df.loc[:, feature_cols]

In [6]:
# Baseline model: linear regression fitted on the training features (~30 sec to run)
import sklearn.linear_model
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error

model0 = LinearRegression().fit(training_features, train_df.target)

# In-sample predictions: the model scores the same rows it was fitted on
y_pred_train = model0.predict(training_features)

# Mean squared error between the training targets and those predictions
mse_train = mean_squared_error(y_true=train_df.target, y_pred=y_pred_train)
print("Training MSE:", mse_train)

Training MSE: 0.049711504080342465


Generate your first predictions from the Linear regression model
Now that we have a trained model, we can use it to make predictions on the tournament data.

In [7]:
# tournament rows, restricted to the feature columns used for fitting
live_features = test_df.loc[:, feature_cols]

# score the tournament rows with the fitted linear model
predictions = model0.predict(X=live_features)
     

In [8]:
# the submission frame needs exactly two columns: `id` and `prediction`
predictions_df = test_df["id"].to_frame().assign(prediction=predictions)
predictions_df.head(10)
     

Unnamed: 0,id,prediction
0,n0003aa52cab36c2,0.481608
1,n000920ed083903f,0.492837
2,n0038e640522c4a6,0.530817
3,n004ac94a87dc54b,0.497083
4,n0052fe97ea0c05f,0.503089
5,n00a5ccf3b6b2870,0.502672
6,n00bf78d0bbbc1b6,0.511966
7,n00c6fd95ff0c83e,0.497699
8,n00cd56868258aec,0.492206
9,n00e7d6fb71ef69f,0.476394


Comparing linear regression with other models: random forest regression and support vector regression

In [22]:
#import libraries
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.preprocessing import StandardScaler


def report_metrics(name, y_true, y_pred):
    """Print MAE, RMSE, MSE and R^2 of `y_pred` against `y_true`.

    Parameters
    ----------
    name : str
        Heading printed before the metrics.
    y_true : array-like
        Known target values.
    y_pred : array-like
        Predictions for the same rows, in the same order as `y_true`.
    """
    mse = mean_squared_error(y_true, y_pred)
    print(name)
    print('MAE:', mean_absolute_error(y_true, y_pred))
    # RMSE is derived from the MSE instead of passing `squared=False`, a
    # keyword that newer scikit-learn releases deprecate.
    print('RMSE:', mse ** 0.5)
    print('MSE:', mse)
    print('R^2:', r2_score(y_true, y_pred))


# Hold out part of the labelled training data for evaluation. The metrics were
# previously computed between `train_df.target` and predictions made on
# `live_features`, which are different rows, so the scores were not meaningful.
# NOTE(review): a random split mixes rows of the same era into both parts;
# an era-based split may be more appropriate - confirm.
X_fit, X_val, y_fit, y_val = train_test_split(
    training_features, train_df.target, test_size=0.2, random_state=42
)

# Random forest regression
rf = RandomForestRegressor(n_estimators=100, random_state=42, n_jobs=-1)
rf.fit(X_fit, y_fit)
rf_val_pred = rf.predict(X_val)
# predictions on the tournament rows, kept under the original name
rf_pred = rf.predict(live_features)

# Support vector regression
# NOTE(review): kernel SVR is expected to be very slow on a training set of
# this size; consider fitting it on a subsample.
svm = SVR(kernel='rbf')
svm.fit(X_fit, y_fit)
svm_val_pred = svm.predict(X_val)
# predictions on the tournament rows, kept under the original name
svm_pred = svm.predict(live_features)

# Evaluate the performance of the models on the held-out rows
report_metrics('Random Forest Regression', y_val, rf_val_pred)
print()
report_metrics('Support Vector Regression', y_val, svm_val_pred)

#comparing with another model and testing from the train data

The code above outputs the MAE, RMSE, MSE, and R-squared scores for both the random forest regression and SVM models. You can adjust the hyperparameters of the models as needed to improve performance.

Make your first submission for the first model (linear regression)
To enter the tournament, we must submit the predictions back to Numerai. We will use the numerapi library to do this.

In [9]:
# Get your API keys and model_id from https://numer.ai/notebook
# Credentials are read from the environment so that no secret is stored in the
# notebook. Set NUMERAI_PUBLIC_ID, NUMERAI_SECRET_KEY and NUMERAI_MODEL_ID
# before starting the kernel; a missing variable raises KeyError here.
# NOTE(review): the key pair that used to be written out in this cell should be
# treated as exposed and revoked.
import os

public_id = os.environ["NUMERAI_PUBLIC_ID"]
secret_key = os.environ["NUMERAI_SECRET_KEY"]
model_id = os.environ["NUMERAI_MODEL_ID"]
napi = numerapi.NumerAPI(public_id=public_id, secret_key=secret_key)

In [10]:
# Write the predictions to a CSV file without the index column, then submit
# that file for the configured model
predictions_df.to_csv(path_or_buf="predictions.csv", index=False)
submission_id = napi.upload_predictions(
    "predictions.csv",
    model_id=model_id,
)

2023-02-23 23:27:57,759 INFO numerapi.base_api: uploading predictions...
2023-02-23 23:34:47,868 ERROR numerapi.base_api: You must provide predictions for the current live IDs. Make sure you are using the latest live data.


ValueError: You must provide predictions for the current live IDs. Make sure you are using the latest live data.

The error above has not been resolved yet, even though I downloaded the latest data. The same error is reported in this forum thread: https://forum.numer.ai/t/faced-an-error-while-uploading-predictions/6069