In [6]:
import torch
import csv
import pandas as pd

In [7]:
# Input NFT collection names and their floor prices, market volume
def nft_market_data(nft_name):
    df1 = pd.read_csv(nft_name +'_floor_price.csv')
    df2 = pd.read_csv(nft_name +'_market_volume.csv')
    market_volume = df2['market_volume'].tolist()
    floor_price = df1['floor_price'].tolist()
    
    floor_price.pop()
    market_volume.pop()
    return floor_price, market_volume

In [8]:
import os
import pandas as pd
import numpy as np

def nft_vadar_sentiment_score(nft_name):
    # Read in the vader folder and get all 
    nft_vadar_sentiment_score = []
    start_date = '2022-08-01'
    end_date = '2023-02-27'
    date_range = pd.date_range(start_date, end_date)
    
    for date in date_range:
        date_str = date.strftime('%Y-%m-%d')
        csv_file = 'vader/' + 'vader' + '_' + date_str + '.csv'
        # filter the Text column and get all the indices
        df = pd.read_csv(csv_file)
        # keep only rows whose 'Text' has the nft_name
        df = df[df['Text'].str.contains(nft_name)]
        # if there is no row with the nft_name, then the vader score is 0
        if df.empty:
            nft_vadar_sentiment_score.append(0)
            continue
        # get the vader column's compound score
        vader_col = df['vader']
        # extract the compound score
        # Extract the compound number from each row of the 'vader' column
        compound_list = [eval(vader)['compound'] for vader in vader_col]
        # # calculate the average compound score
        vader_avg = np.mean(compound_list)
        nft_vadar_sentiment_score.append(vader_avg)
    return nft_vadar_sentiment_score


In [38]:
# test the function
azuki_sentiment_scores = nft_vadar_sentiment_score('azuki')

In [39]:
# print azuki_sentiment_scores
print(len(azuki_sentiment_scores))

211


In [40]:
# print shape of azuki_sentiment_scores
print(np.shape(azuki_sentiment_scores))

(211,)


In [41]:
# print first 10 elements
print(azuki_sentiment_scores[:10])

[0.20656249999999998, 0.765, 0.2202, -0.29569999999999996, 0.0371, 0.40519999999999995, 0.37649999999999995, 0.4404, -0.0258, 0.6065999999999999]


In [42]:
import os
import pandas as pd
import numpy as np

def nft_tweets_textual_embeddings(nft_name, embedding_name):
    average_embeddings = []

    start_date = '2022-08-01'
    end_date = '2023-02-27'
    date_range = pd.date_range(start_date, end_date)
    sentiment_scores = nft_vadar_sentiment_score(nft_name)
    for date in date_range:
        date_str = date.strftime("%Y-%m-%d")
        csv_file = os.path.join(f"{date_str}.csv")
        embeddings = []
        if os.path.exists(csv_file):
            df = pd.read_csv(csv_file)
            keyword_indices = df[df['Text'].str.contains(nft_name, case=False)].index

            daily_embedding_folder = os.path.join(f"/home/qian/qian/CS6220-NFT-Celebrities-Twitter-Analysis-main/{embedding_name}/{date_str}")
            for index in keyword_indices:
                index_file = os.path.join(f"{daily_embedding_folder}/{index}.npy")
                if os.path.exists(index_file):
                    # keep only the first 768 dimensions
                    embedding = np.load(index_file)[..., :768]
                    embeddings.append(embedding)

            if len(embeddings) > 0:
                average_bert_embedding = np.mean(embeddings, axis=0)
                # append value of vader sentiment score
                # for i in range(len(average_bert_embedding)):
                #     average_bert_embedding = np.append(average_bert_embedding, sentiment_scores[i])
            else:
                average_bert_embedding = np.zeros((769,))
                
    
            average_embeddings.append(average_bert_embedding)
            # then for each date, we have a 768 dimension vector, now we need to append the vader sentiment score
            
            
    for i in range(len(average_embeddings)):
        average_embeddings[i] = np.append(average_embeddings[i], sentiment_scores[i])
    return average_embeddings


In [43]:
# test azuki bert embeddings
azuki_bert_embeddings = nft_tweets_textual_embeddings('azuki', 'bert_embeddings')

In [44]:
# print first 10 elements
print(azuki_bert_embeddings[0].shape)

(769,)


In [45]:
azuki_embeddings_tensor = torch.tensor(np.vstack(azuki_bert_embeddings), dtype=torch.float32)

In [46]:
# convert list to tensor
# test:nft_market_data('azuki')
azuki_price, azuki_volume = nft_market_data('azuki')

azuki_price_tensor = torch.tensor(azuki_price, dtype=torch.float32)
azuki_volume_tensor = torch.tensor(azuki_volume, dtype=torch.float32)

In [51]:
# get bayc data and convert to tensor
bayc_price, bayc_volume = nft_market_data('bayc')
bayc_price_tensor = torch.tensor(bayc_price, dtype=torch.float32)
bayc_volume_tensor = torch.tensor(bayc_volume, dtype=torch.float32)
bayc_embeddings = nft_tweets_textual_embeddings('bayc', 'bert_embeddings')
bayc_embeddings_tensor = torch.tensor(np.vstack(bayc_embeddings), dtype=torch.float32)

In [52]:
# mayc data and convert to tensor
mayc_price, mayc_volume = nft_market_data('mayc')
mayc_price_tensor = torch.tensor(mayc_price, dtype=torch.float32)
mayc_volume_tensor = torch.tensor(mayc_volume, dtype=torch.float32)
mayc_embeddings = nft_tweets_textual_embeddings('mayc', 'bert_embeddings')
mayc_embeddings_tensor = torch.tensor(np.vstack(mayc_embeddings), dtype=torch.float32)

In [114]:
# otherdeed data and convert to tensor
otherdeed_price, otherdeed_volume = nft_market_data('otherdeed_for_otherside')
otherdeed_price_tensor = torch.tensor(otherdeed_price, dtype=torch.float32)
otherdeed_volume_tensor = torch.tensor(otherdeed_volume, dtype=torch.float32)
otherdeed_embeddings = nft_tweets_textual_embeddings('otherdeed_for_otherside', 'bert_embeddings')
otherdeed_embeddings_tensor = torch.tensor(np.vstack(otherdeed_embeddings), dtype=torch.float32)

In [56]:
# clonex data and convert to tensor
clonex_price, clonex_volume = nft_market_data('clonex')
clonex_price_tensor = torch.tensor(clonex_price, dtype=torch.float32)
clonex_volume_tensor = torch.tensor(clonex_volume, dtype=torch.float32)
clonex_embeddings = nft_tweets_textual_embeddings('clonex', 'bert_embeddings')
clonex_embeddings_tensor = torch.tensor(np.vstack(clonex_embeddings), dtype=torch.float32)

ValueError: all the input array dimensions for the concatenation axis must match exactly, but along dimension 1, the array at index 0 has size 769 and the array at index 55 has size 770

In [126]:
# Train LSTM model
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import math

# Prepare dataset
X = bayc_embeddings_tensor  # Shape: (number_of_dates, 769)
y = bayc_price_tensor # Shape: (number_of_dates, 1)

# Reshape the data before normalization
X_numpy = X.numpy()
y_numpy = y.numpy().reshape(-1, 1)

# Split the dataset into training and test sets, use first 80% of the data for training
X_train, X_test, y_train, y_test = train_test_split(X_numpy, y_numpy, test_size=0.2, shuffle=False)

# Normalize the input data
scaler_X = MinMaxScaler()
X_train_normalized = scaler_X.fit_transform(X_train)
X_test_normalized = scaler_X.transform(X_test)

scaler_y = MinMaxScaler()
y_train_normalized = scaler_y.fit_transform(y_train)

# Convert the normalized data back to tensors
X_train = torch.tensor(X_train_normalized, dtype=torch.float32)
X_test = torch.tensor(X_test_normalized, dtype=torch.float32)
y_train = torch.tensor(y_train_normalized, dtype=torch.float32)
y_test = torch.tensor(y_test, dtype=torch.float32)

# Create DataLoader
train_dataset = TensorDataset(X_train, y_train)
train_loader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# Define the LSTM model architecture
class LSTMModel(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers, output_size, dropout_prob=0.2):
        super(LSTMModel, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.lstm = nn.LSTM(input_size, hidden_size, num_layers, batch_first=True)
        self.dropout = nn.Dropout(dropout_prob)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        c0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size)
        out, _ = self.lstm(x, (h0, c0))
        out = self.fc(out[:, -1, :])
        return out

# Initialize the model, loss function, and optimizer
input_size = 769
hidden_size = 32
num_layers = 2
output_size = 1
drop_prob = 0.2
learning_rate = 0.001

model = LSTMModel(input_size, hidden_size, num_layers, output_size, drop_prob)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)

# Train the model with early stopping
num_epochs = 200
early_stop = 10 # Number of epochs to wait before early stopping
best_loss = float('inf')
best_epoch = 0
best_model = None

for epoch in range(num_epochs):
    for i, (embeddings, targets) in enumerate(train_loader):
        embeddings = embeddings.unsqueeze(1)  # Shape: (batch_size, 1, 770)

        # Forward pass
        outputs = model(embeddings)
        loss = criterion(outputs, targets)

        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    if loss.item() < best_loss:
        best_loss = loss.item()
        best_epoch = epoch
        best_model = model.state_dict()
    elif epoch - best_epoch >= early_stop:
        print(f'Early stopping at epoch {epoch+1}')
        break

    print(f'Epoch [{epoch+1}/{num_epochs}], Loss: {loss.item():.4f}')

# Evaluate the model on the test set
model.eval()

with torch.no_grad():
    X_test = X_test.unsqueeze(1)  # Shape: (test_size, 1, 770)
    y_pred_normalized = model(X_test)

    # Inverse transform the normalized predictions to get the unnormalized predictions
    y_pred = scaler_y.inverse_transform(y_pred_normalized.numpy())

# Calculate the mean squared error or any other performance metric
mse = criterion(torch.tensor(y_pred), y_test)
# keep 2 decimal places
# Round mean squared error to two decimal places
mse_rounded = torch.round(mse * 100) / 100

# Print mean squared error with two decimal places
print(f"Mean Squared Error: {mse_rounded:.2f}")

rmse = math.sqrt(mse.item())
print('Root Mean Squared Error:', rmse)

# Add some code to document the results


Epoch [1/200], Loss: 0.3874
Epoch [2/200], Loss: 0.2316
Epoch [3/200], Loss: 0.0379
Epoch [4/200], Loss: 0.0117
Epoch [5/200], Loss: 0.0170
Epoch [6/200], Loss: 0.0050
Epoch [7/200], Loss: 0.0122
Epoch [8/200], Loss: 0.0137
Epoch [9/200], Loss: 0.0078
Epoch [10/200], Loss: 0.0066
Epoch [11/200], Loss: 0.0018
Epoch [12/200], Loss: 0.0885
Epoch [13/200], Loss: 0.0129
Epoch [14/200], Loss: 0.0065
Epoch [15/200], Loss: 0.0036
Epoch [16/200], Loss: 0.0171
Epoch [17/200], Loss: 0.0063
Epoch [18/200], Loss: 0.0150
Epoch [19/200], Loss: 0.0095
Epoch [20/200], Loss: 0.1008
Early stopping at epoch 21
Mean Squared Error: 16.59
Root Mean Squared Error: 4.0725702917356665
