# Trading Signal Generation

In [2]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
from sklearn.manifold import TSNE
import os

# Never truncate long cell contents (e.g. full article text) when displaying DataFrames.
pd.options.display.max_colwidth = None

## Preprocessing

In [12]:
# Load the labeled January articles. pandas opens the path itself, so no
# manual file handle is needed; the encoding is still stated explicitly.
file_path = "data/processed/labeled_january_data.csv"
df_jan = pd.read_csv(file_path, encoding="utf-8")

In [13]:
# Load the labeled February articles. pandas opens the path itself, so no
# manual file handle is needed; the encoding is still stated explicitly.
file_path = "data/processed/labeled_february_data.csv"
df_feb = pd.read_csv(file_path, encoding="utf-8")

In [14]:
import spacy

# Load spaCy's Portuguese (PT) 'pt_core_news_sm' pipeline once, at cell level,
# so that preprocess_text_spacy can reuse the same object for every article.
nlp = spacy.load('pt_core_news_sm')

#preprocessing
def preprocess_text_spacy(text):
    """
    Clean one article with the module-level spaCy pipeline `nlp`.

    Stop words and non-alphabetic tokens are dropped; every remaining token
    is replaced by its lemma.

    :param text: Raw article text.
    :return: The kept lemmas joined into a single space-separated string.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip stop words and anything that is not purely alphabetic.
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)

    # Back to one string so the result fits in a single DataFrame cell.
    return ' '.join(kept_lemmas)

# Preprocess every article in both the January and the February dataframe,
# storing the cleaned text in a new 'preprocessed_article' column.
for monthly_df in (df_jan, df_feb):
    monthly_df['preprocessed_article'] = monthly_df['article'].apply(preprocess_text_spacy)


## Applying the Word2Vec approach

In [18]:
from gensim.models import Word2Vec

def train_word2vec(df_list, vector_size=300, window=5, min_count=5, epochs=20, sg=1):
    """
    Train a Word2Vec model using the preprocessed articles from multiple dataframes.

    Each 'preprocessed_article' value is expected to be a space-separated string
    of tokens (or an already tokenized list/tuple). Strings are split into token
    lists before training: Word2Vec needs an iterable of token lists, and a bare
    string would be iterated character by character, producing character vectors
    instead of word vectors. Missing values (e.g. NaN) and empty articles are skipped.

    :param df_list: List of dataframes containing a 'preprocessed_article' column.
    :param vector_size: Dimensionality of word vectors.
    :param window: Maximum distance between current and predicted word.
    :param min_count: Ignores words with total frequency lower than this.
    :param epochs: Number of training iterations.
    :param sg: Skip-gram method (1) or CBOW (0).
    :return: Trained Word2Vec model.
    """
    # Combine tokenized articles from all dataframes
    tokenized_articles = []
    for df in df_list:
        for article in df['preprocessed_article']:
            if isinstance(article, str):
                tokens = article.split()
            elif isinstance(article, (list, tuple)):
                tokens = list(article)
            else:
                # NaN / None: nothing to train on for this row.
                continue
            if tokens:
                tokenized_articles.append(tokens)

    # Train Word2Vec model
    word2vec_model = Word2Vec(
        sentences=tokenized_articles,
        vector_size=vector_size,
        window=window,
        min_count=min_count,
        sg=sg,
        workers=4,
        epochs=epochs
    )
    return word2vec_model

# Train Word2Vec model on both datasets
word2vec_model = train_word2vec([df_jan, df_feb])

# Save the model. The target directory is created first so that saving does
# not fail on a fresh checkout where 'models/' does not exist yet.
model_path = "models/word2vec_combined.model"
os.makedirs(os.path.dirname(model_path), exist_ok=True)
word2vec_model.save(model_path)
print(f"Word2Vec model trained on combined datasets and saved to {model_path}.")

Word2Vec model trained on combined datasets and saved to models/word2vec_combined.model.


## Visualising the distribution of the Target variable
- Helps us spot any class imbalance in the labels.
- Good to keep track of.
- TODO: move this section into the preprocessing file eventually.

In [None]:
# Bar chart of how many January articles carry each label.
os.makedirs("results", exist_ok=True)  # savefig does not create missing directories
fig, ax = plt.subplots()
sns.countplot(x='label', data=df_jan, ax=ax)
ax.set_title('Label Distribution in January Dataset')
fig.savefig("results/label_distribution_january.png")
plt.show()

In [None]:
# Bar chart of how many February articles carry each label.
os.makedirs("results", exist_ok=True)  # savefig does not create missing directories
fig, ax = plt.subplots()
sns.countplot(x='label', data=df_feb, ax=ax)
ax.set_title('Label Distribution in February Dataset')
fig.savefig("results/label_distribution_february.png")
plt.show()

In [None]:
from sklearn.manifold import TSNE

# t-SNE: project the per-article embedding vectors down to 2D.
# NOTE(review): no visible cell creates the 'embedding' column, so it must be
# added to df_feb before this cell runs — confirm where it comes from.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embeddings_2d = tsne.fit_transform(np.vstack(df_feb['embedding']))

# Add 2D coordinates to the DataFrame
df_feb['embedding_2d_x'] = embeddings_2d[:, 0]
df_feb['embedding_2d_y'] = embeddings_2d[:, 1]

# Scatter plot of the 2D projection, coloured by label
os.makedirs("results", exist_ok=True)  # savefig does not create missing directories
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='embedding_2d_x', y='embedding_2d_y', hue='label',
    data=df_feb, palette='viridis', legend='full', ax=ax
)
ax.set_title('t-SNE Visualization of February Dataset Embeddings')
fig.savefig("results/tsne_visualization_february.png")
plt.show()

In [None]:
# t-SNE: project the per-article embedding vectors down to 2D.
# NOTE(review): no visible cell creates the 'embedding' column, so it must be
# added to df_jan before this cell runs — confirm where it comes from.
tsne = TSNE(n_components=2, random_state=42, perplexity=30)
embeddings_2d = tsne.fit_transform(np.vstack(df_jan['embedding']))

# Add 2D coordinates to the DataFrame
df_jan['embedding_2d_x'] = embeddings_2d[:, 0]
df_jan['embedding_2d_y'] = embeddings_2d[:, 1]

# Scatter plot of the 2D projection, coloured by label
os.makedirs("results", exist_ok=True)  # savefig does not create missing directories
fig, ax = plt.subplots(figsize=(10, 6))
sns.scatterplot(
    x='embedding_2d_x', y='embedding_2d_y', hue='label',
    data=df_jan, palette='viridis', legend='full', ax=ax
)
ax.set_title('t-SNE Visualization of January Dataset Embeddings')
fig.savefig("results/tsne_visualization_january.png")
plt.show()

In [None]:
# January

import pandas as pd

file_path = 'data/processed/labeled_january_data.csv'
# Read into a separate name: reassigning df_jan here would throw away the
# columns added to it earlier in the notebook (e.g. 'preprocessed_article').
df_jan_raw = pd.read_csv(file_path)

column_data = df_jan_raw['label']
counts = column_data.value_counts()

# .get(..., 0) avoids a KeyError when a class has no rows at all.
count_1 = counts.get(1, 0)
count_minus_1 = counts.get(-1, 0)
count_0 = counts.get(0, 0)

print("January\n")
print(f"Count of 1: {count_1}")
print(f"Count of -1: {count_minus_1}")
print(f"Count of 0: {count_0}")
print(f"Total Number of Articles: {count_1 + count_minus_1 + count_0}")

In [None]:
# February

import pandas as pd

file_path = 'data/processed/labeled_february_data.csv'
df = pd.read_csv(file_path)

column_name = 'label'
column_data = df[column_name]

counts = column_data.value_counts()

# .get(..., 0) avoids a KeyError when a class has no rows at all.
count_1 = counts.get(1, 0)
count_minus_1 = counts.get(-1, 0)
count_0 = counts.get(0, 0)


print("February\n")
print(f"Count of 1: {count_1}")
print(f"Count of -1: {count_minus_1}")
print(f"Count of 0: {count_0}")
print(f"Total Number of Articles: {count_1 + count_minus_1 + count_0}")

## Multinomial Logistic Regression Model with Custom Word2Vec Model

Task: 
- Train on January, test on the first 2 weeks of February, for each of the following setups:
    - Train / test with 3 classes (+1, 0, -1), yielding a 3 x 3 confusion matrix
    - Train / test with 2 classes (+1, -1), yielding a 2 x 2 confusion matrix
- Apply softmax to improve accuracy if poor

In [6]:
import numpy as np
import pandas as pd
from gensim.models import Word2Vec
from sklearn.naive_bayes import MultinomialNB
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import classification_report

In [None]:
# Confusion Matrix

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# y_test is actual labels and y_pred is predicted labels.
# The label order is pinned so the matrix is always 3 x 3 and its rows/columns
# line up with the tick names, even when a class is absent from y_test and y_pred.
# Assumes labels are encoded as -1 / 0 / 1 — TODO confirm against the model cell.
class_labels = [-1, 0, 1]
class_names = ['Negative', 'Neutral', 'Positive']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# heatmap
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

# display plot
plt.show()

In [None]:
# Confusion Matrix

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# y_test is actual labels and y_pred is predicted labels.
# The label order is pinned so the matrix is always 2 x 2 and its rows/columns
# line up with the tick names; samples with any other label (e.g. 0) are
# left out of the matrix instead of silently adding a third, unlabeled row.
# Assumes labels are encoded as -1 / 1 — TODO confirm against the model cell.
class_labels = [-1, 1]
class_names = ['Negative', 'Positive']
cm = confusion_matrix(y_test, y_pred, labels=class_labels)

# heatmap
fig, ax = plt.subplots()
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names, ax=ax)

ax.set_xlabel('Predicted')
ax.set_ylabel('Actual')
ax.set_title('Confusion Matrix')

# display plot
plt.show()