# Trading Signal Generation
- applied labels by observing whether the USD/BRL exchange rate increased, decreased, or experienced low volatility (neutral)
- applied a 0.2% threshold to manually determine neutral days (this no longer matters: the approach above has been dropped and we are using Eli's labels instead)
- applied word2vec model to vectorize text and further classify new articles to generate buy/sell/hold signals

Notes (in-progress)
- avoid data-leakage: don't train/test on the same data

Checking Dates in articles

In [1]:
# checking what dates are in our file filled with articles

import re
import pandas as pd

def extract_dates_from_file(file_path):
    """Collect every ``NN/NN/NN`` date-like string found in a text file.

    Parameters
    ----------
    file_path : str
        Path of the file to scan; it is read in full as UTF-8 text.

    Returns
    -------
    pandas.DataFrame
        One column named ``'Date'`` holding each match as a string, in the
        order the matches occur in the file. Repeated dates are kept.
    """
    with open(file_path, 'r', encoding='utf-8') as handle:
        text = handle.read()

    # Two digits, slash, two digits, slash, two digits -- e.g. "09/01/24".
    found = re.findall(r'\d{2}/\d{2}/\d{2}', text)
    return pd.DataFrame(found, columns=['Date'])

# Scan the cleaned news corpus and list every date string found in it.
file_path = 'data/news_corpus_cleaned.txt'
dates_df = extract_dates_from_file(file_path)
display(dates_df)

Unnamed: 0,Date
0,09/01/24
1,10/01/24
2,11/01/24
3,12/01/24
4,15/01/24
5,16/01/24
6,19/01/24
7,22/01/24
8,23/01/24
9,24/01/24


## Organizing article file data into a DataFrame for readability and easier to convert to CSV
- PLEASE SAVE THIS OUTPUTTED DATA AS A CSV
- this is what I did manually for January; it is not exactly reproducible, so be careful with it — I built the DataFrame for February differently
- i need to get every data source into this dataframe format because it's easy to work with

In [None]:
# FINISHED PREPROCESSING JANUARY DATA (now saved as labeled_january_data.csv); keeping this code for reuse later on

# import pandas as pd
# import re
# from datetime import datetime

# # look at the file and make it into a list
# def parse_articles_to_df(file_path):
#     dates = []  # keep the dates
#     articles = []  # keep the articles
    
#     with open(file_path, "r", encoding="utf-8") as file:
#         current_date = None  # keep the date we are looking at right now
#         current_articles = []  # keep the article for the current date
        
#         for line in file:
#             line = line.strip()  # take away spaces from the start and end
#             if line:  # if the line is not empty
#                 if line[2] == "/":  # if the line looks like a date (dd/mm/yy)
#                     # we had a date before, so let's save it with its articles
#                     if current_date:
#                         for article in current_articles:
#                             dates.append(current_date)  # add current date
#                             articles.append(article)  # add article for that date
#                     # use the date as is, don't change it
#                     current_date = line  # keep new date
#                     current_articles = []  # start fresh for new date
#                 else:
#                     # 'line' is the article, keep adding them to the list
#                     current_articles.append(line) 
        
#         if current_date:  # when we are done looking at the file
#             for article in current_articles:  # for all the articles we saved
#                 dates.append(current_date)  # add the date again
#                 articles.append(article)  # add the article again
    
#     df = pd.DataFrame({'date': dates, 'article': articles})  # make table with the dates and articles
#     return df  # display table

# file_path = "data/news_corpus_cleaned.txt"  # where the file is
# df_template = parse_articles_to_df(file_path)  # call function to make the table, called df_template because it can be used for another month

# # show the table to see it
# display(df_template.head())

## January Preprocessing

In [3]:
# Location of the January CSV (per the file name, the labeled January data).
file_path = "data/labeled_january_data.csv"

# Open the file explicitly as UTF-8 text and let pandas parse the open handle.
with open(file_path, "r", encoding="utf-8") as file:
    df_jan = pd.read_csv(file)

In [4]:
import spacy

# spaCy pipeline ('pt_core_news_sm') used to tokenise and lemmatise the articles
nlp = spacy.load('pt_core_news_sm')


def preprocess_text_spacy(text):
    """Return `text` reduced to a space-separated string of lemmas.

    The text is run through the module-level `nlp` pipeline. Tokens flagged
    as stop words, and tokens that are not purely alphabetic (`is_alpha` is
    false), are dropped; the lemma of every remaining token is kept in order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip anything that is a stop word or contains non-letter characters
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)

    # Collapse the surviving lemmas back into one string
    return ' '.join(kept_lemmas)

# Run the spaCy clean-up over every January article and store the result in a
# new column next to the raw text.
df_jan['processed_article'] = df_jan['article'].map(preprocess_text_spacy)

# Show the frame so the processed text can be inspected
display(df_jan)

Unnamed: 0,date,article,label,processed_article
0,09/01/24,"O petróleo testava reação moderada (+0,50%) no...",0,petróleo testar reação moderar pregão asiático...
1,09/01/24,Circularam comentários de que a reunião de Pac...,0,circularam comentário reunião Pacheco líder se...
2,09/01/24,"De qualquer modo, seis senadores estão com a p...",0,modo senador presença confirmar Único indicado...
3,09/01/24,"Nos EUA, sai a balança comercial de novembro (...",-1,EUA sair balança comercial novembro Fed boy Mi...
4,09/01/24,"O investidor cumpre a espera pela 5ªF, que pro...",-1,investidor cumprir espera prometer emoção CPI ...
...,...,...,...,...
1096,31/01/24,Emissão é de apenas uma série e já tem valor d...,0,Emissão série definir revelar executivo
1097,31/01/24,"ROMI teve lucro líquido de R$ 51,340 milhões n...",0,ROMI lucro líquido milhão queda
1098,31/01/24,ENEVA. Citi manteve recomendação de compra par...,0,ENEVA Citi manter recomendação compra ação ban...
1099,31/01/24,OI. Nova versão do plano de recuperação judici...,0,OI versão plano recuperação judicial concluir ...


## February Preprocessing

In [7]:
# Location of the February CSV (per the file name, the labeled February data).
file_path = "data/labeled_february_data.csv"

# Open the file explicitly as UTF-8 text and let pandas parse the open handle.
with open(file_path, "r", encoding="utf-8") as file:
    df_feb = pd.read_csv(file)

In [8]:
import spacy

# NOTE(review): this cell repeats the spaCy setup from the January section
# (same model, same function); consider defining it once and reusing it.
# spaCy pipeline ('pt_core_news_sm') used to tokenise and lemmatise the articles
nlp = spacy.load('pt_core_news_sm')


def preprocess_text_spacy(text):
    """Return `text` reduced to a space-separated string of lemmas.

    The text is run through the module-level `nlp` pipeline. Tokens flagged
    as stop words, and tokens that are not purely alphabetic (`is_alpha` is
    false), are dropped; the lemma of every remaining token is kept in order.
    """
    kept_lemmas = []
    for tok in nlp(text):
        # Skip anything that is a stop word or contains non-letter characters
        if tok.is_stop or not tok.is_alpha:
            continue
        kept_lemmas.append(tok.lemma_)

    # Collapse the surviving lemmas back into one string
    return ' '.join(kept_lemmas)

# Run the spaCy clean-up over every February article and store the result in a
# new column next to the raw text.
df_feb['processed_article'] = df_feb['article'].map(preprocess_text_spacy)

# Show the frame so the processed text can be inspected
display(df_feb)

Unnamed: 0,date,article,label,processed_article
0,01/02/2024,… O PMI industrial chinês medido pelo setor pr...,0,PMI industrial chinês meder setor privado fica...
1,01/02/2024,"… O texto do BC, praticamente igual ao anterio...",0,texto BC praticamente igual anterior dezembro ...
2,01/02/2024,"… Depois de baixar a Selic para 11,25%, o BC n...",1,baixar Selic BC mexeu quase comunicado parágra...
3,01/02/2024,"… O Copom não encurtou o horizonte de cortes, ...",1,Copom encurtar horizonte corte manter barra al...
4,01/02/2024,… Isso significa que março continua dado e que...,1,significar março continuar dar maio reservar s...
...,...,...,...,...
914,28/02/2024,CPFL PAULISTA. Conselho aprovou 14ª emissão de...,0,CPFL PAULISTA aprovar emissão debêntur montant...
915,28/02/2024,CPFL PIRATININGA. Conselho aprovou 16ª emissão...,0,CPFL PIRATININGA aprovar emissão debêntur mont...
916,28/02/2024,"UNIPAR informou a renúncia de Antonio Rabello,...",0,UNIPAR informar renúncia Antonio Rabello diret...
917,28/02/2024,GRUPO MATEUS concluiu venda de cinco imóveis p...,0,MATEUS concluir venda imóvel fundo TRX real mi...


## Applying the Word2Vec approach

In [5]:
from gensim.models import Word2Vec

# NOTE(review): `df_articles` was never assigned in any cell of this notebook,
# so a clean top-to-bottom run stopped here with a NameError. It is bound to
# the January frame on the assumption that January is the training corpus
# (the execution counts show this cell ran right after the January
# preprocessing cell) -- please confirm.
# It is an alias, not a copy: the `article_vector` column added to df_jan in
# the January vectorization cell is visible through df_articles as well, and
# columns later written to df_articles also appear on df_jan.
df_articles = df_jan

# Word2Vec takes each document as a list of tokens. The processed articles are
# lemmas joined by single spaces, so a plain split() recovers the token list.
tokenized_articles = df_articles['processed_article'].apply(lambda x: x.split()).tolist()

model = Word2Vec(sentences=tokenized_articles,
                 vector_size=100,   # dimensionality of the word embeddings
                 window=5,          # context window size
                 min_count=5,       # minimum frequency of words to consider
                 workers=4,         # CPUs for training
                 sg=0)              # Use CBOW (0) or Skip-Gram (1)

# Persist the trained embeddings so they can be reloaded without retraining
model.save("word2vec.model")

All articles for trading day "d" receive the same label, based on whether the USD/BRL exchange rate rose, fell, or stayed within a ±0.2% band on trading day "d":
- Increase: +1
- Decrease: -1
- Neutral: 0

Result: this approach failed; the cell below is kept, disabled, for reference only.

In [None]:
'''
# Word2Vec approach by applying +1, -1, 0 labels to ALL articles per trading day whether the BRL went up down or stayed around the same based on a .2% threshold.
# 0 for neutral
# +1 for up
# -1 for down

labels = {
    '09/01/24': 1,
    '10/01/24': 0,
    '11/01/24': -1,
    '12/01/24': -1, 
    '15/01/24': 1,
    '16/01/24': 1,
    '19/01/24': 0,
    '22/01/24': 1,
    '23/01/24': -1,
    '24/01/24': -1,
    '25/01/24': -1,
    '26/01/24': -1,
    '29/01/24': 1,
    '30/01/24': 1,
    '31/01/24': 1
}

# add lablels to df
df_template['label'] = df_template['date'].map(labels) # called df_template because I might use this in the future for any month

'''


### Vectorization Helper Function

In [7]:
import numpy as np
from scipy.spatial.distance import cdist
import itertools  # needed this for combinations

# Turn one processed article into a single vector by averaging the vectors of
# its known words.
def get_article_vector(article, model):
    """Return the mean word vector of `article`.

    Parameters
    ----------
    article : str
        Whitespace-separated tokens.
    model
        Object exposing `wv` (supports `word in model.wv` and
        `model.wv[word]`) and `vector_size`.

    Returns
    -------
    numpy.ndarray
        Element-wise mean of the vectors of all tokens present in
        `model.wv`. If no token is present, a zero vector of length
        `model.vector_size` is returned.
    """
    embeddings = model.wv
    known = [embeddings[tok] for tok in article.split() if tok in embeddings]

    # Nothing recognised: fall back to an all-zero vector
    if not known:
        return np.zeros(model.vector_size)

    return np.mean(known, axis=0)

January Vectorization

In [None]:
# Embed every processed January article as one averaged Word2Vec vector.
# `model` is forwarded to get_article_vector as its second positional argument.
df_jan['article_vector'] = df_jan['processed_article'].apply(get_article_vector, args=(model,))

# Keep the January labels as a plain array
labels = df_jan['label'].values

# Dump to CSV so the data can be inspected before it is fed into the model
df_jan.to_csv('vectorized_january_data.csv')

display(df_jan.head())

February Vectorization

In [None]:
# Embed every processed February article as one averaged Word2Vec vector,
# using the `model` object already in scope.
df_feb['article_vector'] = df_feb['processed_article'].apply(lambda x: get_article_vector(x, model))

# NOTE: rebinding `labels` replaces the January array assigned earlier.
labels = df_feb['label'].values

# Write February to its own file. This cell previously reused
# 'vectorized_january_data.csv', which silently overwrote the January export.
df_feb.to_csv('vectorized_february_data.csv')

display(df_feb.head())

## Multinomial Logistic Regression Model with Custom Word2Vec Model

Next steps:
- Apply softmax to improve accuracy
- Train on January, test on first 2 weeks of February
    - Or grab equal amounts of each label and make sure to test on NEW articles since the training data may come from spread out dates

In [None]:
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import numpy as np
import pandas as pd

# --- Features and targets ---------------------------------------------------
# df_articles must already carry the 'article_vector' and 'label' columns.
# Stack the per-article vectors into one 2-D matrix (one row per article).
X = np.vstack(df_articles['article_vector'].values)
y = df_articles['label'].values

# --- Hold-out split ---------------------------------------------------------
# Random 80/20 split over individual articles, seeded for repeatability.
# NOTE(review): articles from the same trading day can land on both sides of
# this split; see the "Next steps" list above about testing on new dates.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# --- Model ------------------------------------------------------------------
clf = LogisticRegression(
    penalty='l2',
    C=1.0,
    class_weight='balanced',
    solver='lbfgs',
    max_iter=1000,
)
clf.fit(X_train, y_train)

# --- Evaluation on the held-out articles ------------------------------------
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)

# Per-class metrics as a dict, reshaped so each class/average is one row
classification_rep = classification_report(y_test, y_pred, output_dict=True)
classification_rep_df = pd.DataFrame(classification_rep).T

# --- Predictions for every article ------------------------------------------
# This covers the training rows too, so the column is for inspection only and
# is not an out-of-sample result.
X_all = np.vstack(df_articles['article_vector'].values)
predicted_labels = clf.predict(X_all)
df_articles['predicted_label'] = predicted_labels

# --- Report -----------------------------------------------------------------
print("Accuracy:", accuracy)
display("Classification Report:")
display(classification_rep_df)

display(df_articles)

In [None]:
# Confusion Matrix

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.metrics import confusion_matrix

# Fix the class order explicitly so the matrix is always 3x3 and its rows and
# columns line up with the tick names below. Without `labels=`, scikit-learn
# only includes classes that occur in y_test or y_pred, so a class missing
# from both would shrink the matrix and shift the names onto the wrong cells.
class_values = [-1, 0, 1]
class_names = ['Negative', 'Neutral', 'Positive']

# y_test is actual labels and y_pred is predicted labels
cm = confusion_matrix(y_test, y_pred, labels=class_values)

# heatmap: rows are actual classes, columns are predicted classes
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues', xticklabels=class_names, yticklabels=class_names)

plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.title('Confusion Matrix')

# display plot
plt.show()