In [16]:
from time import time
from collections import Counter
from pathlib import Path
import pandas as pd
import numpy as np
from numpy.linalg import norm
from scipy.spatial.distance import cdist, cosine

import matplotlib.pyplot as plt
from matplotlib.ticker import FuncFormatter
import seaborn as sns

from gensim.models import Word2Vec, KeyedVectors
from gensim.scripts.glove2word2vec import glove2word2vec

from sklearn.decomposition import IncrementalPCA

In [2]:
from gensim.models import KeyedVectors
from gensim.models import FastText

# Path to FastText pre-trained Wikipedia model
fasttext_wiki_file = 'data/fasttext/wiki-news-300d-1M.vec'

# Load pre-trained FastText model (in Gensim format)
print("Loading pre-trained FastText model...")
fasttext_model = KeyedVectors.load_word2vec_format(fasttext_wiki_file, binary=False)
print("Pre-trained FastText model loaded.")


Loading pre-trained FastText model...
Pre-trained FastText model loaded.


In [6]:
# Example: Prepare a custom corpus (can be from your domain-specific dataset)
custom_corpus = [
    ['this', 'is', 'a', 'sample', 'sentence'],
    ['we', 'are', 'fine-tuning', 'the', 'fasttext', 'model'],
    ['domain', 'specific', 'data', 'helps', 'improve', 'performance'],
    ['stock', 'market', 'crash', 'leads', 'to', 'recession'],
    ['equity', 'prices', 'are', 'rising', 'due', 'to', 'inflation'],
    ['capital', 'gains', 'are', 'taxable', 'in', 'many', 'countries'],
    ['bull', 'market', 'is', 'typically', 'associated', 'with', 'growth'],
    ['bear', 'market', 'refers', 'to', 'declining', 'asset', 'prices'],
    ['dividends', 'are', 'a', 'portion', 'of', 'profits', 'distributed', 'to', 'shareholders'],
    ['bond', 'yields', 'are', 'inversely', 'proportional', 'to', 'bond', 'prices'],
    ['hedge', 'funds', 'use', 'leverage', 'to', 'maximize', 'returns'],
    ['credit', 'default', 'swap', 'protects', 'against', 'debt', 'default'],
    ['derivative', 'instruments', 'derive', 'value', 'from', 'underlying', 'assets'],
    ['central', 'banks', 'control', 'monetary', 'policy', 'through', 'interest', 'rates'],
    ['technical', 'analysis', 'involves', 'charting', 'stock', 'price', 'movements'],
    ['candlestick', 'patterns', 'are', 'used', 'to', 'predict', 'market', 'trends'],
    ['moving', 'averages', 'smooth', 'out', 'price', 'data', 'for', 'trend', 'identification'],
    ['relative', 'strength', 'index', 'measures', 'overbought', 'or', 'oversold', 'conditions'],
    ['fibonacci', 'retracement', 'helps', 'identify', 'potential', 'support', 'levels'],
    ['breakout', 'traders', 'focus', 'on', 'stocks', 'breaking', 'through', 'resistance'],
    ['stop', 'loss', 'orders', 'limit', 'potential', 'losses', 'on', 'trades'],
    ['limit', 'orders', 'allow', 'traders', 'to', 'set', 'buy', 'or', 'sell', 'price', 'points'],
    ['volume', 'analysis', 'is', 'used', 'to', 'confirm', 'market', 'momentum'],
    ['day', 'trading', 'involves', 'buying', 'and', 'selling', 'stocks', 'within', 'the', 'same', 'day'],
    ['swing', 'trading', 'captures', 'short', 'to', 'medium', 'term', 'market', 'moves'],
    ['price', 'to', 'earnings', 'ratio', 'is', 'a', 'valuation', 'metric', 'for', 'stocks'],
    ['market', 'capitalization', 'refers', 'to', 'the', 'total', 'value', 'of', 'a', 'company’s', 'outstanding', 'shares'],
    ['blue', 'chip', 'stocks', 'are', 'considered', 'stable', 'and', 'profitable', 'long-term', 'investments'],
    ['penny', 'stocks', 'are', 'low-priced', 'stocks', 'with', 'high', 'volatility'],
    ['initial', 'public', 'offering', 'occurs', 'when', 'a', 'company', 'goes', 'public'],
    ['short', 'selling', 'involves', 'borrowing', 'shares', 'to', 'sell', 'in', 'anticipation', 'of', 'price', 'declines'],
    ['margin', 'trading', 'allows', 'investors', 'to', 'borrow', 'money', 'to', 'buy', 'stocks'],
    ['options', 'contracts', 'give', 'traders', 'the', 'right', 'but', 'not', 'the', 'obligation', 'to', 'buy', 'or', 'sell', 'assets'],
    ['call', 'option', 'contracts', 'allow', 'the', 'holder', 'to', 'buy', 'at', 'a', 'specific', 'price'],
    ['put', 'option', 'contracts', 'allow', 'the', 'holder', 'to', 'sell', 'at', 'a', 'specific', 'price'],
    ['earnings', 'season', 'is', 'when', 'companies', 'release', 'their', 'quarterly', 'financial', 'results'],
    ['economic', 'indicators', 'such', 'as', 'GDP', 'and', 'unemployment', 'rates', 'affect', 'market', 'behavior'],
    ['market', 'sentiment', 'reflects', 'the', 'overall', 'attitude', 'of', 'investors', 'toward', 'a', 'particular', 'market'],
    ['algorithmic', 'trading', 'uses', 'computer', 'programs', 'to', 'execute', 'trades', 'based', 'on', 'predefined', 'criteria'],
    ['high-frequency', 'trading', 'involves', 'making', 'millions', 'of', 'trades', 'in', 'fractions', 'of', 'a', 'second'],
    ['risk', 'management', 'is', 'key', 'to', 'protecting', 'capital', 'in', 'volatile', 'markets'],
    ['portfolio', 'diversification', 'reduces', 'the', 'risk', 'by', 'investing', 'in', 'varied', 'assets'],
    ['beta', 'measures', 'a', 'stock’s', 'volatility', 'relative', 'to', 'the', 'overall', 'market']
]
# Ensure the corpus is tokenized properly (if needed)
def tokenize_sentence(sentence):
    if isinstance(sentence, str):
        return sentence.lower().split()  # Tokenize if it's a string
    return sentence  # Return unchanged if it's already a list

# Apply tokenization only if necessary
custom_corpus = [tokenize_sentence(sentence) for sentence in custom_corpus]

# Check the result
print(custom_corpus)


# Apply tokenization to your custom corpus (if it isn't tokenized yet)
custom_corpus = [tokenize_sentence(sentence) for sentence in custom_corpus]


[['this', 'is', 'a', 'sample', 'sentence'], ['we', 'are', 'fine-tuning', 'the', 'fasttext', 'model'], ['domain', 'specific', 'data', 'helps', 'improve', 'performance'], ['stock', 'market', 'crash', 'leads', 'to', 'recession'], ['equity', 'prices', 'are', 'rising', 'due', 'to', 'inflation'], ['capital', 'gains', 'are', 'taxable', 'in', 'many', 'countries'], ['bull', 'market', 'is', 'typically', 'associated', 'with', 'growth'], ['bear', 'market', 'refers', 'to', 'declining', 'asset', 'prices'], ['dividends', 'are', 'a', 'portion', 'of', 'profits', 'distributed', 'to', 'shareholders'], ['bond', 'yields', 'are', 'inversely', 'proportional', 'to', 'bond', 'prices'], ['hedge', 'funds', 'use', 'leverage', 'to', 'maximize', 'returns'], ['credit', 'default', 'swap', 'protects', 'against', 'debt', 'default'], ['derivative', 'instruments', 'derive', 'value', 'from', 'underlying', 'assets'], ['central', 'banks', 'control', 'monetary', 'policy', 'through', 'interest', 'rates'], ['technical', 'analy

In [13]:
from gensim.models import FastText

# Ensure the custom_corpus is a list of lists, where each inner list is a tokenized sentence
if not isinstance(custom_corpus, list) or not all(isinstance(sentence, list) for sentence in custom_corpus):
    raise ValueError("custom_corpus must be a list of tokenized sentences (list of lists).")

# Initialize FastText model
fasttext_model = FastText(vector_size=300)

# Build vocabulary from the custom corpus
print("Building vocabulary...")
fasttext_model.build_vocab(corpus_iterable=custom_corpus)

# Fine-tune the FastText model on the custom corpus
print("Fine-tuning the FastText model...")
fasttext_model.train(corpus_iterable=custom_corpus, total_examples=len(custom_corpus), epochs=5)
print("Fine-tuning complete.")


Building vocabulary...
Fine-tuning the FastText model...
Fine-tuning complete.


In [17]:
analogies_path = Path('data', 'analogies-en.txt')

In [18]:
# Save the fine-tuned model
fasttext_model.save('fine_tuned_fasttext.model')
print("Fine-tuned FastText model saved.")

# Load the saved model (if needed)
fine_tuned_model = FastText.load('fine_tuned_fasttext.model')

# Evaluate the fine-tuned model
def eval_analogies_fasttext(model, vocab=30000):
    analogies_result = model.wv.evaluate_word_analogies(analogies_path, restrict_vocab=vocab, case_insensitive=True)
    correct = sum([len(section['correct']) for section in analogies_result[1]])
    incorrect = sum([len(section['incorrect']) for section in analogies_result[1]])
    return pd.DataFrame([{
        'total_correct': correct,
        'total_incorrect': incorrect,
        'total_attempted': correct + incorrect,
        'accuracy': correct / (correct + incorrect) if (correct + incorrect) > 0 else 0
    }])

# Evaluate fine-tuned FastText model
fine_tuned_result = eval_analogies_fasttext(fine_tuned_model, vocab=100000)
print(fine_tuned_result)


Fine-tuned FastText model saved.
   total_correct  total_incorrect  total_attempted  accuracy
0              0                0                0         0


In [20]:
# Assuming you have a function `evaluate_analogy_accuracy(model)` that returns the accuracy
def evaluate_analogy_accuracy(model):
    # Implement your analogy evaluation logic here
    # For simplicity, returning a mock result
    return {'accuracy': 0.75}  # Example value, replace with actual evaluation result

# Evaluate pre-trained FastText model
fasttext_wiki_result = evaluate_analogy_accuracy(pretrained_fasttext_model)  # Use your pre-trained model

# Evaluate fine-tuned FastText model
fine_tuned_result = evaluate_analogy_accuracy(fasttext_model)  # Your fine-tuned model

# Combine results
results_df = pd.DataFrame({
    'Model': ['Pre-trained FastText', 'Fine-tuned FastText'],
    'Accuracy': [fasttext_wiki_result['accuracy'], fine_tuned_result['accuracy']]
})

# Plot the results
import matplotlib.pyplot as plt
import seaborn as sns

plt.figure(figsize=(8, 5))
sns.barplot(x='Model', y='Accuracy', data=results_df)
plt.title('Pre-trained vs Fine-tuned FastText Model Analogy Task Accuracy')
plt.ylim(0, 1)
plt.ylabel('Accuracy')
plt.show()


NameError: name 'pretrained_fasttext_model' is not defined