# Text Representation

## a. BOW with TF-IDF

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Four short documents used as the toy corpus
corpus = [
    "This is the first document.",
    "This document is the second document.",
    "And this is the third one.",
    "Is this the first document?",
]

# Fit the vectorizer on the corpus and build the document-term matrix
# (one row per document, one column per vocabulary term)
vectorizer = TfidfVectorizer()
X = vectorizer.fit_transform(corpus)

# Densify the matrix and label every column with the term it scores
term_names = vectorizer.get_feature_names_out()
df_tfidf = pd.DataFrame(data=X.toarray(), columns=term_names)

# Show the TF-IDF weights
print(df_tfidf)


        and  document     first        is       one    second       the  \
0  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   
1  0.000000  0.687624  0.000000  0.281089  0.000000  0.538648  0.281089   
2  0.511849  0.000000  0.000000  0.267104  0.511849  0.000000  0.267104   
3  0.000000  0.469791  0.580286  0.384085  0.000000  0.000000  0.384085   

      third      this  
0  0.000000  0.384085  
1  0.000000  0.281089  
2  0.511849  0.267104  
3  0.000000  0.384085  


## b. N-Gram Language Model

In [2]:
from collections import Counter

# Input sentence for the n-gram demo
text = "This is a sample text for n-gram generation."

# Whitespace tokenization (punctuation stays attached to its token)
words = text.split()

# Pair every token with its successor to obtain the bigrams (n = 2)
bigrams = list(zip(words, words[1:]))

# Tally how many times each bigram occurs
bigram_freq = Counter(bigrams)

# Show the bigram counts
print(bigram_freq)


Counter({('This', 'is'): 1, ('is', 'a'): 1, ('a', 'sample'): 1, ('sample', 'text'): 1, ('text', 'for'): 1, ('for', 'n-gram'): 1, ('n-gram', 'generation.'): 1})


## c. Word2Vec

In [3]:
from gensim.models import Word2Vec

# Tiny training corpus for the Word2Vec demo
sentences = [
    "This is the first sentence.",
    "This is the second sentence.",
    "And here is the third one.",
    "Finally, this is the fourth sentence.",
]

# Lower-case each sentence and split it on whitespace
tokenized_sentences = [line.lower().split() for line in sentences]

# Hyperparameters: 10-dimensional vectors, context window of 2,
# keep every word (min_count=1), CBOW architecture (sg=0)
w2v_params = dict(vector_size=10, window=2, min_count=1, sg=0)

# Train the Word2Vec model
model = Word2Vec(sentences=tokenized_sentences, **w2v_params)

# Look up the learned embedding of one vocabulary word
vector = model.wv['this']

# Show the embedding
print('Vector for the word "this":', vector)


Vector for the word "this": [-0.07511582 -0.00930042  0.09538119 -0.07319167 -0.02333769 -0.01937741
  0.08077437 -0.05930896  0.00045162 -0.04753734]


## d. GloVe

In [4]:
import numpy as np

# NOTE: this cell does not load real GloVe embeddings. It only simulates the
# *shape* of one (a 100-dimensional float vector) with uniform random values
# in [0, 1), so the numbers carry no semantic meaning.
GLOVE_DEMO_DIM = 100   # dimensionality of the simulated embedding
GLOVE_DEMO_SEED = 42   # fixed seed so Restart & Run All reproduces the output

# Use a dedicated, seeded generator instead of the unseeded global RNG;
# the vector is then identical on every run and independent of other cells.
rng = np.random.default_rng(GLOVE_DEMO_SEED)
example_vector = rng.random(GLOVE_DEMO_DIM)  # Simulated vector

# Display a simulated GloVe vector
print('Simulated GloVe vector for the word "example":', example_vector)


Simulated GloVe vector for the word "example": [0.09312298 0.43565348 0.16277451 0.31816155 0.24202945 0.9663202
 0.18652879 0.50754475 0.76576405 0.541292   0.6411781  0.23555393
 0.73269788 0.88517636 0.40502918 0.74080666 0.06440041 0.07715568
 0.35285397 0.49851457 0.51884199 0.94859039 0.37045403 0.54435933
 0.37984847 0.24439363 0.66733243 0.38932977 0.82259585 0.50469728
 0.45763156 0.85748527 0.88885336 0.50380596 0.21207641 0.88564121
 0.99633261 0.42620915 0.03789517 0.17119019 0.63111214 0.85959954
 0.09062758 0.62673221 0.50655351 0.19723588 0.29270621 0.6304541
 0.05360115 0.07622406 0.95428345 0.75503653 0.90855527 0.96239998
 0.63205802 0.09038089 0.1545848  0.90271416 0.52334789 0.63811695
 0.31613469 0.56645462 0.2451818  0.52771764 0.06403677 0.48227191
 0.50302776 0.01300329 0.11802857 0.33257802 0.11818651 0.24544976
 0.73283304 0.7234168  0.89214343 0.55006008 0.34962082 0.2013627
 0.51557133 0.08972122 0.17256836 0.49316844 0.99052534 0.19728438
 0.61826455 0.1401

## e. FastText

In [5]:
from gensim.models import FastText

# Tiny training corpus for the FastText demo
sentences = [
    "This is the first sentence.",
    "This is the second sentence.",
    "And here is the third one.",
    "Finally, this is the fourth sentence.",
]

# Lower-case each sentence and split it on whitespace
tokenized_sentences = [line.lower().split() for line in sentences]

# Hyperparameters: 10-dimensional vectors, context window of 2,
# keep every word (min_count=1), CBOW architecture (sg=0)
fasttext_params = dict(vector_size=10, window=2, min_count=1, sg=0)

# Train the FastText model
model = FastText(sentences=tokenized_sentences, **fasttext_params)

# Look up the learned embedding of one vocabulary word
vector = model.wv['this']

# Show the embedding
print('Vector for the word "this":', vector)


Vector for the word "this": [-0.02639857 -0.01558776 -0.00070657 -0.00016564 -0.02406672  0.01902237
  0.00485064  0.01075732 -0.00732209 -0.01498174]


## f. Sentence Embedding Technique: Doc2Vec

In [6]:
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Raw documents keyed by the tag that will identify each one in the model
tagged_texts = {
    'D1': 'This is the first document.',
    'D2': 'This document is the second document.',
    'D3': 'And this is the third one.',
    'D4': 'Is this the first document?',
}

# Wrap each whitespace-tokenized document together with its tag
documents = [
    TaggedDocument(words=body.split(), tags=[tag])
    for tag, body in tagged_texts.items()
]

# Train the Doc2Vec model: 10-dimensional vectors, window of 2,
# keep every word (min_count=1), 40 training epochs
model = Doc2Vec(
    documents,
    vector_size=10,
    window=2,
    min_count=1,
    epochs=40,
)

# Fetch the learned vector of the document tagged "D1"
vector = model.dv['D1']

# Show the document vector
print('Vector for the document "D1":', vector)


Vector for the document "D1": [-0.05208231 -0.05845232 -0.1017203   0.08560619  0.03617087  0.00102675
 -0.10051455 -0.05322951 -0.09929331  0.01917398]


## g. Transformer - BERT

In [7]:
from transformers import BertTokenizer, BertModel
import torch

# Fetch the pre-trained tokenizer first, then the matching encoder weights
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

# Turn the sentence into PyTorch tensors the model can consume
text = "This is a sample text for BERT embeddings."
encoded_input = tokenizer(text, return_tensors='pt')

# Forward pass with gradient tracking disabled
with torch.no_grad():
    output = model(**encoded_input)

# Take the hidden state at sequence position 0 (the [CLS] token)
# and drop the size-1 dimensions left over from the slice
first_token_states = output.last_hidden_state[:, 0, :]
cls_embedding = first_token_states.squeeze()

# Show the sentence-level embedding
print('BERT embedding for the sentence:', cls_embedding)


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

BERT embedding for the sentence: tensor([-3.4644e-01, -3.9823e-01, -3.4008e-01, -3.1049e-01, -3.3000e-01,
        -4.9033e-01,  2.1121e-01,  3.3549e-01,  1.8564e-02,  1.0879e-01,
        -2.7178e-01, -2.3501e-01, -3.1839e-01,  1.7870e-01,  2.3321e-01,
         2.0576e-01, -2.6735e-01,  5.2788e-01,  2.1023e-01, -4.3585e-03,
         6.2078e-02, -6.4946e-01, -1.7716e-01, -5.2473e-01,  1.8183e-01,
        -2.8081e-01, -1.2931e-01, -6.8570e-01, -3.0290e-01,  2.4773e-02,
        -1.4969e-01,  4.1658e-01, -8.6840e-02, -3.4362e-01,  5.7227e-01,
        -9.3132e-02,  2.5063e-01, -4.2000e-02,  7.3433e-01,  2.2131e-01,
        -2.5798e-01, -5.3677e-02,  5.3894e-01,  2.3923e-01,  2.4403e-01,
        -3.7884e-01, -3.0143e+00, -2.6396e-01, -5.1378e-01, -5.0800e-01,
        -3.5355e-01, -4.6622e-02,  4.7334e-01,  3.3344e-01, -7.2791e-02,
         1.3479e-01, -2.9829e-01,  5.4175e-02,  3.1048e-01,  1.7167e-01,
         1.7450e-01,  2.0310e-02,  3.9370e-02,  5.6207e-02,  8.8363e-02,
         5.2469e-0