In [3]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

In [4]:
from google.colab import drive
# Mount Google Drive at /content/drive so files stored there are reachable from this session.
# NOTE(review): the dataset is later read from /content/..., not from under /content/drive —
# confirm whether this mount is still needed.
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [36]:
# Load the preprocessed text dataset; the CSV dialect is spelled out explicitly
CSV_PATH = '/content/preprocess_text_Nltk.csv'
df = pd.read_csv(
    CSV_PATH,
    sep=',',
    quotechar='"',
    encoding='utf-8',
)


In [37]:
# Preview the first rows to sanity-check the columns that were loaded
df.head()

Unnamed: 0,text,type,processed_Text
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuters head conservative republica...
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allowed ...
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuters special counsel investigati...
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser geor...
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald tru...


In [38]:
# Count missing values per column
df.isnull().sum()

text                0
type                0
processed_Text    632
dtype: int64

In [39]:
# Drop rows whose 'processed_Text' is NaN (the null count above reports such rows).
# Rebinding instead of `inplace=True` keeps the step chainable and easy to re-run, and
# resetting the index leaves a contiguous 0..n-1 index so row labels match row positions
# in the later cells that slice the frame positionally.
df = df.dropna(subset=['processed_Text']).reset_index(drop=True)

**1. Gensim's Word2Vec:**

In [None]:
!pip install gensim

In [19]:
import gensim.downloader as api

# Load the pre-trained Word2Vec model through gensim's downloader API.
# NOTE(review): the model name suggests 300-dimensional Google News vectors, and the first
# call presumably downloads them — confirm download size and caching before re-running.
word2vec_model = api.load("word2vec-google-news-300")



                                                text  type  \
0  WASHINGTON (Reuters) - The head of a conservat...  True   
1  WASHINGTON (Reuters) - Transgender people will...  True   
2  WASHINGTON (Reuters) - The special counsel inv...  True   
3  WASHINGTON (Reuters) - Trump campaign adviser ...  True   
4  SEATTLE/WASHINGTON (Reuters) - President Donal...  True   

                                      processed_Text  \
0  washington reuters head conservative republica...   
1  washington reuters transgender people allowed ...   
2  washington reuters special counsel investigati...   
3  washington reuters trump campaign adviser geor...   
4  seattlewashington reuters president donald tru...   

                                              vector  
0  [[-0.24316406, 0.0390625, -0.067871094, 0.4746...  
1  [[-0.24316406, 0.0390625, -0.067871094, 0.4746...  
2  [[-0.24316406, 0.0390625, -0.067871094, 0.4746...  
3  [[-0.24316406, 0.0390625, -0.067871094, 0.4746...  
4  [[-0.0520019

In [21]:
# Function to convert text to vector representation
def text_to_vector(text):
    """Return the Word2Vec embedding of every known word in ``text``.

    The text is split on whitespace and words that are not present in
    ``word2vec_model`` are skipped. The result is a list holding one
    embedding per kept word, in order of appearance; it is empty when no
    word is known.
    """
    return [word2vec_model[token] for token in text.split() if token in word2vec_model]

# Embed each cleaned document; every row receives a list of per-word vectors
cleaned_texts = df['processed_Text']
df['vector'] = cleaned_texts.apply(text_to_vector)

# Show the first rows together with the new 'vector' column
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuters head conservative republica...,"[[-0.24316406, 0.0390625, -0.067871094, 0.4746..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allowed ...,"[[-0.24316406, 0.0390625, -0.067871094, 0.4746..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuters special counsel investigati...,"[[-0.24316406, 0.0390625, -0.067871094, 0.4746..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser geor...,"[[-0.24316406, 0.0390625, -0.067871094, 0.4746..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald tru...,"[[-0.052001953, 0.061767578, -0.13671875, -0.1..."


In [22]:
# Re-check for missing values now that the 'vector' column has been added
df.isnull().sum()

text              0
type              0
processed_Text    0
vector            0
dtype: int64

**Doc2Vec model provided by Gensim** (runtime: ~34 min)




In [28]:
from nltk.tokenize import word_tokenize
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# Define a function to tag documents
def tag_docs(texts):
    """Wrap each text in a ``TaggedDocument`` tagged with its position.

    Every text is tokenised with ``word_tokenize``; its single tag is the
    zero-based position of the text within ``texts``. Returns the tagged
    documents as a list, in input order.
    """
    tagged = []
    for position, text in enumerate(texts):
        tagged.append(TaggedDocument(words=word_tokenize(text), tags=[position]))
    return tagged

# Build the tagged training corpus from the cleaned text column
tagged_docs = tag_docs(df['processed_Text'])

# Configure the Doc2Vec model, then build its vocabulary and train it on the tagged corpus
doc2vec_params = dict(vector_size=300, window=5, min_count=1, workers=4, epochs=20)
doc2vec_model = Doc2Vec(**doc2vec_params)
doc2vec_model.build_vocab(tagged_docs)
doc2vec_model.train(
    tagged_docs,
    total_examples=doc2vec_model.corpus_count,
    epochs=doc2vec_model.epochs,
)

# Function to infer vector representations for documents
def infer_vector(text):
    """Tokenise ``text`` with ``word_tokenize`` and return the vector that
    ``doc2vec_model`` infers for those tokens."""
    tokens = word_tokenize(text)
    return doc2vec_model.infer_vector(tokens)

# Infer a Doc2Vec vector for every cleaned document.
# Note: this replaces the 'vector' column written by the Word2Vec cell.
cleaned_texts = df['processed_Text']
df['vector'] = cleaned_texts.apply(infer_vector)

# Show the first rows together with the inferred vectors
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuters head conservative republica...,"[0.08159183, -0.09372088, -0.65823144, -0.3964..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allowed ...,"[0.58030653, -0.6449846, -0.68533283, -0.48252..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuters special counsel investigati...,"[-0.11687033, -0.8204301, 0.5863683, 0.7192275..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser geor...,"[-0.08453747, -0.79260373, -0.8524005, -0.2475..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald tru...,"[0.06517347, 0.33787113, -0.7747259, -0.933711..."


**Bag-of-Words (BoW) using Gensim:**

In [35]:
from gensim import corpora

# Whitespace-tokenise every cleaned document
tokenized_documents = []
for doc in df['processed_Text']:
    tokenized_documents.append(doc.split())

# Build the token -> integer id mapping over the whole corpus
dictionary = corpora.Dictionary(tokenized_documents)

# Convert each tokenised document to its bag-of-words form
bow_corpus = list(map(dictionary.doc2bow, tokenized_documents))

# Keep the BoW representation in the DataFrame (this replaces the existing 'vector' column)
df['vector'] = bow_corpus

# Show the first rows together with the BoW representations
df.head()

Unnamed: 0,text,type,processed_Text,vector
0,WASHINGTON (Reuters) - The head of a conservat...,True,washington reuters head conservative republica...,"[(0, 1), (1, 1), (2, 2), (3, 1), (4, 3), (5, 1..."
1,WASHINGTON (Reuters) - Transgender people will...,True,washington reuters transgender people allowed ...,"[(2, 9), (14, 1), (17, 3), (29, 1), (30, 1), (..."
2,WASHINGTON (Reuters) - The special counsel inv...,True,washington reuters special counsel investigati...,"[(2, 2), (31, 1), (71, 1), (75, 4), (86, 1), (..."
3,WASHINGTON (Reuters) - Trump campaign adviser ...,True,washington reuters trump campaign adviser geor...,"[(5, 1), (7, 1), (60, 2), (67, 1), (75, 1), (9..."
4,SEATTLE/WASHINGTON (Reuters) - President Donal...,True,seattlewashington reuters president donald tru...,"[(7, 2), (18, 2), (19, 1), (21, 6), (30, 1), (..."


**BERT Embeddings with Hugging Face Transformers**

In [40]:
from transformers import BertTokenizer, BertModel
import torch

# Load the pre-trained BERT tokenizer and encoder from the same checkpoint name
BERT_CHECKPOINT = 'bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(BERT_CHECKPOINT)
model = BertModel.from_pretrained(BERT_CHECKPOINT)

# Function to obtain BERT embeddings for text
def get_bert_embeddings(text):
    """Embed ``text`` as the mean of BERT's last-layer hidden states.

    The text is tokenised (with truncation) into PyTorch tensors, passed
    through ``model`` without gradient tracking, and the last hidden state
    is averaged over dimension 1. Size-1 dimensions are squeezed away and
    the result is returned as a numpy array.
    """
    encoded = tokenizer(text, return_tensors='pt', padding=True, truncation=True)

    # Inference only, so skip gradient bookkeeping
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state

    # Average over dimension 1, then squeeze out size-1 dimensions
    pooled = last_hidden.mean(dim=1).squeeze()
    return pooled.numpy()

# Embed every cleaned document with BERT, one text per forward pass
# (this replaces the existing 'vector' column)
df['vector'] = df['processed_Text'].apply(get_bert_embeddings)

# Use the notebook's rich DataFrame display, consistent with the other cells
df.head()

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/48.0 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/570 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/440M [00:00<?, ?B/s]

KeyboardInterrupt: 

In [None]:
import numpy as np

# Define a function to obtain BERT embeddings for a batch of texts
def get_bert_embeddings_batch(texts):
    """Embed a batch of texts as mask-aware, mean-pooled BERT vectors.

    Args:
        texts: iterable of strings (for example a list or a pandas Series slice).

    Returns:
        A 2-D numpy array with one row per input text. The result stays 2-D
        even for a single text, so callers can always ``extend`` a list with
        it and get exactly one vector per text. An empty input yields an
        array with zero rows.
    """
    texts = list(texts)
    if not texts:
        # Nothing to encode; return a zero-row array of the encoder's hidden width
        return torch.empty(0, model.config.hidden_size).numpy()

    # Tokenize the whole batch in ONE call so the tokenizer pads every sequence
    # to a common length and returns the matching attention mask.
    encoded = tokenizer(texts, return_tensors='pt', padding=True, truncation=True)

    # Inference only, so skip gradient bookkeeping
    with torch.no_grad():
        last_hidden = model(**encoded).last_hidden_state

    # Mean-pool over real tokens only: padded positions are zeroed by the mask
    # and excluded from the divisor, so shorter texts are not diluted by padding.
    mask = encoded['attention_mask'].unsqueeze(-1).to(last_hidden.dtype)
    token_sum = (last_hidden * mask).sum(dim=1)
    token_count = mask.sum(dim=1).clamp(min=1)  # guard against division by zero
    embeddings = token_sum / token_count

    # No squeeze(): a one-text batch must remain a (1, hidden) array
    return embeddings.numpy()

# Embed the processed_Text column one batch at a time
batch_size = 100  # Adjust the batch size as needed
total_rows = len(df)
vectors = []
for start_idx in range(0, total_rows, batch_size):
    # The final batch may be shorter than batch_size
    end_idx = min(start_idx + batch_size, total_rows)
    batch_texts = df['processed_Text'].iloc[start_idx:end_idx]
    vectors.extend(get_bert_embeddings_batch(batch_texts))

# Keep one embedding per row in the 'vector' column of the DataFrame
df['vector'] = vectors

# Show the first rows together with the new vectors
df.head()