In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found under it.
input_file_paths = (
    os.path.join(dirname, filename)
    for dirname, _, filenames in os.walk('/kaggle/input')
    for filename in filenames
)
for input_file_path in input_file_paths:
    print(input_file_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the blog authorship corpus CSV from the Kaggle input directory into `df`.
df = pd.read_csv('/kaggle/input/blog-authorship-corpus/blogtext.csv')

In [None]:
# Remove the id, topic, sign and date columns from the frame.
dropped_columns = ['id', 'topic', 'sign', 'date']
df = df.drop(columns=dropped_columns)

In [None]:
# Print the first few rows of `df` for a quick visual check.
print(df.head())

In [None]:
# Save the reduced frame (without the index) to the writable working directory.
# The path is written with a single slash so it matches the spelling used by
# the later cells that read and rewrite this same file.
df.to_csv('/kaggle/working/blogtext.csv', index=False)

In [None]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from multiprocessing import Pool, cpu_count

# Download the NLTK resources: the stop-word list, the 'punkt' data and the
# WordNet corpus (the latter into /usr/share/nltk_data).
# NOTE(review): presumably 'punkt' backs word_tokenize and 'wordnet' backs
# WordNetLemmatizer in the preprocessing cell -- confirm.
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet', '/usr/share/nltk_data')

from nltk.corpus import wordnet
# Extract the downloaded WordNet archive next to the zip file (shell command run from the notebook).
!unzip /usr/share/nltk_data/corpora/wordnet.zip -d /usr/share/nltk_data/corpora/


In [None]:
def remove_special_char(text):
    """Return ``text`` without any character other than ASCII letters, the digits 0-9 and whitespace."""
    disallowed = re.compile(r'[^a-zA-Z0-9\s]')
    return disallowed.sub('', text)
# Lazily-built resources shared by every call to preprocessing(); filled by _nlp_resources().
_PREPROCESSING_CACHE = {}


def _nlp_resources():
    """Return the (stop-word set, lemmatizer) pair, building it on first use only."""
    if not _PREPROCESSING_CACHE:
        _PREPROCESSING_CACHE['stop_words'] = frozenset(stopwords.words('english'))
        _PREPROCESSING_CACHE['lemmatizer'] = WordNetLemmatizer()
    return _PREPROCESSING_CACHE['stop_words'], _PREPROCESSING_CACHE['lemmatizer']


def preprocessing(text):
    """Clean one document for downstream tokenization.

    The text is stripped of special characters with ``remove_special_char``,
    split with ``word_tokenize``, lower-cased, filtered against the English
    stop-word list and lemmatized; the surviving words are joined with single
    spaces.

    Args:
        text: The document to clean. Anything that is not a ``str`` (for
            example a missing value) is treated as an empty document.

    Returns:
        The cleaned text as one space-separated string; ``''`` for non-string
        input.
    """
    if not isinstance(text, str):
        # A non-string cell would otherwise crash inside re.sub and abort the
        # whole chunk; an empty result keeps the row.
        return ''

    # The stop-word set and lemmatizer do not depend on the document, so they
    # are built once per process instead of once per call.
    stop_words, lemmatizer = _nlp_resources()

    lowered_words = (word.lower() for word in word_tokenize(remove_special_char(text)))
    return ' '.join(
        lemmatizer.lemmatize(word) for word in lowered_words if word not in stop_words
    )

def process_chunk(chunk):
    """Replace the ``text`` column of ``chunk`` with its ``preprocessing`` output and return the same frame."""
    cleaned_text = chunk['text'].apply(preprocessing)
    chunk['text'] = cleaned_text
    return chunk

def apply_preprocessing(File):
    """Clean the ``text`` column of a CSV file in parallel and overwrite the file.

    The file is read in chunks of 1000 rows, each chunk is handed to
    ``process_chunk`` in a pool of ``cpu_count()`` worker processes, and the
    concatenated result is written back to the same path without the index.

    Args:
        File: Path of the CSV file to process; it is replaced by the result.
    """
    chunks = pd.read_csv(File, chunksize=1000)

    # The with-block releases the worker processes even when a chunk raises;
    # with explicit close()/join() after map() they were leaked on error.
    with Pool(cpu_count()) as pool:
        processed_chunks = pool.map(process_chunk, chunks)

    processed_df = pd.concat(processed_chunks)
    processed_df.to_csv(File, index=False)

In [None]:
# Clean the text column of the working CSV; the file is rewritten in place.
# Path spelled with a single slash, consistent with the later cells.
apply_preprocessing('/kaggle/working/blogtext.csv')

In [None]:
# Reload the cleaned working CSV and print its first rows.
# Path spelled with a single slash, consistent with the later cells.
df = pd.read_csv('/kaggle/working/blogtext.csv')
print(df.head())


In [None]:
# Count the missing values in each column and print them under a header line.
nan_values = df.isnull().sum()
print("The number of NaN in each column is :", nan_values, sep="\n")

In [None]:
from transformers import BertTokenizer
# Load the pretrained 'bert-base-uncased' tokenizer into the module-level
# `tokenizer` that preprocess_for_bert uses.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

In [None]:

def preprocess_for_bert(text, max_length=128):
    """Convert one document into a fixed-length list of BERT token ids.

    The text is tokenized with the module-level ``tokenizer``, wrapped in
    ``[CLS]`` ... ``[SEP]``, converted to ids and right-padded with 0 up to
    ``max_length``.

    Args:
        text: Document to encode. Non-string values are not encoded.
        max_length: Length of the returned id list, special tokens included.
            Defaults to 128.

    Returns:
        A list of exactly ``max_length`` ints, or ``None`` when ``text`` is
        not a string.

    Raises:
        ValueError: If ``max_length`` is too small to hold the two special
            tokens.
    """
    if max_length < 2:
        raise ValueError("max_length must be at least 2 to hold [CLS] and [SEP]")

    if not isinstance(text, str):
        return None

    # Truncate before adding the special tokens: slicing the finished id list
    # cut the trailing [SEP] off every document longer than the limit.
    tokens = tokenizer.tokenize(text)[:max_length - 2]
    tokens = ['[CLS]'] + tokens + ['[SEP]']

    input_ids = tokenizer.convert_tokens_to_ids(tokens)

    # Right-pad with id 0 so every row has exactly max_length entries.
    return input_ids + [0] * (max_length - len(input_ids))

def process_chunk_for_bert(chunk):
    """Store the ``preprocess_for_bert`` result for each ``text`` value in an ``input_ids`` column and return the same frame."""
    encoded_rows = chunk['text'].apply(preprocess_for_bert)
    chunk['input_ids'] = encoded_rows
    return chunk

def apply_preprocess_for_bert(File):
    """Add BERT token ids to every row of a CSV file in parallel and overwrite the file.

    The file is read in chunks of 1000 rows, each chunk is handed to
    ``process_chunk_for_bert`` in a pool of ``cpu_count()`` worker processes,
    and the concatenated result is written back to the same path without the
    index.

    Args:
        File: Path of the CSV file to process; it is replaced by the result.
    """
    chunks = pd.read_csv(File, chunksize=1000)

    # The with-block releases the worker processes even when a chunk raises;
    # with explicit close()/join() after map() they were leaked on error.
    with Pool(cpu_count()) as pool:
        processed_chunks = pool.map(process_chunk_for_bert, chunks)

    processed_df = pd.concat(processed_chunks)
    processed_df.to_csv(File, index=False)

In [None]:
# Add the `input_ids` column to the working CSV; the file is rewritten in place.
apply_preprocess_for_bert('/kaggle/working/blogtext.csv')

In [None]:
# Reload the working CSV, then count the missing values in each column and
# print them under a header line.
df = pd.read_csv('/kaggle/working/blogtext.csv')
nan_values = df.isnull().sum()
print("The number of NaN in each column is :", nan_values, sep="\n")


In [None]:
# Print the number of rows in the frame.
print(len(df))


In [None]:
# Remove the text column, then discard every row that still has a missing value.
df = df.drop(columns=['text']).dropna()

In [None]:
# Count the missing values in each column and print them under a header line.
nan_values = df.isnull().sum()
print("The number of NaN in each column is :", nan_values, sep="\n")

In [None]:
# Overwrite the working CSV with the current frame (index not written).
df.to_csv('/kaggle/working/blogtext.csv', index=False)

In [None]:
from transformers import BertModel
import torch

In [None]:
# Select the CUDA device when one is available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

In [None]:
# Load the pretrained BERT encoder and ask it to return the hidden states of
# every layer. The checkpoint identifier is hyphenated ('bert-base-uncased'),
# the same one the tokenizer is loaded from; the underscore spelling does not
# name that checkpoint.
model = BertModel.from_pretrained('bert-base-uncased', output_hidden_states=True)


In [None]:
def get_bert_embeddings(input_ids):
    """Embed one sequence (or a batch) of token ids with the module-level ``model``.

    Args:
        input_ids: Token ids as a list or tensor of ints, a batch of such
            rows, or the string form of a list (a list column written with
            ``to_csv`` is read back by ``read_csv`` as text).

    Returns:
        A tensor with one row per input sequence: the mean, over sequence
        positions, of the model's last hidden layer.
    """
    if isinstance(input_ids, str):
        # Local import keeps this block self-contained; literal_eval only
        # accepts Python literals, so the cell is parsed without being executed.
        import ast
        input_ids = ast.literal_eval(input_ids)

    input_ids_tensor = torch.as_tensor(input_ids, dtype=torch.long)
    if input_ids_tensor.dim() == 1:
        # The model is called with a (batch, sequence) tensor; promote a
        # single sequence to a batch of one.
        input_ids_tensor = input_ids_tensor.unsqueeze(0)

    model.eval()

    # no_grad must be *called* to obtain the context manager.
    with torch.no_grad():
        outputs = model(input_ids_tensor)

    hidden_states = outputs.hidden_states
    last_layer_embeddings = hidden_states[-1]

    # NOTE(review): this averages over every position, padding included, and
    # no attention mask is passed to the model -- confirm that is intended.
    mean_embeddings = torch.mean(last_layer_embeddings, dim=1)

    return mean_embeddings

def get_chunk_embeddings(chunks):
    """Add a ``bert_embeddings`` column to one DataFrame chunk.

    Each value of the ``input_ids`` column is passed to
    ``get_bert_embeddings`` and the result is stored as a plain list of
    floats.

    Args:
        chunks: A single DataFrame chunk with an ``input_ids`` column (the
            parameter keeps its plural name for compatibility).

    Returns:
        The same frame, modified in place, with the new column.
    """
    def _embed_row(ids):
        embedding = get_bert_embeddings(ids)
        # Store plain numbers rather than a tensor object, whose printed
        # form is what would otherwise end up in the CSV file.
        return embedding.squeeze(0).tolist()

    # Use the `chunks` parameter (the name `chunk` was never defined here) and
    # run the embedding step, not the tokenization step, on the ids.
    chunks['bert_embeddings'] = chunks['input_ids'].apply(_embed_row)
    return chunks

def apply_embedding_extraction(File):
    """Compute embeddings for every row of a CSV file in parallel and overwrite the file.

    The file is read in chunks of 1000 rows, each chunk is handed to
    ``get_chunk_embeddings`` in a pool of ``cpu_count()`` worker processes,
    and the concatenated result is written back to the same path without the
    index.

    Args:
        File: Path of the CSV file to process; it is replaced by the result.
    """
    chunks = pd.read_csv(File, chunksize=1000)

    # NOTE(review): the workers run the module-level torch model in separate
    # processes -- confirm this is safe and actually faster on the target
    # machine before relying on it.
    # The with-block releases the worker processes even when a chunk raises;
    # with explicit close()/join() after map() they were leaked on error.
    with Pool(cpu_count()) as pool:
        processed_chunks = pool.map(get_chunk_embeddings, chunks)

    processed_df = pd.concat(processed_chunks)
    processed_df.to_csv(File, index=False)

In [None]:
# Run the embedding extraction over the working CSV; the file is rewritten in place.
apply_embedding_extraction('/kaggle/working/blogtext.csv')