In [8]:
import re
from gensim.models import KeyedVectors
from pdf2image import convert_from_path
import pytesseract
from PIL import Image, ImageDraw
import cv2
import numpy as np
import easyocr

import nltk
from nltk.corpus import wordnet


from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import cosine_similarity

from transformers import BertTokenizer, BertModel
import torch

In [3]:
def ocr_pdf_to_text(pdf_path, roi):
    # Convert PDF pages to images
    pages = convert_from_path(pdf_path)

    # Extract text from each image using Tesseract OCR
    text = ""
    reader = easyocr.Reader(['en'])  # Initialize once outside the function

    for page in pages:
        page = preprocess_image(page)

        # Create a drawing context
        draw = ImageDraw.Draw(page)
        
        # Dimensions
        width, height = page.size
        
        draw.rectangle([0, 0, roi[0], height], fill="white")
        draw.rectangle([roi[2], 0, width, height], fill="white")
        draw.rectangle([0, 0, width, roi[1]], fill="white")
        draw.rectangle([0, roi[3], width, height], fill="white")

        text += easyocr_ocr(page, reader) #pytesseract.image_to_string(page)
    return text

def easyocr_ocr(image, reader):
    # Convert PIL Image to NumPy array
    image_np = np.array(image)
    
    # Perform OCR
    result = reader.readtext(image_np)
    
    # Extract and return the detected text
    text = " ".join([item[1] for item in result])
    return text

def preprocess_image(image):
    if image.mode != 'RGB':
        image = image.convert('RGB')
    # Convert PIL Image to NumPy array
    image_np = np.array(image)
    
    # Convert to grayscale
    gray = cv2.cvtColor(image_np, cv2.COLOR_BGR2GRAY)
    
    # Apply binary thresholding
    _, thresh = cv2.threshold(gray, 150, 255, cv2.THRESH_BINARY_INV)
    
    # Convert back to PIL Image and return
    return Image.fromarray(thresh)

# Function to process the extracted text
def process_text(text):
# Add a space before every capital letter
    text_with_spaces = re.sub(r'(?<=[a-z])(?=[A-Z])', ' ', text)
    
    cleaned_content = re.sub(r'(?<![a-z])[-](?![a-z])|[^a-zA-Z\s-]', '', text_with_spaces)    
    cleaned_words = cleaned_content.split()
    lowercase_words = [word for word in cleaned_words if not word.isupper()]
    filtered_words = [word for word in lowercase_words if len(word) > 1]
    prepositions = {"of", "and", "or"}
    filtered_words = [word for word in filtered_words if word not in prepositions]

    start_index = filtered_words.index("abandoned")
    words_from_abandoned = filtered_words[start_index:]
    final_words = [word for word in words_from_abandoned if word.islower()]

    return final_words

def embeddings(words):
    word_vectors = KeyedVectors.load_word2vec_format('GoogleNews-vectors-negative300.bin.gz', binary=True)

    # Generate embeddings for the unique words
    embeddings = {word: word_vectors[word] for word in words if word in word_vectors}

    return embeddings


In [4]:
# Extract text from PDF
pdf_text = ocr_pdf_to_text("trait_name_a_psycho-lexical_study_removed.pdf",  (0, 0, 312, 1656))


with open('output_text_file', 'w', encoding='utf-8') as file:
    file.write(pdf_text)



In [5]:
# Process the extracted text
processed_words = process_text(pdf_text)
print(len(processed_words))
# print(processed_words)

4249


In [65]:
def find_synonyms(word_list):
    synonym_groups = []
    word_set = set(word_list)  # Convert to set for faster lookup
    
    for word in word_list:
        synonyms = set()
        for syn in wordnet.synsets(word):
            for lemma in syn.lemmas():
                # Only add synonyms that are in the original word list
                if lemma.name() in word_set:
                    synonyms.add(lemma.name())
        
        if synonyms:
            synonym_groups.append(list(synonyms))
    
    return synonym_groups

def merge_overlapping_groups(groups):
    merged_groups = []
    for group in groups:
        # Check if the new group overlaps with an existing group
        for merged in merged_groups:
            if set(group) & set(merged):  # Check for overlapping words
                merged.extend(group)  # Merge groups
                merged = list(set(merged))  # Remove duplicates
                break
        else:
            # If no overlapping group is found, add it as a new group
            merged_groups.append(group)
    return merged_groups

initial_synonym_groups = find_synonyms(processed_words)
merged_synonym_groups = merge_overlapping_groups(initial_synonym_groups)

print(len(merged_synonym_groups))



1176


In [None]:


word_embeddings = embeddings(processed_words)

# Convert the embeddings into a matrix format
matrix_embeddings = np.array(list(word_embeddings.values()))

# Max number of clusters to test
max_k = 1000
inertia = []  

# K-Means clustering and evaluating loss
for k in range(1, max_k+1):
    kmeans = KMeans(n_clusters=k, random_state=42).fit(matrix_embeddings)
    inertia.append(kmeans.inertia_)
    
# Plotting number of clusters vs loss
plt.plot(range(1, max_k+1), inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()


In [81]:
def convert_clusters_to_embeddings(cluster, embedding_dict):
    # Initialize a zero vector for words without an embedding
    zero_vector = np.zeros_like(next(iter(embedding_dict.values())))
    
    # Convert words to embeddings, using a zero vector if the word has no embedding
    return [
        [embedding_dict.get(word, zero_vector) for word in group] 
        for group in cluster
    ]

# Get unique words from the cluster
# Generate embeddings for the unique words

# Convert words in the cluster to embeddings
cluster1_embeddings = convert_clusters_to_embeddings(merged_synonym_groups, word_embeddings)

In [76]:
kmeans_chosen_k = KMeans(n_clusters=1176, random_state=42).fit(matrix_embeddings)
labels = kmeans_chosen_k.labels_

clusters = {i: [] for i in range(1176)}
for i, label in enumerate(labels):
    clusters[label].append(matrix_embeddings[i])

  super()._check_params_vs_input(X, default_n_init=10)


In [99]:
# comparing wordnet and kmeans clustering



def calculate_centroid(embeddings):
    """Calculate the centroid of a set of embeddings, handling empty lists."""
    if len(embeddings) == 0:
        print("Warning: empty embeddings list encountered.")
        return np.zeros((300,))  # Assuming embedding size is 300
    return np.mean(embeddings, axis=0)

def calculate_group_centroids(cluster):
    """Calculate the centroid of each group of embeddings within a cluster."""
    group_centroids = []
    for i, group in enumerate(cluster):
        centroid = calculate_centroid(group)
        if np.all(centroid == 0):  # Check if centroid is a zero vector
            print(f"Warning: Group {i} resulted in a zero-vector centroid.")
        group_centroids.append(centroid)
    return group_centroids

def calculate_similarity(cluster1, cluster2):
    """Calculate the cosine similarity between two clusters of embedding groups."""
    # Calculate the centroids of the embedding groups
    centroids1 = calculate_group_centroids(cluster1)
    centroids2 = calculate_group_centroids(cluster2)
    
    # Calculate the overall centroid for each cluster
    overall_centroid1 = calculate_centroid(centroids1)
    overall_centroid2 = calculate_centroid(centroids2)
    
    # Calculate and return the cosine similarity between the two overall centroids
    return cosine_similarity([overall_centroid1], [overall_centroid2])[0, 0]

cluster_wordnet = {i: embedding for i, embedding in enumerate(cluster1_embeddings)}
cluster_kmeans = clusters

count1 = 0
count2 = 0

for i in range(1176):
    count1 += len(cluster_wordnet[i])
    count2 += len(cluster_kmeans[i])
print(count1, count2)

In [111]:
print(type(cluster_wordnet[0][0][0]))
print(type(cluster_kmeans[0][0][0]))
print(len(word_embeddings))

<class 'numpy.float32'>
<class 'numpy.float32'>
1575


In [7]:
# BERT


def bert_embeddings(words):
    # Load pre-trained model/tokenizer
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
    model = BertModel.from_pretrained('bert-base-uncased')

    # Ensure model is in eval mode
    model.eval()

    embeddings = {}

    for word in words:
        # Tokenize input word and return as PyTorch tensor
        inputs = tokenizer(word, return_tensors="pt")

        # Get embeddings
        with torch.no_grad():
            outputs = model(**inputs)
        
        # Use the embeddings from the last hidden-state
        # (size: batch_size, sequence_length, model_hidden_dimension)
        embeddings[word] = outputs.last_hidden_state[0, 0].numpy()

    return embeddings


word_embeddings = bert_embeddings(processed_words)

# Convert the embeddings into a matrix format
matrix_embeddings = np.array(list(word_embeddings.values()))

# Max number of clusters to test
max_k = 1000
inertia = []  

# K-Means clustering and evaluating loss
for k in range(1, max_k+1, 20):
    print(k)
    kmeans = KMeans(n_clusters=k, random_state=42).fit(matrix_embeddings)
    inertia.append(kmeans.inertia_)
    
# Plotting number of clusters vs loss
plt.plot(range(1, max_k+1), inertia, marker='o')
plt.title('Elbow Method For Optimal k')
plt.xlabel('Number of clusters')
plt.ylabel('Inertia')
plt.show()

Downloading (…)okenizer_config.json: 100%|██████████| 28.0/28.0 [00:00<00:00, 87.1kB/s]
Downloading (…)solve/main/vocab.txt: 100%|██████████| 232k/232k [00:00<00:00, 1.97MB/s]
Downloading (…)/main/tokenizer.json: 100%|██████████| 466k/466k [00:00<00:00, 8.48MB/s]
Downloading (…)lve/main/config.json: 100%|██████████| 570/570 [00:00<00:00, 2.33MB/s]
Downloading model.safetensors: 100%|██████████| 440M/440M [01:03<00:00, 6.93MB/s] 


NameError: name 'KMeans' is not defined