# Find total vocabulary for the LLM

In [55]:
#imports 
import pandas as pd
import numpy as np
import re # used in vocab_cleaner

In [3]:
# Read the input CSV into a DataFrame. The path is relative, so the file
# must sit in the notebook's working directory.
data = pd.read_csv("2deep_output.csv")

In [9]:
# Preview the text column as strings. Note that astype('str') turns missing
# values into the literal string 'nan' (visible in the displayed output).
extractedtext = data['ExtractedText'].astype('str')
extractedtext

0       Stay connected to Grounds no matter where you ...
1       Everything is made of something. In Materials ...
2       October 20, 2022•By\n\n\t\t\t\t\t             ...
3       Learn more about our degree and certificate pr...
4                                                     nan
                              ...                        
1193    A full-time, 21-month curriculum and a global ...
1194    Javascript is currently not supported, or is d...
1195    In accordance with section 23.1-409 of the Cod...
1196    Learn more about our degree and certificate pr...
1197    Make a Gift  Before declaring amajor in Sociol...
Name: ExtractedText, Length: 1198, dtype: object

In [51]:
#create function to feed in dataset, split all words in each row's response, and output the set of vocabulary in the dataset
def vocab_extractor(dataset, a = 1):
    '''
    Gather the full vocabulary for a given dataset.

    Every non-missing entry of the 'ExtractedText' column is split on
    whitespace and all resulting tokens are collected into a single set.

    input:
        - dataset - DataFrame with an 'ExtractedText' column
        - a - unused; kept only so existing calls that pass it keep working
    output:
        - vocab - set of unique whitespace-separated tokens in the dataset
    '''
    # Drop missing rows *before* casting to str: astype('str') would turn NaN
    # into the literal text 'nan' and add it to the vocabulary as a fake word.
    extractedtext = dataset['ExtractedText'].dropna().astype('str')
    vocab = set()
    # for each row, split on whitespace and merge the tokens into the total vocab
    for text in extractedtext:
        vocab.update(text.split())
    return vocab

In [52]:
# Build the raw vocabulary: the set of whitespace-separated tokens found in the loaded data
vocab = vocab_extractor(data)

# Clean vocabulary

In [56]:
#create function to parse through vocab and remove things that start/end with punctuation or quotes 
def vocab_cleaner(vocab):
    '''
    Strip punctuation from the beginning and end of every word.

    Requires the `re` module to be imported.

    input:
       - vocab - an iterable (e.g. a set) of vocab words
    output:
       - cleaned_vocab - a set of unique vocab words that don't start/end with
         punctuation; words made up only of punctuation are dropped
    '''
    # \W on its own never matches '_' (Python treats it as a word character),
    # so the underscore is listed explicitly to be stripped like other punctuation.
    edge_punct = re.compile(r'^[\W_]+|[\W_]+$')
    cleaned_vocab = set()
    for word in vocab:
        cleaned = edge_punct.sub('', word)  # remove leading/trailing punctuation
        if cleaned:  # skip words that were nothing but punctuation
            cleaned_vocab.add(cleaned)
    return cleaned_vocab

In [58]:
# Trim punctuation from the edges of every raw token, then display the cleaned set
new_vocab = vocab_cleaner(vocab)
new_vocab

{'in-state',
 'Variants',
 'Wingfield',
 'publication',
 'ProgramsBA',
 'ENROLLMENT',
 'AlgebraORMATH',
 'Aquatics',
 'problÃ¨mes',
 'Bernard',
 'BBCGTransformaÃ§Ã£o',
 'LectureBiology',
 'AirbnbÂ',
 'CoursesESL',
 'transcriptthrough',
 'WorkshopEvolutionary',
 'Battery15',
 'referee',
 'atcharlottesville@solidcore.coto',
 '3180',
 'passage',
 'transcend',
 '繁體中文',
 'nothing',
 'discontinue',
 'linkedin',
 'suitability',
 'Ð¾Ð´Ð¸Ñ',
 'ofResearch',
 'APPLICANTSMASTER',
 'ourGift',
 'hotel/conference',
 'doctrine',
 'Amount',
 'deprive',
 'Clean',
 'what’s',
 'likely',
 'CatholicismCredits',
 'pertinence',
 'modeled',
 'physician’s',
 'entwerfenâ\x80¢1',
 'dietary',
 'afetam',
 'Overviewguide',
 'gymnasium',
 'High-temperature',
 '2025Forging',
 'outcomes.PROGRAM',
 'Ã©tÃ',
 'summer.certificatecollege-of-arts-and-sciencesschool-of-continuing-amp-amp-professional-studiesarts-and-humanities',
 'entered',
 'Institute',
 'SIS)for',
 'linear',
 'walk',
 'IBMâ\x80¢14',
 'seguida',
 'meaningful