# Most Common Words 

The aim of this notebook is to provide the means to keep the reading level of the game dialogue and various written elements to a fairly accessible reading level. An exception is made for the necessity of providing words specific to chemistry and the sciences. The last section of this notebook shows how the various created dictionaries are used to profile samples of text to see if they are using language that is beyond the scope of the goal. 

### Imports

In [1]:
import kagglehub
import pandas as pd
import numpy as np
from nltk.corpus import words
from collections import Counter
import PyPDF2

  from .autonotebook import tqdm as notebook_tqdm


### Download Dataset

In [2]:
# Download latest version
path = kagglehub.dataset_download("rtatman/english-word-frequency")

print("Path to dataset files:", path)
filename = path + "\\" + "unigram_freq.csv"

df = pd.read_csv(filename)
#df.head()
#df.tail()

#"would" in words.words()

Path to dataset files: C:\Users\Kay\.cache\kagglehub\datasets\rtatman\english-word-frequency\versions\1


### Make New Column

In [3]:
all_words = words.words()
word_dict = { word: "" for word in all_words}
my_suspect_list = list(df['word'])
bools_list = [ True if x in word_dict else False for x in my_suspect_list ]
#print(bools_list[0:10])
Counter(bools_list)
df['is_word'] = bools_list
#df.head()

df = df.drop(df[df.is_word == False].index)
#set(df['is_word'])

df.drop('is_word', axis=1,inplace=True)
df.reset_index(drop=True,inplace=True)
#df.tail()

### Make Limited Dictionary

In [4]:
word_list_limited = list(df[0:12000]['word'])
word_list = list(df[0:23000]['word'])
limited_dict = { word: "" for word in word_list_limited}
larger_dict = { word: "" for word in word_list}
full_dict = word_dict

print(len(limited_dict))
print('bailey' in limited_dict)

12000
True


### Make Chemistry Dictionary

In [5]:
 
def make_suspect_list (my_list):
    my_list = [word.lower() for word in my_list]
    my_list = list(set(my_list))
    suspect_words_l = [word if not word in limited_dict else "" for word in my_list]
    suspect_words_l = list(set(suspect_words_l))
    # printing the data 
    #print(my_list) 
    #print(suspect_words_l)
    return suspect_words_l

def make_non_suspect_list_from_larger_dict(my_list):
    my_list = [word.lower() for word in my_list]
    my_list = list(set(my_list))
    non_suspect_words_l = [word if word in larger_dict else "" for word in my_list]
    non_suspect_words_l = list(set(non_suspect_words_l))
    return non_suspect_words_l

def make_suspect_list_from_full_dict(my_list):
    my_list = [word.lower() for word in my_list]
    my_list = list(set(my_list))
    suspect_words_l = [word if not word in full_dict else "" for word in my_list]
    suspect_words_l = list(set(suspect_words_l))
    return suspect_words_l

def make_non_suspect_list_from_full_dict(my_list):
    my_list = [word.lower() for word in my_list]
    my_list = list(set(my_list))
    non_suspect_words_l = [word if word in full_dict else "" for word in my_list]
    non_suspect_words_l = list(set(non_suspect_words_l))
    return non_suspect_words_l

def flatten(xss):
    return [x for xs in xss for x in xs]

def open_and_split_txtfile(myfilename):
    # opening the file in read mode 
    my_file = open(myfilename, "r") 
  
    # reading the file 
    data = my_file.read() 
  
    # replacing end of line('/n') with ' ' and 
    # splitting the text it further when '.' is seen. 
    data = data.replace('\n', ' ')
    data = data.replace(',', '')
    data = data.replace("\'", '')
    data = data.replace("\"", '')
    data = data.replace('.', '')
    data = data.replace('(', '')
    data = data.replace(')', '')
    my_list = data.split(" ") 
    my_file.close()
    
    return my_list

def open_txtfile(myfilename):
        # opening the file in read mode 
    my_file = open(myfilename, "r") 
  
    # reading the file 
    data = my_file.read() 
    my_file.close()
    my_list = data
    return my_list

In [6]:
pages = []
pdf_file = "..\\Chemistry Books\\Chemistry2e-WEB.pdf"
reader = PyPDF2.PdfReader(pdf_file)
number_of_pages = len(reader.pages)
for page_number in range(number_of_pages):   
    page = reader.pages[page_number].extract_text().split(" ")  # Extract page wise text then split based on spaces as required by you
    pages.append(page)

In [7]:
symbols = ['- \n', ' \n', '\n', '- ']

for c in (chr(i) for i in range(33,32 +33)):
    symbols.append(c)

symbols.remove('-')

pages = []
pdf_file = "..\\Chemistry Books\\Chemistry2e-WEB.pdf"
reader = PyPDF2.PdfReader(pdf_file)
number_of_pages = len(reader.pages)
for page_number in range(number_of_pages):   
    page = reader.pages[page_number]
    page = page.extract_text()
    for symbol in symbols: 
        page = page.replace(symbol, "")
    page = page.split(" ")  # Extract page wise text then split based on spaces as required by you
    pages.append(page)

In [8]:
book_list = []

book_list = flatten(pages)
book_list = list(set(book_list))
suspect_l = make_suspect_list(book_list)
chemistry_words_l = make_non_suspect_list_from_full_dict(suspect_l)
chemistry_words_l.sort(key=len)
chemistry_words_l = [word if len(word) > 3 else "" for word in chemistry_words_l]
#with open("Chemistry_Words.txt", "w") as output:
    #output.write(str(chemistry_words_l))

#output.close()

In [9]:
chemistry_manual_l = open_and_split_txtfile("Chemistry_Words_Manually_Refined.txt")
len(chemistry_manual_l)

924

In [10]:
chem_dict = { word: "" for word in chemistry_manual_l}

### Make Sixth Grade Words Dictionary

In [11]:
sixthGradeWords_l = open_and_split_txtfile("1st through 6th grade vocabulary.txt")
sixthGradeWords_l = list(set(sixthGradeWords_l))
len(sixthGradeWords_l)

6516

In [12]:
middleschool_words_l = make_non_suspect_list_from_full_dict(sixthGradeWords_l)
middleschool_words_l.sort(key=len)

In [13]:
print(len(middleschool_words_l))

6087


In [14]:
middleschool_dict = { word: "" for word in middleschool_words_l}

### Make Text Sample Tester

In [15]:
def textTester(sample_text_list):
    
    suspect_words_l = []
    sample_text_list = [word.lower() for word in sample_text_list]
    sample_text_list = list(set(sample_text_list))

    for word in sample_text_list:
        if (not word in limited_dict):
             if (not word in chem_dict):
                if (not word in middleschool_dict): 
                    suspect_words_l.append(word)

    suspect_words_l = list(set(suspect_words_l))
    return suspect_words_l

In [16]:
sample_text_l = open_and_split_txtfile("sample_text_from_wikipedia")
print("Original text length:",  len(sample_text_l))
suspects_l = textTester(sample_text_l)
print("Number of suspect words:", len(suspects_l))
print("Here are some of the suspects:", suspects_l[0:10])

Original text length: 211
Number of suspect words: 38
Here are some of the suspects: ['fields', 'bonds', 'changes', 'areas', 'occupies', 'rocks', 'compounds', 'aspects', 'evolved', 'explains']


In [17]:
#with open("candidate_words.txt", "w") as output:
#    output.write(str(suspects_l))

#output.close()