In [1]:
import pandas as pd
import re

import spacy
from textstat.textstat import textstatistics
# NOTE(review): the pipeline returned by this call is discarded; the functions
# below call spacy.load themselves. Presumably kept as an early check that the
# model is installed — confirm.
spacy.load("en_core_web_sm")


def avg_syllables_per_word(text):
    """Return the average number of syllables per word in ``text``.

    The syllable total comes from ``syllables_count(text)`` and the word
    total from ``word_count(text)``.

    Returns:
        The ratio as a float, or 0.0 when ``word_count`` reports no words
        (for example for an empty string) instead of raising
        ``ZeroDivisionError``.
    """
    words = word_count(text)
    if not words:
        # Nothing to average over.
        return 0.0
    return float(syllables_count(text)) / float(words)

def syllables_count(word):
    """Return the syllable count that textstat reports for ``word``.

    Thin wrapper around ``textstatistics().syllable_count``. Textstat is a
    Python package that calculates statistics from text to determine
    readability, complexity and grade level
    (https://pypi.python.org/pypi/textstat).
    """
    stats = textstatistics()
    return stats.syllable_count(word)


def break_sentences(text):
    """Split ``text`` into sentences using spaCy's sentence segmentation.

    See https://spacy.io/usage/spacy-101 for background on the pipeline.

    The ``en_core_web_sm`` pipeline is loaded on the first call and stored on
    the function object, so later calls reuse it instead of reloading the
    model each time.

    Returns:
        A list of the sentence spans yielded by ``doc.sents``.
    """
    nlp = getattr(break_sentences, "_nlp", None)
    if nlp is None:
        nlp = spacy.load('en_core_web_sm')
        break_sentences._nlp = nlp
    doc = nlp(text)
    return list(doc.sents)
 
def avg_sentence_length(text):
    """Return the average sentence length of ``text`` in words.

    Computed as ``word_count(text) / sentence_count(text)``.

    Returns:
        The ratio as a float, or 0.0 when ``sentence_count`` reports no
        sentences (for example for an empty string) instead of raising
        ``ZeroDivisionError``.
    """
    sentences = sentence_count(text)
    if not sentences:
        # No sentences means there is nothing to average over.
        return 0.0
    return float(word_count(text) / sentences)
 
def word_count(text):
    """Return the number of tokens in ``text``.

    The text is split with ``break_sentences`` and every token of every
    sentence is counted.
    """
    return sum(len(list(sentence)) for sentence in break_sentences(text))
    
def sentence_count(text):
    """Return how many sentences ``break_sentences`` finds in ``text``."""
    return len(break_sentences(text))


def difficult_words(text):
    """Return the number of distinct difficult words in ``text``.

    A token counts as difficult when ``syllables_count`` reports at least two
    syllables for it and it is not in spaCy's English stop-word list. The
    stop-word test is done on the lower-cased token so that capitalised stop
    words (for example at the start of a sentence) are excluded too.

    Distinct words are collected by their exact token text, so different
    spellings or capitalisations of a word are counted separately.
    """
    # Only the stop-word list is needed here; the text itself is parsed once,
    # by break_sentences. The list is cached on the function object so the
    # model is not reloaded on every call.
    stop_words = getattr(difficult_words, "_stop_words", None)
    if stop_words is None:
        stop_words = spacy.load('en_core_web_sm').Defaults.stop_words
        difficult_words._stop_words = stop_words

    diff_words_set = set()
    for sentence in break_sentences(text):
        for token in sentence:
            word = str(token)
            if word.lower() in stop_words:
                continue
            if syllables_count(word) >= 2:
                diff_words_set.add(word)

    return len(diff_words_set)

def poly_syllable_count(text):
    """Return the number of polysyllabic tokens in ``text``.

    ``text`` is converted with ``str`` and split with ``break_sentences``.
    A token is counted when its length is greater than 2 and
    ``syllables_count`` reports three or more syllables for it.
    """
    tokens = [
        token
        for sentence in break_sentences(str(text))
        for token in sentence
    ]
    return sum(
        1
        for token in tokens
        if len(token) > 2 and syllables_count(str(token)) >= 3
    )


# Word-rarity lookup table. fetch_rarity reads its 'word', 'frequency',
# 'zscore' and 'count' columns as well as its row index.
df = pd.read_csv('word_rarity_list.csv')
# Number of entries in the 'word' column; rare_finder scales its percentage
# bounds by this size.
dict_size = df['word'].size

def strip_character(a_string):
    """Replace every character that is not an ASCII letter, '-' or a space
    with a single space and return the result."""
    return re.sub(r"[^a-zA-Z- ]", ' ', a_string)

def remove_spaces(a_string):
    """Collapse each run of consecutive spaces in ``a_string`` into one space."""
    space_run = re.compile(' +')
    return space_run.sub(' ', a_string)

def remove_apos_s(a_string):
    """Delete every occurrence of apostrophe-s ("'s") from ``a_string``."""
    return a_string.replace("'s", '')

def clean_input_text(input_text):
    """Normalise raw text before it is split into tokens.

    Steps, in order: lower-case the text, remove apostrophe-s, replace
    non-essential characters with spaces (``strip_character``), collapse
    runs of spaces, and strip leading and trailing whitespace.
    """
    lowered = input_text.lower()
    without_apos_s = remove_apos_s(lowered)
    essential_only = strip_character(without_apos_s)
    single_spaced = remove_spaces(essential_only)
    return single_spaced.strip()

def tokenize(cleaned_text):
    """Return the set of unique words in ``cleaned_text``.

    The text is split on single spaces, so an empty input yields a set
    containing one empty string.
    """
    return set(cleaned_text.split(" "))

def flesch_reading_ease(text):
    """Return the Flesch Reading Ease score of ``text``.

    Implements the Flesch formula:
        Reading Ease score = 206.835 - (1.015 × ASL) - (84.6 × ASW)
    Here,
        ASL = average sentence length (number of words
              divided by number of sentences)
        ASW = average word length in syllables (number of syllables
              divided by number of words)

    ``text`` is converted with ``str`` first. ASL comes from
    ``avg_sentence_length`` and ASW from ``avg_syllables_per_word``.
    """
    text = str(text)
    asl = avg_sentence_length(text)
    asw = avg_syllables_per_word(text)
    return 206.835 - float(1.015 * asl) - float(84.6 * asw)

def fetch_rarity(input_text, type):
    """Look up a rarity value for each word in ``input_text``.

    Args:
        input_text: iterable of words to look up in the module-level ``df``.
        type: which value to return. 'frequency', 'zscore' and 'count' select
            the column of that name; any other value selects the row index
            of the matching entry.

    Returns:
        A list of ``(word, value)`` tuples in iteration order. Words with no
        match in ``df['word']`` get the value 0. When several rows match,
        the first one is used.
    """
    value_columns = ('frequency', 'zscore', 'count')
    results = []
    for word in input_text:
        matches = df[df['word'] == word]
        if matches.size == 0:
            value = 0
        elif type in value_columns:
            value = matches[type].values[0]
        else:
            value = matches.index.values[0]
        results.append((word, value))
    return results

def fetch_mean(tuple_list):
    """Return the mean of the second element of every tuple in ``tuple_list``.

    Args:
        tuple_list: list of ``(word, value)`` tuples such as the ones
            ``fetch_rarity`` returns.

    Returns:
        The arithmetic mean of the values, or 0.0 for an empty list instead
        of raising ``ZeroDivisionError``.
    """
    size = len(tuple_list)
    if size == 0:
        return 0.0
    # Start from 0.0 so the total is accumulated as a float.
    running_total = sum((entry[1] for entry in tuple_list), 0.0)
    return running_total / size

def rare_finder(tuple_list, top, bottom):
    """Return the words whose index value falls inside the rare range.

    ``top`` and ``bottom`` are percentages of the module-level ``dict_size``;
    they are converted to index bounds with ``round``. A word is kept when
    its value lies between those bounds (inclusive), or when its value is 0
    and the word is not "the". 0 is the value ``fetch_rarity`` gives to
    words it cannot find; presumably "the" is excluded because it is the
    table's first entry (index 0) — TODO confirm.

    Args:
        tuple_list: list of ``(word, index_value)`` tuples.
        top: lower bound of the range, as a percentage.
        bottom: upper bound of the range, as a percentage.
    """
    top_index = round(dict_size * (top/100))
    bottom_index = round(dict_size * (bottom/100))
    selected = []
    for entry in tuple_list:
        word, rank = entry[0], entry[1]
        if top_index <= rank <= bottom_index or (rank == 0 and word != "the"):
            selected.append(word)
    return selected

def word_rarity(input_text, mode='w', type='frequency', top=13, bottom=95):
    """Compute rarity information for ``input_text``.

    Modes:
        'w'  word mode: return the ``(word, value)`` list for each unique word.
        'a'  aggregate mode: return the mean value over every token,
             duplicates included.
        's'  set-aggregate mode: return the mean value over unique tokens only.
        'f'  finder mode: return the words inside the rare range; ``type`` is
             forced to 'index' and ``top`` / ``bottom`` are passed to
             ``rare_finder``.

    Any other ``mode`` behaves like 'w'. ``type`` is forwarded to
    ``fetch_rarity``.
    """
    finder_mode = mode == 'f'
    value_kind = 'index' if finder_mode else type

    cleaned = clean_input_text(input_text)
    # Aggregate mode keeps duplicates; every other mode works on unique words.
    tokens = cleaned.split(" ") if mode == 'a' else tokenize(cleaned)
    token_values = fetch_rarity(tokens, value_kind)

    if mode in ('a', 's'):
        return fetch_mean(token_values)
    if finder_mode:
        return rare_finder(token_values, top, bottom)
    return token_values

2023-01-12 11:46:45.295016: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 AVX512F AVX512_VNNI FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.
2023-01-12 11:46:45.447901: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-01-12 11:46:45.488300: E tensorflow/stream_executor/cuda/cuda_blas.cc:2981] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2023-01-12 11:46:46.245975: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libnvinfer.so.7'; 

In [2]:

# Aggregate mode: mean 'frequency' value over every token, duplicates included.
word_rarity("the dog house", mode='a')

0.3377789128563968

In [4]:
# NOTE(review): `s4` is not defined in any cell shown here (there is no
# In [3] cell) — confirm where it comes from before re-running.
word_rarity(s4, mode='a')

0.17227984024747503

In [5]:
# X --> validation, train, test
# Read the whole file into a list of lines; line endings are kept.
with open('wiki.X.raw') as f:
    lines = list(f)

In [6]:
# Prefix every line that starts with " =" with the marker '** ' so the text
# can later be split into sections on that marker.
# NOTE(review): the first line of the file is never inspected; an empty
# string is stored in its place — confirm this is intended.
lines_2 = []
line = ''

for i, raw_line in enumerate(lines):
    if i > 0:
        line = '** ' + raw_line if raw_line.startswith(" =") else raw_line
    lines_2.append(line)

In [7]:
# Join all lines with spaces, cut the text at every '** ' marker and drop
# the newlines, giving one string per piece.
smallerlist = [
    section.replace('\n', '')
    for section in ' '.join(lines_2).split('** ')
]

In [None]:
import pandas as pd
from readability import Readability


# Per-section metrics, collected column by column.
l_sentence = []
l_word_count = []
l_sentence_count = []
l_rarity = []
l_flesch_reading_ease = []

# Number of sections for which at least one metric could not be computed.
i = 0

for s in smallerlist:
    s = str(s)
    # Sections of 120 characters or fewer are skipped.
    if len(s) <= 120:
        continue

    # -1 marks a metric that could not be computed.
    sentence_count_value = -1
    word_count_value = -1
    rarity = -1
    fk_g_l = -1
    try:
        sentence_count_value = sentence_count(s)
        rarity = word_rarity(s, mode='a')
        word_count_value = word_count(s)
        r = Readability(s)
        fk = r.flesch_kincaid()
        fk_g_l = fk.grade_level
    except Exception:
        # Best effort: keep whatever was computed before the failure and
        # count the section as failed. Catching Exception rather than using
        # a bare except lets KeyboardInterrupt / SystemExit stop this loop.
        i = i + 1

    l_sentence.append(s)
    l_word_count.append(word_count_value)
    l_sentence_count.append(sentence_count_value)
    l_rarity.append(round(rarity, 3))
    l_flesch_reading_ease.append(fk_g_l)

# NOTE(review): this rebinds the module-level `df` that fetch_rarity reads.
# The new frame has no 'word' column, so word_rarity cannot be called again
# until the cell that loads word_rarity_list.csv is re-run.
# The 'flesch_reading_ease' column holds the Flesch-Kincaid grade level
# (fk.grade_level), not a Flesch Reading Ease score.
df = pd.DataFrame({
    'sentence': l_sentence,
    'word_count': l_word_count,
    'sentence_count': l_sentence_count,
    'rarity': l_rarity,
    'flesch_reading_ease': l_flesch_reading_ease,
})

df.to_csv('data.csv', index=False)

print(i)

In [None]:
# Upper bounds used by aggregate_list for min-max scaling.
max_word_count = df['word_count'].max()
max_sentence_count = df['sentence_count'].max()
max_rarity = df['rarity'].max()
max_flesch_reading_ease = df['flesch_reading_ease'].max()


# Lower bounds: fixed at 1 for every column except rarity, which uses the
# observed minimum.
min_word_count = 1
min_sentence_count = 1
min_rarity = df['rarity'].min()
min_flesch_reading_ease = 1

# Column totals; not used by any cell shown here.
sum_word_count = df['word_count'].sum()
sum_sentence_count = df['sentence_count'].sum()
sum_rarity = df['rarity'].sum()
sum_flesch_reading_ease = df['flesch_reading_ease'].sum()

# Replace the -1 "could not be computed" placeholder with 14.
# NOTE(review): every statistic above is computed before this replacement and
# still includes rows holding the -1 placeholder (e.g. min_rarity is -1 if
# any row kept it) — confirm this ordering is intended.
df.loc[df['flesch_reading_ease'] == -1, 'flesch_reading_ease'] = 14


In [None]:
def aggregate_list(df):
    """Return one aggregate score per row of ``df``.

    The ``word_count``, ``rarity`` and ``flesch_reading_ease`` columns are
    each min-max scaled with the module-level ``min_*`` / ``max_*`` values,
    and the three scaled values are summed. Sentence count is not part of
    the score.

    The arithmetic is done on whole columns rather than row by row.

    Args:
        df: DataFrame with ``word_count``, ``rarity`` and
            ``flesch_reading_ease`` columns.

    Returns:
        A list with one score per row, in row order.
    """
    word_count_norm = (
        (df['word_count'] - min_word_count)
        / (max_word_count - min_word_count)
    )
    rarity_norm = (df['rarity'] - min_rarity) / (max_rarity - min_rarity)
    flesch_reading_ease_norm = (
        (df['flesch_reading_ease'] - min_flesch_reading_ease)
        / (max_flesch_reading_ease - min_flesch_reading_ease)
    )

    total_norm = word_count_norm + rarity_norm + flesch_reading_ease_norm
    return total_norm.tolist()

In [None]:
# Score every row, then order the rows from the lowest to the highest score.
df['aggregate_values'] = aggregate_list(df)
df2 = df.sort_values(by='aggregate_values')

In [None]:
# Keep the sections longer than two characters, with any newlines removed,
# in the order given by df2.
text = [str(s).replace('\n', '') for s in df2['sentence'] if len(s) > 2]

# X --> validation, train, test
# NOTE(review): this is the same placeholder name as the input file opened
# earlier, so with the same X the input is overwritten. The sections are
# written back to back with no separator or newline between them — confirm
# both are intended.
# The context manager closes the file even if a write raises.
with open("wiki.X.raw", "w") as textfile:
    textfile.writelines(text)