In [155]:
import os
import io
import re
import pandas as pd
import nltk
from os.path import abspath
nltk.download('punkt')


# Load and clean the OneStopEnglishCorpus dataset (see the GitHub link below).

# Main directory for OneStopEnglishCorpus from https://github.com/nishkalavallabhi/OneStopEnglishCorpus
main_path = 'OneStopEnglishCorpus/Texts-SeparatedByReadingLevel/'

# Sub path list for documents containing individual levels
sub_paths = ['Ele-Txt/', 'Int-Txt/', 'Adv-Txt/']

def get_docs_one_stop(document_id=True):
    """Read every OneStopEnglish ``.txt`` file and return its non-empty lines.

    Each folder named in ``sub_paths`` (relative to ``main_path``) is scanned.
    Every text file is decoded as ``utf-8-sig``, split on newlines, and lines
    of length zero are discarded.

    Args:
        document_id: When True each entry is ``(lines, level, doc_id)``;
            otherwise each entry is ``(lines, level)``.

    Returns:
        A list with one tuple per text file. ``level`` is the first three
        characters of the sub-folder name and ``doc_id`` is a counter that
        keeps running across all folders.
    """
    collected = []
    next_id = 0

    for sub_path in sub_paths:
        folder = main_path + sub_path
        reading_level = sub_path[:3]
        for raw_name in os.listdir(os.fsencode(folder)):
            name = os.fsdecode(raw_name)
            if not name.endswith('.txt'):
                continue
            with io.open(folder + name, encoding='utf-8-sig') as handle:
                segments = [seg for seg in handle.read().split('\n') if seg]
            if document_id:
                entry = (segments, reading_level, next_id)
            else:
                entry = (segments, reading_level)
            collected.append(entry)
            # The counter advances for every text file, with or without the id in the tuple.
            next_id += 1

    return collected

def split_docs_one_stop(document_tuple, document_id=True):
    """Break each document segment into sentences and compute per-segment statistics.

    Args:
        document_tuple: Iterable of ``(segments, level)`` or
            ``(segments, level, doc_id)`` tuples, as produced by
            ``get_docs_one_stop``.
        document_id: When True the third tuple item is read and appended to
            every record. When False the input tuples may omit it.

    Returns:
        A list with one tuple per segment that contains at least one sentence:
        ``(segment_text, sentences, avg_words_per_sentence, sentence_count,
        total_words, words_per_sentence, level[, doc_id])``. The average is
        rounded to two decimals and words are whitespace-separated tokens.
    """
    separated_doc = []
    for doc in document_tuple:
        segments, level = doc[0], doc[1]
        # Only touch the id slot when asked for it, so 2-item tuples work
        # with document_id=False instead of raising IndexError.
        doc_id = doc[2] if document_id else None
        for segment in segments:
            sentences = nltk.sent_tokenize(segment)
            if not sentences:
                # Nothing to measure; skipping avoids a division by zero below.
                continue
            words_per_sentence = [len(sen.split()) for sen in sentences]
            total_words = sum(words_per_sentence)
            num_of_sen = len(sentences)
            avg_num_words = round(total_words / num_of_sen, 2)
            sent_string = " ".join(sentences).strip()
            record = (sent_string, sentences, avg_num_words, num_of_sen,
                      total_words, words_per_sentence, level)
            if document_id:
                record += (doc_id,)
            separated_doc.append(record)

    return separated_doc

def get_one_stop_dataframe(levels=True, document_id=True):
    """Build the OneStopEnglishCorpus DataFrame, one row per document segment.

    Args:
        levels: When True the ``level`` column keeps the string labels
            ('Ele', 'Int', 'Adv'). When False it is converted to numeric
            levels: Ele = 0, Int = 1, anything else = 2.
        document_id: When True a ``doc_id`` column identifying the source
            document is included.

    Returns:
        A pandas DataFrame with the columns ``documents``, ``doc_list``,
        ``avg_num_words``, ``total_num_sents``, ``total_num_words``,
        ``words_per_sents``, ``level`` and, optionally, ``doc_id``. Backticks
        and right single quotes in ``documents`` are replaced by a plain
        apostrophe.
    """
    columns = ['documents', 'doc_list', 'avg_num_words', 'total_num_sents',
               'total_num_words', 'words_per_sents', 'level']
    if document_id:
        columns.append('doc_id')

    # Always load documents with their id, then let the splitter drop it on
    # request. Previously the flag was not forwarded, so document_id=False
    # produced 8-field records for 7 column names and DataFrame creation failed.
    records = split_docs_one_stop(get_docs_one_stop(document_id=True),
                                  document_id=document_id)
    df = pd.DataFrame.from_records(records, columns=columns)

    df['documents'] = df['documents'].apply(lambda text: re.sub("`|’", "'", text))

    if not levels:
        level_codes = {'Ele': 0, 'Int': 1}
        # Any label other than 'Ele' or 'Int' maps to 2, as before.
        df['level'] = [level_codes.get(level, 2) for level in df['level']]

    return df


# CEFR and Cambridge Reading Related Functions

# Path or file where the reading-level files are located
main_directory = 'Readability_dataset/'

# Sub-directories holding the reading files, one per exam, ordered to match the CEFR levels from A2 to C2
level_directories = ['KET/', 'PET/', 'FCE/', 'CAE/', 'CPE/']

# List of CEFR Ratings from beginner (A2) to Advanced (C2)
cefr_ratings = ['A2', 'B1', 'B2', 'C1', 'C2']


# First lines longer than this many characters are treated as body text, not titles.
_FIRST_LINE_TITLE_MAX_LEN = 45

# First lines over the length limit that are titles anyway and must be deleted.
_FIRST_LINE_DELETE_PREFIXES = ('Careless tourists', 'Build it your', 'Explore', 'BROAD')

# First lines under the length limit that belong to the document and must be kept.
_FIRST_LINE_KEEP_PREFIXES = ('Dear', 'To:', 'TO:')

# Edge cases and list-header lines (by letter) that are dropped from the body.
_OTHER_LINE_DELETE_PREFIXES = (
    'A report by', 'by', 'By Kat', 'Memo', 'Itinerary', 'Ivan Pet', 'Peter Pres', 'Stuart Har',
    'Publisher ', 'transition', 'BOOK', 'Office ', 'Women on', 'Easter quiz', 'A Jen', 'B Mich',
    'C Lisa', 'D Barb', 'E Kim', 'A.', 'B.', 'C.', 'D.', 'E.', 'F.', 'G.', 'H.', 'I.', 'J.',
)


def _should_skip_first_line(line):
    """Return True when the first line of a file looks like a title to discard."""
    # The keep-list has the final say, exactly as in the original rule order.
    if line.startswith(_FIRST_LINE_KEEP_PREFIXES):
        return False
    return (
        len(line) <= _FIRST_LINE_TITLE_MAX_LEN
        or line.startswith(_FIRST_LINE_DELETE_PREFIXES)
        or line.isupper()
    )


def _clean_body_line(line):
    """Return the part of a raw body line to keep, or '' when the line is dropped."""
    # NOTE(review): the original also tested
    # `line[0].isdigit() and line.startswith(' 10 kilometers')`, which can never
    # be true, so date/address lines were never removed here. That behaviour is
    # kept so the cleaned output does not change; confirm the intended rule.

    # List header items with a leading space (e.g. ' B.' plus newline) have length 3.
    if len(line) == 3:
        print('Removed:', line)
        return ''

    # All upper-case lines are usually names or headings.
    if line.isupper():
        print('Removed:', line)
        return ''

    # Edge case: only the 'E. ' marker is removed, the rest of the sentence is kept.
    if line.startswith('E. Jane'):
        print('Removed:', line[:3])
        return line[3:]

    if line.startswith(_OTHER_LINE_DELETE_PREFIXES):
        print('Removed:', line)
        return ''

    return line


def _read_clean_document(file_path):
    """Read one file and return (cleaned_text, first_line_length, first_line_removed)."""
    kept_lines = []
    # The context manager closes the file even if cleaning raises.
    with open(file_path, 'r') as handle:
        line = handle.readline()
        first_line_len = len(line)

        first_line_removed = _should_skip_first_line(line)
        if first_line_removed:
            print('Removed First line:', line)
            line = handle.readline()

        while line != '':
            cleaned = _clean_body_line(line)
            if cleaned:
                # Whitespace-only lines are kept as empty pieces, so the
                # joined text has the same spacing as before.
                kept_lines.append(cleaned.strip())
            line = handle.readline()

    return " ".join(kept_lines).strip(), first_line_len, first_line_removed


def process_directory(cefr=True):
    """Clean the CEFR reading files and return three parallel lists.

    Every ``.txt`` file in each folder of ``level_directories`` (under
    ``main_directory``) is read. Title-like first lines, most list header
    items, all upper-case lines and a set of known edge-case lines are
    removed; the remaining lines are stripped and joined with single spaces.
    Progress and every removed line are printed.

    Args:
        cefr: When True the level of each document is taken from
            ``cefr_ratings``. When False it is the integer position of the
            folder in ``level_directories`` (0 to 4 for the five folders),
            suitable for multinomial classification.

    Returns:
        A tuple ``(documents, cefr_levels, first_line_lens)`` where
        ``documents`` holds the cleaned text of each file, ``cefr_levels`` the
        matching level, and ``first_line_lens`` the length of each file's
        original first line, including its line ending.
    """
    documents = []
    cefr_levels = []
    first_line_lens = []

    for level, directory_name in enumerate(level_directories):
        path = main_directory + directory_name
        count = 0
        first_line_delete_count = 0
        print(f'Currently processing: {path[-4:-1]}\n')

        for filename in os.listdir(path):
            if not filename.endswith('.txt'):
                continue

            cefr_levels.append(cefr_ratings[level] if cefr else level)

            document, first_line_len, first_line_removed = _read_clean_document(path + filename)
            first_line_lens.append(first_line_len)
            if first_line_removed:
                first_line_delete_count += 1
            documents.append(document)
            count += 1

        print(f'{path[-4:-1]} has {count} files\n')
        print(f'Number of First Line Deletions: {first_line_delete_count}\n')

    return documents, cefr_levels, first_line_lens

# Original parsing function; kept for reference but not currently used in the Jupyter notebooks.

def parse_directory(cefr=True):
    """Parse the CEFR reading files without cleaning and return four items.

    Every ``.txt`` file in each folder of ``level_directories`` (under
    ``main_directory``) is read. Progress is printed per folder.

    Args:
        cefr: When True the level of each document is taken from
            ``cefr_ratings``. When False it is the integer position of the
            folder in ``level_directories`` (0 to 4 for the five folders),
            suitable for multinomial classification.

    Returns:
        A tuple ``(documents, documents_list, cefr_levels, words_string)``:
        ``documents`` is a list of lists with the stripped, non-empty lines of
        each file; ``documents_list`` holds each file as one string with a
        leading first line followed by a blank line removed and newlines
        replaced by spaces; ``cefr_levels`` holds the matching levels; and
        ``words_string`` is every stripped line of every file joined by
        spaces.
    """
    documents = []
    documents_list = []
    cefr_levels = []
    # Starting with an empty piece keeps the leading space the original
    # incremental join produced, without rebuilding the string on every line.
    word_chunks = ['']

    for level, directory_name in enumerate(level_directories):
        path = main_directory + directory_name
        print(f'Currently processing: {path[-4:-1]}')
        count = 0

        for filename in os.listdir(path):
            if not filename.endswith('.txt'):
                continue

            # Read the file once; the context manager guarantees it is closed.
            with open(path + filename, 'r') as handle:
                raw_lines = handle.readlines()

            # The whole reading as one string, minus a title line that is
            # followed by a blank line.
            complete_file = re.sub(r'^.+?\n\n', '', ''.join(raw_lines))
            documents_list.append(complete_file.replace('\n', ' '))

            cefr_levels.append(cefr_ratings[level] if cefr else level)

            stripped_lines = [raw.strip() for raw in raw_lines]
            # The per-document list keeps only non-empty lines.
            documents.append([text for text in stripped_lines if text])
            # The combined string keeps empty lines too, as before.
            word_chunks.extend(stripped_lines)
            count += 1

        print(f'{path[-4:-1]} has {count} files')

    return documents, documents_list, cefr_levels, " ".join(word_chunks)


def cefr_to_data_frame(col1, col2, col1_name='documents', col2_name='cefr_level'):
    """Combine two parallel sequences into a two-column pandas DataFrame.

    ``col1`` is expected to hold the reading texts and ``col2`` the level
    rating of each reading; the column labels default to ``documents`` and
    ``cefr_level``.
    """
    frame_columns = {}
    frame_columns[col1_name] = col1
    frame_columns[col2_name] = col2
    return pd.DataFrame(frame_columns)
    

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ravis\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


In [156]:
documents, levels, first_line_lens = process_directory()

Currently processing: KET

Removed First line: New books this month

Removed First line: BURGLARS LOVE THE AFTERNOON

Removed First line: Madame Tussaud's

Removed First line: Memo 

Removed: Memo

Removed First line: CANADA GEESE

Removed First line: BICYCLES 

Removed First line: ESTHER'S STORY

Removed First line: A HISTORY OF AIR TRAVEL 

Removed First line: REBECCCA STEVENS

Removed First line: Schnauzer Dogs

Removed First line: THE OSTRICH 

Removed First line: WORKING FOR AN AIRLINE

Removed First line: The Ruwenzori Mountains 

Removed First line: HOWARD BONNIER

Removed First line: Line dancing 

Removed First line: Bill Prince-Smith

Removed First line: Otters 

Removed First line: 28 Long Road Brighton 

Removed First line: TO ALL STUDENTS:

Removed First line: HOW DO THE IRISH POP-GROUP 'BOYZONE' LIVE A HEALTHY LIFE?

Removed First line: The Cerne Giant 

Removed First line: John Pickering

Removed First line: POSTCARDS 

Removed First line: Bill Bryson

Removed First line

In [157]:
# Final manual inspection after cleaning: dump every cleaned document,
# each followed by blank lines so the texts are easy to tell apart.
for cleaned_text in cefr_to_data_frame(documents, levels)['documents']:
    print(cleaned_text, end='\n\n\n')

The Long Night This is David Reilly's first book. David became a writer after teaching English for several years. Maha is a nurse in northern Australia, where she works in a small hospital. One day a baby is so ill that Maha has to drive all night to get her to the nearest big city. They have a lot of problems getting there and ...  Hard Work This exciting story is Joanna Jones's twentieth. Hard Work is about Sombat, who works with his father, a carpenter, in Thailand. They work long, hard hours making tables and chairs, but they do not have any money. Then one day a man dressed all in black buys the most beautiful table in the shop ...  Hospital or Cinema? Marcie Jacome, who studies English in London, wrote this story earlier this year. Tina is a young Brazilian woman whose dream is to become a doctor. She goes to London to study English and medicine but one day she meets a man who asks her to go to the USA with him to become a film star ... What will Tina do?


Most house burglaries 

In [158]:
dataframe = cefr_to_data_frame(documents, levels)

In [159]:
dataframe

Unnamed: 0,documents,cefr_level
0,The Long Night This is David Reilly's first bo...,A2
1,Most house burglaries happen between 2 p.m. an...,A2
2,One very famous place for tourists in London i...,A2
3,To: All staff Subject: Holidays From: D Brown ...,A2
4,Canada Geese are large blue and white birds. W...,A2
...,...,...
326,"Some time ago, a website highlighted the risks...",C2
327,"A course at the Bamboo Bike Club, run by engin...",C2
328,The two sisters kept Lily's driving a secret f...,C2
329,I have never begun a novel with more misgiving...,C2


In [160]:
# Write the cleaned data to csv files

# String CEFR ratings (A2 ... C2).
dataframe.to_csv('data/cefr_readings.csv', index=False)

# Integer levels (A2 = 0 ... C2 = 4), the same coding process_directory(cefr=False)
# uses. Previously this file was an exact copy of the string-rated one.
numeric_levels = {rating: index for index, rating in enumerate(cefr_ratings)}
numeric_dataframe = dataframe.assign(cefr_level=dataframe['cefr_level'].map(numeric_levels))
numeric_dataframe.to_csv('data/cefr_readings_numeric.csv', index=False)

In [161]:
one_stop_df = get_one_stop_dataframe()

In [162]:
one_stop_df

Unnamed: 0,documents,doc_list,avg_num_words,total_num_sents,total_num_words,words_per_sents,level,doc_id
0,"When you see the word Amazon, what's the first...","[When you see the word Amazon, what’s the firs...",35.0,1,35,[35],Ele,0
1,These are the questions in a debate about the ...,[These are the questions in a debate about the...,15.0,2,30,"[10, 20]",Ele,0
2,Amazon has asked for its company name to be a ...,[Amazon has asked for its company name to be a...,40.0,1,40,[40],Ele,0
3,"There are many other disputed claims to names,...",[There are many other disputed claims to names...,10.0,1,10,[10],Ele,0
4,"Until now, the differences between commercial,...","[Until now, the differences between commercial...",30.0,1,30,[30],Ele,0
...,...,...,...,...,...,...,...,...
7390,“We believe zero-hours contracts are essential...,[“We believe zero-hours contracts are essentia...,23.0,2,46,"[18, 28]",Adv,566
7391,Politician Chuka Umunna said the contracts sho...,[Politician Chuka Umunna said the contracts sh...,19.0,2,38,"[13, 25]",Adv,566
7392,Some people have argued that the flexibility o...,[Some people have argued that the flexibility ...,52.0,1,52,[52],Adv,566
7393,The institute's figures also suggest that 17% ...,[The institute’s figures also suggest that 17%...,36.0,1,36,[36],Adv,566


In [163]:
one_stop_df.to_csv('data/one_stop.csv', index=False)

In [167]:
# Spot-check every second segment among rows 6800-7299 of the OneStop data.
sample_labels = range(6800, 7300, 2)
for row_label in sample_labels:
    print(one_stop_df['documents'][row_label])

David Cameron has declared a “clear result” in the Scottish independence referendum after Scotland voted by a 10.6-point margin against ending the 307-year-old union with England and Wales. The prime minister promised a devolution revolution across Great Britain, including votes on English issues by English MPs at Westminster, as he welcomed Scotland's decision to remain inside the UK. “There can be no disputes, no reruns – we have heard the settled will of the Scottish people,” Cameron said in a statement
The yes campaign scored four big successes, winning 53% of the vote in Scotland's largest city, Glasgow, 57% in Dundee and 51% in North Lanarkshire. However, the no camp was victorious in 28 authorities. It won overwhelmingly in areas where it was expected to do well, including Edinburgh, Aberdeenshire and Borders, but also in areas that could have gone to the yes campaign, including the Western Isles. In the final count, the no camp won 2,001,926 votes (55.3%) to 1,617,989 for yes (

### Data Cleaning Inspection 
Below are some samples from my earlier data inspection, which informed how I cleaned the data and wrote the `process_directory` function.

In [168]:
# Inspection of first lines, used to pick the cut-off length for title lines:
# show the index, the first-line length and that many leading characters of
# the cleaned document for every first line longer than 45 characters.
for doc_index, first_len in enumerate(first_line_lens):
    if first_len <= 45:
        continue
    print(doc_index, first_len)
    print(documents[doc_index][:first_len])

24 58
Sleeping well is very important. When I can get home to my
70 50
Like any other university, the Open University can
71 55
Millions of people of all ages enjoy a hobby which is b
73 46
When Bo the cat decided to explore a furniture
98 59
In the 1880s, gold was discovered in what is now the most n
143 445
Miss Rita Cohen, a tiny, pale-skinned girl who looked half the age of Seymour's daughter, Marie, but claimed to be some six years older, came to his facto1y one day. She was dressed in overalls and ugly big shoes, and a bush of wiry hair framed her pretty face. She was so tiny, so young that he could barely believe that she was at the University of Pennsylvania, doing research into the leather industry in New Jersey for her Master's degree. 
144 52
It's true - we're all getting too big for our boots 
145 52
It's true - we're all getting too big for our boots 
154 392
There is nothing more disappointing than arriving at an airport overseas to discover that your baggage has been lef

In [171]:
# Inspection code before final inspection

# First pass: documents 0 through 190 (or all of them if there are fewer).
for doc_number, text in enumerate(documents[:191]):
    print('=============Document {n} ==========='.format(n=doc_number))
    print(text)

# Second pass: documents 200 through 299.
for doc_number in range(200, 300):
    print('==========Document {n} ========='.format(n=doc_number))
    print(documents[doc_number])

The Long Night This is David Reilly's first book. David became a writer after teaching English for several years. Maha is a nurse in northern Australia, where she works in a small hospital. One day a baby is so ill that Maha has to drive all night to get her to the nearest big city. They have a lot of problems getting there and ...  Hard Work This exciting story is Joanna Jones's twentieth. Hard Work is about Sombat, who works with his father, a carpenter, in Thailand. They work long, hard hours making tables and chairs, but they do not have any money. Then one day a man dressed all in black buys the most beautiful table in the shop ...  Hospital or Cinema? Marcie Jacome, who studies English in London, wrote this story earlier this year. Tina is a young Brazilian woman whose dream is to become a doctor. She goes to London to study English and medicine but one day she meets a man who asks her to go to the USA with him to become a film star ... What will Tina do?
Most house burglaries ha