In [6]:
import csv

In [3]:
def append_period(file_path):
    """Rewrite a text file in place so that every line ends with a period.

    Trailing whitespace (the newline included) is removed from each line,
    a '.' is appended when the remaining text does not already end with
    one, and the line is written back followed by a single newline.

    A blank line does not end with '.', so it is written back as a line
    containing only '.'.

    Args:
        file_path: Path of the text file to read and then overwrite.
    """
    with open(file_path, 'r') as source:
        trimmed = [raw.rstrip() for raw in source]

    with open(file_path, 'w') as target:
        for text in trimmed:
            suffix = '' if text.endswith('.') else '.'
            target.write(text + suffix + '\n')

In [5]:
def remove_empty_lines(file_path):
    """Drop blank lines from a text file, rewriting it in place.

    A line counts as blank when nothing is left after stripping
    whitespace. Every other line is written back exactly as it was read.

    Args:
        file_path: Path of the text file to read and then overwrite.
    """
    with open(file_path, 'r') as source:
        # str.strip returns '' (falsy) for whitespace-only lines.
        kept = list(filter(str.strip, source))

    with open(file_path, 'w') as target:
        target.write(''.join(kept))


In [8]:
def combine_lines(file_path):
    """Merge labelled field lines into single-line records, in place.

    Blank lines are ignored. For every other line, the text before its
    first ':' is compared against a fixed set of field labels:

    * If it is one of the labels and a record is already being built,
      the stripped line is appended to that record, joined by '. '.
    * Otherwise the record built so far (if any) is finished and the
      stripped line starts a new record.

    The file is then overwritten with one record per line.

    Args:
        file_path: Path of the text file to read and then overwrite.
    """
    field_labels = {"Summary", "Level", "FRE", "Total words", "Total sentences", "Total syllables"}

    with open(file_path, 'r') as source:
        stripped = [raw.strip() for raw in source]

    records = []
    current = ""

    for text in stripped:
        if not text:
            continue

        label = text.partition(':')[0].strip()
        if current and label in field_labels:
            # Continuation of the record that is currently being built.
            current = current.rstrip() + '. ' + text
            continue

        # Any other line closes the open record and begins the next one.
        if current:
            records.append(current)
        current = text

    if current:
        records.append(current)

    with open(file_path, 'w') as target:
        target.writelines(record + '\n' for record in records)

In [33]:
def replace_duplicates(file_path):
    """Replace doubled sentence punctuation in a file with '|' separators.

    The substitutions are applied to the whole file content, in this
    order: '..' -> '.|', '?.' -> '?|', '!.' -> '!|', and finally
    '| ' -> '|' (dropping a space that directly follows a '|').
    The file is overwritten with the result.

    Args:
        file_path: Path of the text file to read and then overwrite.
    """
    # Order matters: the last rule acts on the '|' characters that the
    # first three rules have just introduced.
    substitutions = (
        ('..', '.|'),
        ('?.', '?|'),
        ('!.', '!|'),
        ('| ', '|'),
    )

    with open(file_path, 'r') as source:
        text = source.read()

    for old, new in substitutions:
        text = text.replace(old, new)

    with open(file_path, 'w') as target:
        target.write(text)

In [39]:
def parse_to_csv(input_file, output_file):
    """Extract sentence, summary and level from each record into a CSV.

    Every input line is searched for the markers 'Summary:' and 'Level:'.
    Three fields are taken from the line and written as one CSV row under
    the header ``original_sentence, summary, level``:

    * original_sentence: the text between the leading
      'Original sentence: ' prefix (its length is skipped without
      checking the prefix itself) and 'Summary:'.
    * summary: the text between 'Summary:' and 'Level:'.
    * level: the text after 'Level:' up to the next '.', or to the end
      of the line when there is none. The whole value is kept, so a
      level such as '10' is no longer cut down to its first character.

    All three fields are stripped of surrounding whitespace. Lines that
    lack either marker are skipped: ``str.find`` returns -1 for them and
    slicing with that value would produce meaningless fields.

    Args:
        input_file: Path of the text file holding one record per line.
        output_file: Path of the CSV file to create or overwrite.
    """
    summary_marker = 'Summary:'
    level_marker = 'Level:'
    sentence_start = len('Original sentence: ')

    with open(input_file, 'r') as infile, open(output_file, 'w', newline='') as outfile:
        writer = csv.DictWriter(outfile, fieldnames=[
            'original_sentence', 'summary', 'level'])
        writer.writeheader()

        for line in infile:
            summary_idx = line.find(summary_marker)
            level_idx = line.find(level_marker)
            if summary_idx == -1 or level_idx == -1:
                # Not a complete record; -1 must never reach the slices below.
                continue

            original_sentence = line[sentence_start:summary_idx].strip()
            summary = line[summary_idx + len(summary_marker):level_idx].strip()

            # Read the level up to the period that ends it instead of a
            # fixed one-character window.
            level_text = line[level_idx + len(level_marker):]
            level = level_text.split('.', 1)[0].strip()

            writer.writerow({
                'original_sentence': original_sentence,
                'summary': summary,
                'level': level,
            })


In [40]:
# Build the CSV for each per-sentence-count summary file.
# The three cleanup steps each rewrite the .txt file in place and must run
# in this order before the file is parsed into the .csv beside it.
sentence_counts = list(range(1, 7))

for sentence_count in sentence_counts:
    input_file = f'by_sentence_summary/{sentence_count}_sentence_summary/{sentence_count}_sentence_summary.txt'
    output_file = f'by_sentence_summary/{sentence_count}_sentence_summary/{sentence_count}_sentence_summary.csv'
    for cleanup_step in (remove_empty_lines, combine_lines, replace_duplicates):
        cleanup_step(input_file)
    parse_to_csv(input_file, output_file)

In [41]:
# Build the CSV for each per-word-count summary file.
# The three cleanup steps each rewrite the .txt file in place and must run
# in this order before the file is parsed into the .csv beside it.
word_counts = [10, 20, 25, 30, 40, 50]

for word_count in word_counts:
    input_file = f'by_word_summary/{word_count}_word_summary/{word_count}_word_summary.txt'
    output_file = f'by_word_summary/{word_count}_word_summary/{word_count}_word_summary.csv'
    for cleanup_step in (remove_empty_lines, combine_lines, replace_duplicates):
        cleanup_step(input_file)
    parse_to_csv(input_file, output_file)

The cell below is for experimentation purposes only: it tries the field-extraction slicing on a single sample line.

In [14]:
# Scratch check of the slicing offsets on one hand-written sample record.
line = "Original sentence: Obvious is what is a cryptocurrency? Summary: What is a cryptocurrency? Level: 10. FRE: 122. Total words: 4. Total sentences: 1. Total syllables: 7."

summary_idx = line.find('Summary:')
level_idx = line.find('Level:')

# 19 skips the leading "Original sentence: "; 9 skips "Summary: ".
original_sentence = line[19:summary_idx]
summary = line[summary_idx + 9:level_idx]

# The level starts 7 characters after the marker ("Level: "). It is taken
# as one character when a '.' follows immediately, otherwise as two.
level_start = level_idx + 7
level_width = 1 if line[level_idx + 8] == '.' else 2
level = line[level_start:level_start + level_width]

print(original_sentence, summary, level, sep='\n')

Obvious is what is a cryptocurrency? 
What is a cryptocurrency? 
10
