Basic Data corpus scan

In [12]:
import os

# Corpus data path
folder_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/Raw_data_corpus'

# List all files using os module
files = os.listdir(folder_path)
print(f"\nTotal files: {len(files)}\n\nSample Raw File name display:")
for i in range(len(files)):
    print(f"File Name: {files[i]}")
    if i == 5:
        break # Displaying first 5 file names as sample



Total files: 200

Sample Raw File name display:
File Name: 000128cbd36642ced67ac90bd7d4d1dd5e8cf554.story
File Name: 0001d1afc246a7964130f43ae940af6bc6c57f01.story
File Name: 00027e965c8264c35cc1bc55556db388da82b07f.story
File Name: 0002c17436637c4fe1837c935c04de47adb18e9a.story
File Name: 0005d61497d21ff37a17751829bd7e3b6e4a7c5c.story
File Name: 00077395f92430e209a0b3f781b143b5e9af2348.story


Meta Data exploration of the raw files

In [14]:
#from collections import Counter

file_stats = []

# looping through the file list created above
for file in files:
    file_path = os.path.join(folder_path, file) # create the file path for each file iteratively
    with open(file_path, 'r', encoding='utf-8') as f: # reading the current file
        text = f.read()
        words = text.split() # basic split on white space
        word_count = len(words)
        unique_word_count = len(set(words))
        char_count = len(text)

        # Making the list of dictionary for th file meta data
        file_stats.append({
            'file_name': file,
            'word_count': word_count,
            'unique_word_count': unique_word_count,
            'char_count': char_count
        })

# Displaying the stats
import pandas as pd
file_stats_df = pd.DataFrame(file_stats)
print(file_stats_df.head())  # Preview first few entries

# Csv output path
output_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/file_stats_selected_raw.csv'

# File stats logged into the created csv
file_stats_df.to_csv(output_path, index=False)

                                        file_name  word_count  \
0  000128cbd36642ced67ac90bd7d4d1dd5e8cf554.story        1499   
1  0001d1afc246a7964130f43ae940af6bc6c57f01.story        1586   
2  00027e965c8264c35cc1bc55556db388da82b07f.story        1054   
3  0002c17436637c4fe1837c935c04de47adb18e9a.story         986   
4  0005d61497d21ff37a17751829bd7e3b6e4a7c5c.story        1279   

   unique_word_count  char_count  
0                648        9262  
1                710        9781  
2                515        6494  
3                485        6223  
4                612        8045  


EDA: word count and unique word distribution

In [3]:
import os
import pandas as pd

# Data path
folder_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/Raw_data_corpus'

# Initialize variables to store unique words and total word count
unique_words = set()
total_words = 0

# Iterate over each file to update total word count and unique words
for file in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        words = text.split()  # Split text into words
        total_words += len(words)  # Add to total word count
        unique_words.update(words)  # Add words to the unique word set

# Calculate total unique words
total_unique_words = len(unique_words)

# Create a DataFrame to display stats in a table format
stats_df = pd.DataFrame({
    "Statistic": ["Total Words", "Total Unique Words"],
    "Count": [total_words, total_unique_words]
})

# Print the table
print(stats_df)



            Statistic   Count
0         Total Words  252381
1  Total Unique Words   37100


Check if all the highlight sections have some text under them

In [None]:
# List to store file names matching the criteria
files_with_highlight = []
files_without_highlight = []

# Define the highlight pattern you are looking for
highlight_pattern = "@highlight"

# Check each file
for filename in os.listdir(folder_path):
    if filename.endswith('.story'):
        filepath = os.path.join(folder_path, filename)
        
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # Check if '@highlight' followed by the summary exists
            if highlight_pattern in content:
                # Further check if there's any content after @highlight
                highlight_section = content.split(highlight_pattern)[-1].strip()
                
                # Check if thereâ€™s text after @highlight
                if highlight_section:
                    files_with_highlight.append(filename)
                else:
                    files_without_highlight.append(filename)
            else:
                files_without_highlight.append(filename)

# Summary of files
print(f"Files with '@highlight' section: {len(files_with_highlight)}")
print(f"Files without '@highlight' section: {len(files_without_highlight)}")


Files with '@highlight' section: 200
Files without '@highlight' section: 0


Imports for nltk and spacy. Downloading the stopwords and spacy small model.

In [16]:
import nltk
import spacy

# Download stop words and load the spaCy language model
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Data Preprocessing custom functions:

Actions performed

1. Process the data section and implement spell correction
2. Process the highlights and implement spell correction

Pyspell checker implementation

In [4]:
# Data imports
import os
import re
import sqlite3
from spellchecker import SpellChecker
import nltk
from nltk.corpus import stopwords
import spacy
from collections import namedtuple

# initializing spacy small model
nlp = spacy.load('en_core_web_sm')

# Downloading and initializing the nltk stopwords corpus
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

# SpellChecker instance creation
spell = SpellChecker()

# Raw data file path to ingest data .story files
folder_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/Raw_data_corpus'

# Empty named tuple for effective data retrieval
ProcessedText = namedtuple('ProcessedText', ['text', 'corrections_applied', 'word_count'])

# creation of db instance
conn = sqlite3.connect("processed_articles.db")
cursor = conn.cursor()

# Database table creation
cursor.execute("""
CREATE TABLE IF NOT EXISTS articles (
    filename TEXT,
    main_article TEXT,
    highlights TEXT,
    main_corrections_applied INTEGER,
    highlight_corrections_applied INTEGER,
    main_word_count INTEGER,
    highlight_word_count INTEGER
)
""")
conn.commit()

# Data preprocessor and cleaner function
def text_preprocessor(text, lower=True, remove_stopwords=True, lemmatize=True):
    text = re.sub(r'\s+', ' ', text)  # Remove extra whitespace and newlines
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)  # Remove special characters
    text = text.strip()
    if lower: # lower case flag
        text = text.lower()
    if remove_stopwords: # stop words flag
        words = [word for word in text.split() if word not in stop_words]
    else:
        words = text.split()
    if lemmatize: # Lemma flag
        doc = nlp(" ".join(words))
        words = [token.lemma_ for token in doc]
    return " ".join(words)

# custom function for spell checker
def custom_py_spell_check(text):
    words = text.split()
    corrections_applied = False
    corrected_words = []
    
    for word in words:
        corrected = spell.correction(word)
        if corrected is None:  # if no correction, use the original word
            corrected = word
        elif corrected != word:  # If a correction was applied, mark it
            corrections_applied = True
        corrected_words.append(corrected)
        
    return " ".join(corrected_words), corrections_applied

# function to implement the preprocessing on articles and highlights
def process_article(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()

    # Split the content into main article and highlights using @highlight as delimiter
    parts = content.split("@highlight")
    main_article = parts[0].strip()
    highlights = [highlight.strip() for highlight in parts[1:]]

    # Main Article preprocessor
    main_article_cleaned = text_preprocessor(main_article)
    main_article_spellchecked, main_corrections = custom_py_spell_check(main_article_cleaned)
    main_word_count = len(main_article_spellchecked.split())

    # Process the Highlights
    highlights_cleaned = []
    highlight_corrections = False
    for highlight in highlights:
        cleaned_highlight = text_preprocessor(highlight, lower=False, remove_stopwords=False, lemmatize=False)
        spellchecked_highlight, corrections = custom_py_spell_check(cleaned_highlight)
        highlights_cleaned.append(spellchecked_highlight)
        if corrections:
            highlight_corrections = True

    highlights_text = " ".join(highlights_cleaned)
    highlight_word_count = len(highlights_text.split())

    # returning the clean text text as two parts
    return ProcessedText(
        text=main_article_spellchecked,
        corrections_applied=main_corrections,
        word_count=main_word_count
    ), ProcessedText(
        text=highlights_text,
        corrections_applied=highlight_corrections,
        word_count=highlight_word_count
    )

# Iterating over files in the directory
for filename in os.listdir(folder_path):
    if filename.endswith('.story'):
        file_path = os.path.join(folder_path, filename)
        main_article, highlights = process_article(file_path)

        # insertion into data base
        cursor.execute("""
        INSERT INTO articles (filename, main_article, highlights, main_corrections_applied, 
                              highlight_corrections_applied, main_word_count, highlight_word_count)
        VALUES (?, ?, ?, ?, ?, ?, ?)
        """, (filename, main_article.text, highlights.text, int(main_article.corrections_applied),
              int(highlights.corrections_applied), main_article.word_count, highlights.word_count))
        conn.commit()

# Close the database connection
conn.close()

# simple indicator
print("Processing and insertion completed.")


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Processing and insertion completed.


In [12]:
# panda import
import sqlite3
import pandas as pd

# data base path
db_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/processed_articles.db'

# open the connection to the data base
conn = sqlite3.connect(db_path)

# create the pandas data frame
query = "SELECT * FROM articles"
df_processed_data = pd.read_sql_query(query, conn)

# Close the connection after loading the data
conn.close()

# Display the DataFrame
print(df_processed_data.columns)

print(df_processed_data.head())

Index(['filename', 'main_article', 'highlights', 'main_corrections_applied',
       'highlight_corrections_applied', 'main_word_count',
       'highlight_word_count'],
      dtype='object')
                                         filename  \
0  000128cbd36642ced67ac90bd7d4d1dd5e8cf554.story   
1  0001d1afc246a7964130f43ae940af6bc6c57f01.story   
2  00027e965c8264c35cc1bc55556db388da82b07f.story   
3  0002c17436637c4fe1837c935c04de47adb18e9a.story   
4  0005d61497d21ff37a17751829bd7e3b6e4a7c5c.story   

                                        main_article  \
0  japan's official today say ready provide antif...   
1  official us president barrack mama want lawmak...   
2  canvas city missouri can general services admi...   
3  lot angels can medical doctor vancouver brutis...   
4  can four group advocate immigrant right say th...   

                                          highlights  \
0  japan's chief cabinet secretary Yoshihide suga...   
1  syria official mama climbed to the top 

Plotly plots for pecentages of articles corrected

In [15]:
import plotly.graph_objects as go


# Getting the counts and sorting them for the pie charts
main_corrections = df_processed_data['main_corrections_applied'].value_counts().sort_index()
highlight_corrections = df_processed_data['highlight_corrections_applied'].value_counts().sort_index()

# setting the labels for the plot
labels = ['No Corrections', 'Corrections Applied']

# Pie Chart for Main Article Corrections
fig_main_corrections = go.Figure( # plotly go object
    data=[go.Pie(
        labels=labels,
        values=main_corrections,
        hole=0.5, # center ring size
        textinfo='percent+label'
    )]
)
fig_main_corrections.update_layout(title_text="Main Article Spell Corrections") # plot for main article text corrections applied

# Pie Chart for Highlight Corrections
fig_highlight_corrections = go.Figure(
    data=[go.Pie(
        labels=labels,
        values=highlight_corrections,
        hole=0.5, # center ring size
        textinfo='percent+label'
    )]
)
fig_highlight_corrections.update_layout(title_text="Highlight Spell Corrections") # plot for highlight text corrected

# plotting the graphs
fig_main_corrections.show()
fig_highlight_corrections.show()

In [14]:
# Check for any missing (NaN or None) values in each column
empty_entries = df_processed_data.isnull().sum()

# Check if there are any empty entries by summing up the total missing values
if empty_entries.sum() > 0:
    print("There are empty entries in the data")
else:
    print("There are no empty entries in the data")

There are no empty entries in the data


Histogram visualization for word count distribution after text preprocessing

In [None]:
import plotly.express as px

# Creating the plotly go object for histogram visualization
fig_word_count = px.histogram(
    df_processed_data,
    x=['main_word_count', 'highlight_word_count'], # setting main and highlights on x axis vs frequency on y
    labels={'value': 'Word Count', 'variable': 'Document Type'},
    nbins=30,  # setting the bins for histogram
    title="Word Count Distribution for Main Articles and Highlights"
)

# Customize the layout
fig_word_count.update_layout(
    xaxis_title="Word Count",
    yaxis_title="Frequency",
    barmode='overlay'  # plotting both main and highlights on same grid
)
fig_word_count.update_traces(opacity=0.75)  # opacity for overlay

# Display the histogram
fig_word_count.show()
