In [2]:
import os

# Corpus data path
folder_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/Raw_data_corpus'

# List all files
files = os.listdir(folder_path)
print(f"\nTotal files: {len(files)}\n\nSample Raw File name display:")
for i in range(len(files)):
    print(f"File Name: {files[i]}")
    if i == 5:
        break # Displaying first 5 file names as sample



Total files: 4000

Sample Raw File name display:
File Name: 0000bf554ca24b0c72178403b54c0cca62d9faf8.story
File Name: 0000dfd9f52a470b9f29957686c2704b68cd0635.story
File Name: 000128cbd36642ced67ac90bd7d4d1dd5e8cf554.story
File Name: 0001d1afc246a7964130f43ae940af6bc6c57f01.story
File Name: 0001d4ce3598e37f20a47fe609736f72e5d73467.story
File Name: 0001dc22494415d03319a6833a00cd9c559f1395.story


Meta Data exploration of the raw files

In [11]:
from collections import Counter

file_stats = []

for file in files:
    file_path = os.path.join(folder_path, file) # create the file path
    with open(file_path, 'r', encoding='utf-8') as f: # reading the file
        text = f.read()
        words = text.split()
        word_count = len(words)
        unique_word_count = len(set(words))
        char_count = len(text)

        # Making the list of dictionary for th file meta data
        file_stats.append({
            'file_name': file,
            'word_count': word_count,
            'unique_word_count': unique_word_count,
            'char_count': char_count
        })

# Displaying the stats
import pandas as pd
file_stats_df = pd.DataFrame(file_stats)
print(file_stats_df.head())  # Preview first few entries

# Csv output path
output_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/file_stats_selected_raw.csv'  # Replace with desired path

# Save DataFrame to CSV
file_stats_df.to_csv(output_path, index=False)

                                        file_name  word_count  \
0  0000bf554ca24b0c72178403b54c0cca62d9faf8.story         838   
1  0000dfd9f52a470b9f29957686c2704b68cd0635.story         996   
2  000128cbd36642ced67ac90bd7d4d1dd5e8cf554.story        1499   
3  0001d1afc246a7964130f43ae940af6bc6c57f01.story        1586   
4  0001d4ce3598e37f20a47fe609736f72e5d73467.story         949   

   unique_word_count  char_count  
0                438        5228  
1                499        6006  
2                648        9262  
3                710        9781  
4                432        5515  


EDA: word count and unique word distribution

In [8]:
import os
import pandas as pd

# Data path
folder_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/Raw_data_corpus'

# Initialize variables to store unique words and total word count
unique_words = set()
total_words = 0

# Iterate over each file to update total word count and unique words
for file in os.listdir(folder_path):
    file_path = os.path.join(folder_path, file)
    with open(file_path, 'r', encoding='utf-8') as f:
        text = f.read()
        words = text.split()  # Split text into words
        total_words += len(words)  # Add to total word count
        unique_words.update(words)  # Add words to the unique word set

# Calculate total unique words
total_unique_words = len(unique_words)

# Create a DataFrame to display stats in a table format
stats_df = pd.DataFrame({
    "Statistic": ["Total Words", "Total Unique Words"],
    "Count": [total_words, total_unique_words]
})

# Print the table
print(stats_df)



            Statistic    Count
0         Total Words  4560643
1  Total Unique Words   225706


In [19]:
# List to store file names matching the criteria
files_with_highlight = []
files_without_highlight = []

# Define the highlight pattern you are looking for
highlight_pattern = "@highlight"

# Check each file
for filename in os.listdir(folder_path):
    if filename.endswith('.story'):
        filepath = os.path.join(folder_path, filename)
        
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read()
            
            # Check if '@highlight' followed by the summary exists
            if highlight_pattern in content:
                # Further check if there's any content after @highlight
                highlight_section = content.split(highlight_pattern)[-1].strip()
                
                # Check if there’s text after @highlight
                if highlight_section:
                    files_with_highlight.append(filename)
                else:
                    files_without_highlight.append(filename)
            else:
                files_without_highlight.append(filename)

# Summary of files
print(f"Files with '@highlight' section: {len(files_with_highlight)}")
print(f"Files without '@highlight' section: {len(files_without_highlight)}")


Files with '@highlight' section: 4000
Files without '@highlight' section: 0


Imports for sqlite, nltk and spacy

In [1]:
import sqlite3
import nltk
import spacy

# Download stop words and load the spaCy language model
nltk.download('stopwords')
nlp = spacy.load("en_core_web_sm")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

Data Preprocessing custom functions:

Actions performed

1. Process the data section 
2. Process the highlights

In [9]:
# Data Preprocessing

import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('punkt')
nltk.download('punkt_tab')

# retreiving the stop words
stop_words = set(stopwords.words('english'))

def preprocess_main_text(text):
    # Remove special characters
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Lowercase the text
    text = text.lower()
    # Tokenize and remove stop words
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words]
    # Lemmatization using spaCy
    tokens = [token.lemma_ for token in nlp(" ".join(tokens))]
    # Joining the data back
    clean_text = ' '.join(tokens).strip()
    return clean_text

def preprocess_highlight_text(text):
    # Remove special characters
    text = re.sub(r'[^A-Za-z0-9\s]', '', text)
    # Remove additional spaces
    text = re.sub(r'\s+', ' ', text).strip()
    return text

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\8897p\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


In [10]:
import os
import sqlite3

# Data corpus
folder_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/Raw_data_corpus'

# Initializing storage lists
cleaned_data = []
highlight_paragraphs = []
data_word_counts = []

# SQLite setup and table creation
conn = sqlite3.connect('preprocessed_documents.db')
cursor = conn.cursor()
cursor.execute('''
    CREATE TABLE IF NOT EXISTS documents (
        id INTEGER PRIMARY KEY,
        filename TEXT,
        cleaned_data TEXT,
        highlight_paragraph TEXT,
        data_word_count INTEGER
    )
''')

# Looping through each file to process the data
for filename in os.listdir(folder_path):
    if filename.endswith('.story'):
        filepath = os.path.join(folder_path, filename) # each file path
        with open(filepath, 'r', encoding='utf-8') as file:
            content = file.read() # file raw text

            # Split content at the first occurrence of '@highlight'
            parts = content.split('@highlight', 1)
            main_text = parts[0]
            highlights = parts[1] if len(parts) > 1 else ""

            # Preprocess body text section
            cleaned_main_text = preprocess_main_text(main_text)
            data_word_count = len(cleaned_main_text.split())

            # Extract and preprocess each highlights
            highlight_points = highlights.split('@highlight')
            processed_highlights = [preprocess_highlight_text(point) for point in highlight_points if point.strip()]
            highlight_paragraph = ' '.join(processed_highlights)

            # Append results for insertion into SQLite
            cleaned_data.append(cleaned_main_text)
            highlight_paragraphs.append(highlight_paragraph)
            data_word_counts.append(data_word_count)

            # Insert the data into SQLite database
            cursor.execute('''
                INSERT INTO documents (filename, cleaned_data, highlight_paragraph, data_word_count)
                VALUES (?, ?, ?, ?)
            ''', (filename, cleaned_main_text, highlight_paragraph, data_word_count))

# Commit and close database connection
conn.commit()
conn.close()


The below sections will deal with data vaildation and visualizations on clean text

In [16]:
# panda import
import pandas as pd

# data base file path
db_path = 'C:/Users/8897p/OneDrive/Desktop/NLP/Project/AXR/preprocessed_documents.db'

# Connect to the database
conn = sqlite3.connect(db_path)

# Load data from the SQLite database into a Pandas DataFrame
# Replace 'documents' with your table name
query = "SELECT * FROM documents"
df = pd.read_sql_query(query, conn)

# Close the connection after loading the data
conn.close()

# Display the DataFrame
print(df.head())




   id                                        filename  \
0   1  0000bf554ca24b0c72178403b54c0cca62d9faf8.story   
1   2  0000dfd9f52a470b9f29957686c2704b68cd0635.story   
2   3  000128cbd36642ced67ac90bd7d4d1dd5e8cf554.story   
3   4  0001d1afc246a7964130f43ae940af6bc6c57f01.story   
4   5  0001d4ce3598e37f20a47fe609736f72e5d73467.story   

                                        cleaned_data  \
0  alex ward city trader con million pound wealth...   
1  helen pow publish 0916 est 29 may 2013 update ...   
2  japanese official today say ready provide anti...   
3  official us president barack obama want lawmak...   
4  parent nineyearold girl accidentally kill shoo...   

                                 highlight_paragraph  data_word_count  
0  Nicholas Levene must pay the nominal sum becau...              474  
1  Bella RodriguezTorres was diagnosed with stage...              526  
2  Japanese chief cabinet secretary Yoshihide Sug...              787  
3  Syrian official Obama climbed

In [22]:
# Check for any missing (NaN or None) values in each column
empty_entries = df.isnull().sum()

# Check if there are any empty entries by summing up the total missing values
if empty_entries.sum() > 0:
    print("There are empty entries in the data")
else:
    print("There are no empty entries in the data")

There are no empty entries in the data


Raw Data word count vs the each doc line visualization

In [24]:
import plotly.express as px


# defining the plot
fig = px.line(
    df, 
    x='id', 
    y='data_word_count', 
    title='Document Word Count by Document ID',
    labels={
        'id': 'Document ID',
        'data_word_count': 'Word Count'
    }
)

# Layout customizations
fig.update_layout(
    title_font=dict(size=24, family='Arial', color='darkblue'),
    xaxis_title='Document ID',
    yaxis_title='Word Count',
    legend_title='Legend',
    legend=dict(yanchor="top", y=0.99, xanchor="left", x=0.01),
    plot_bgcolor='rgba(240, 240, 240, 0.9)'
)

# line style and markers
fig.update_traces(
    line=dict(color='royalblue', width=2),
    marker=dict(size=5, color='darkblue', line=dict(width=1, color='royalblue')),
    name='Word Count'
)

# plot the figure
fig.show()


In [30]:
# Grouping the documents into ranges by id
df['doc_id_range'] = pd.cut(df['id'], bins=5, labels=['0-20%', '20-40%', '40-60%', '60-80%', '80-100%'])

fig = px.box(
    df, 
    x='doc_id_range', 
    y='data_word_count', 
    title='Word Count Distribution by Document ID Range',
    labels={'doc_id_range': 'Document ID Range', 'data_word_count': 'Word Count'},
)

fig.update_layout(
    xaxis_title='Document ID Range',
    yaxis_title='Word Count',
    title_font=dict(size=24, family='Arial', color='darkred'),
    plot_bgcolor='rgba(2600, 220, 140, 0.8)'
)

fig.show()
