Dataset Cleaning

In [1]:
# Import the necessary libraries

import spacy
import spacy.cli
import pandas as pd
import string
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from contractions import fix
from nltk.corpus import wordnet

In [2]:
# Download necessary resources
# 'punkt_tab' is the tokenizer data that word_tokenize / sent_tokenize load on
# newer NLTK releases; 'punkt' is still fetched so older releases keep working.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

[nltk_data] Downloading package stopwords to C:\Users\Benjamin
[nltk_data]     Kam\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\Benjamin
[nltk_data]     Kam\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\Benjamin
[nltk_data]     Kam\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Load the SpaCy English pipeline, installing it on first use if it is missing
def _load_spacy_model():
    """Return the 'en_core_web_sm' pipeline, downloading it when spaCy cannot find it."""
    try:
        return spacy.load("en_core_web_sm")
    except OSError:
        # spacy.load raised OSError, so fetch the model and try once more
        print("Downloading 'en_core_web_sm' model...")
        spacy.cli.download("en_core_web_sm")
        return spacy.load("en_core_web_sm")


nlp = _load_spacy_model()

In [4]:
# NLTK's English stop-word list, held as a set for fast membership checks
stop_words = {*stopwords.words('english')}

In [5]:
# Lookup table: lowercase abbreviation -> expanded form
abbreviation_map = dict(
    isd="internal security department",
    ltd="limited",
    ml="machine learning",
)

In [6]:
# Corpus-specific filler words that are dropped in addition to the generic stop words
domain_stopwords = set(["report", "file", "document", "record"])

In [7]:
# Function to clean and preprocess text
def preprocess_text(text):
    """Clean a raw text string for the downstream NLP steps.

    Pipeline, in order:
      1. expand contractions (``contractions.fix``)
      2. lowercase
      3. expand abbreviations listed in ``abbreviation_map``
      4. strip ASCII punctuation
      5. rewrite euro amounts ("€4278" -> "4278 euros")
      6. drop standalone numbers, except those directly followed by "euros"
      7. drop words found in ``stop_words`` or ``domain_stopwords``

    Args:
        text: Raw text. Non-string values are returned unchanged.

    Returns:
        The cleaned text with words joined by single spaces, or ``text``
        itself when it is not a string.
    """
    if not isinstance(text, str):
        return text  # Return as is for non-string inputs

    # Expand contractions
    text = fix(text)
    # Convert to lowercase
    text = text.lower()
    # Normalize abbreviations. Surrounding punctuation is ignored for the
    # lookup so that "ltd." or "(isd)" are expanded too; the punctuation
    # itself would be removed by the next step anyway.
    text = " ".join(
        abbreviation_map.get(word.strip(string.punctuation), word)
        for word in text.split()
    )
    # Remove punctuation ("€" is not ASCII punctuation, so it survives this step)
    text = text.translate(str.maketrans('', '', string.punctuation))
    # Normalize currency (e.g., "€4278" -> "4278 euros"). This has to happen
    # BEFORE number removal: otherwise the digits are deleted first, a bare
    # "€" is left behind and this rule can never match.
    text = re.sub(r'€(\d+)', r'\1 euros', text)
    # Remove standalone numbers, but keep amounts that are followed by "euros"
    text = re.sub(r'\b\d+\b(?!\s+euros\b)', '', text)
    # Remove stop words (split/join also collapses the gaps left by removed numbers)
    text = " ".join(
        word
        for word in text.split()
        if word not in stop_words and word not in domain_stopwords
    )
    return text

In [8]:
# Function to lemmatize text
def lemmatize_text(text):
    """Replace every token of ``text`` with its spaCy lemma.

    Returns the lemmas joined by single spaces; anything that is not a
    string is handed back untouched.
    """
    if not isinstance(text, str):
        return text
    lemmas = (token.lemma_ for token in nlp(text))
    return " ".join(lemmas)

In [9]:
# Function to tokenize text into words
def tokenize_text(text):
    """Split ``text`` into word tokens with NLTK; non-string input yields an empty list."""
    return word_tokenize(text) if isinstance(text, str) else []

In [10]:
# Function to segment text into sentences
def segment_sentences(text):
    """Break ``text`` into a list of sentences using NLTK's sentence tokenizer.

    Anything other than a string produces an empty list.
    """
    if not isinstance(text, str):
        return []
    return sent_tokenize(text)

In [11]:
# Function to extract dependency relations
def extract_dependency_relations(text):
    """Parse ``text`` with spaCy and list one (token, dependency label, head token) tuple per token.

    Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []
    relations = []
    for token in nlp(text):
        relations.append((token.text, token.dep_, token.head.text))
    return relations

Preprocessing datasets

In [12]:
# Read the source spreadsheet into a DataFrame
# (point file_path at your own copy if it lives somewhere else)
file_path = 'wikileaks_parsed.xlsx'
df = pd.read_excel(io=file_path)

In [13]:
# Apply preprocessing steps
df['Cleaned_Text'] = df['Text'].apply(preprocess_text)
df['Lemmatized_Text'] = df['Cleaned_Text'].apply(lemmatize_text)
df['Tokenized_Text'] = df['Lemmatized_Text'].apply(tokenize_text)
# Sentence boundaries have to be found on the raw text: preprocess_text strips
# all ASCII punctuation, so running sent_tokenize on Cleaned_Text leaves it
# nothing to split on. Segment first, then clean each sentence individually
# (sentences that become empty after cleaning are dropped).
df['Cleaned_Sentences'] = df['Text'].apply(segment_sentences).apply(
    lambda sentences: [cleaned for cleaned in map(preprocess_text, sentences) if cleaned]
)
# NOTE(review): the dependency parse below still runs on the lowercased,
# stop-word-free text, which spaCy's parser is presumably not tuned for --
# consider parsing df['Text'] instead if parse quality matters.
df['Dependency_Relations'] = df['Cleaned_Text'].apply(extract_dependency_relations)

In [14]:
# Persist the preprocessed frame to an Excel workbook (row index omitted)
output_file_path = 'preprocessed_text.xlsx'
df.to_excel(excel_writer=output_file_path, index=False)

In [15]:
# Function to extract named entities
def extract_entities(text):
    """Run spaCy NER on ``text`` and return a list of (entity text, entity label) tuples.

    Non-string input yields an empty list.
    """
    if not isinstance(text, str):
        return []
    return [(span.text, span.label_) for span in nlp(text).ents]

Entity extraction

In [16]:
# Run Named Entity Recognition (NER) on the cleaned text of every row
df['Entities'] = (
    df['Cleaned_Text']
    .apply(extract_entities)
)

In [17]:
# Simplify entities for readability
def simplify_entities(entities):
    """Render a list of (text, label) entity tuples as one readable string.

    Args:
        entities: A list of tuples whose first two items are the entity text
            and its label, e.g. ``[("Paris", "GPE")]``.

    Returns:
        ``"text (label); text (label); ..."`` for a non-empty list of tuples,
        otherwise the placeholder ``"No entities found"`` (empty list,
        non-list input, or a list containing non-tuple items).
    """
    # An empty list must hit the placeholder too: all() of an empty iterable
    # is True, so without this check the result would be an empty string.
    if not isinstance(entities, list) or not entities:
        return "No entities found"
    if not all(isinstance(e, tuple) for e in entities):
        return "No entities found"
    return "; ".join(f"{ent[0]} ({ent[1]})" for ent in entities)

# Add a human-readable rendering of each row's entity list
df['Simplified_Entities'] = (
    df['Entities']
    .apply(simplify_entities)
)

In [18]:
# Write the frame, now including the entity columns, to its own workbook (row index omitted)
entities_output_path = 'entities.xlsx'
df.to_excel(excel_writer=entities_output_path, index=False)

Relationship extraction

In [19]:
import pandas as pd
import scipy
import spacy
import tqdm
import networkx as nx
import stanza
import matplotlib.pyplot as plt
from transformers import pipeline
from stanza.server import CoreNLPClient
# Install the Stanford CoreNLP distribution via stanza.
# NOTE(review): this runs every time the cell is executed -- confirm how
# stanza behaves when CoreNLP is already installed, and consider guarding it.
stanza.install_corenlp()

  from .autonotebook import tqdm as notebook_tqdm


In [20]:
# Read the entity workbook back into a DataFrame
# (point file_path at your own copy if it lives somewhere else)
file_path = "entities.xlsx"
df = pd.read_excel(io=file_path)

In [21]:
# SpaCy English pipeline used for basic linguistic parsing
nlp = spacy.load(name="en_core_web_sm")

In [22]:
# Configure a single CoreNLP client (OpenIE annotator only) that is reused for
# every document rather than being recreated per call
corenlp_client = CoreNLPClient(
    annotators=['openie'],
    timeout=30000,
    memory='4G',
    be_quiet=False,
    endpoint='http://localhost:9000',
)

2025-01-31 03:56:14 INFO: Writing properties to tmp file: corenlp_server-0983ed8c9cd84bac.props


In [23]:
# Function to extract relationships using Stanford CoreNLP
def extract_relationships_stanford(text):
    """Extract OpenIE triples from ``text`` with the shared CoreNLP client.

    Args:
        text: Text to annotate. Non-string or blank values produce an empty
            list without contacting the server.

    Returns:
        A list of dicts with keys ``"entity1"``, ``"relation"`` and
        ``"entity2"`` (subject, relation and object of each OpenIE triple).
        Returns an empty list if annotation fails; the error is printed.
    """
    # Same guard as the other extractors in this notebook. Cells that are
    # empty in the spreadsheet are typically read back as NaN (a float), and
    # sending those to the server would only yield one error per row.
    if not isinstance(text, str) or not text.strip():
        return []
    try:
        ann = corenlp_client.annotate(text)
        return [
            {
                "entity1": triple.subject,
                "relation": triple.relation,
                "entity2": triple.object,
            }
            for sentence in ann.sentence
            for triple in sentence.openieTriple
        ]
    except Exception as e:
        # Deliberately best-effort: one failing document must not abort the whole run
        print(f"Error in Stanford CoreNLP extraction: {e}")
        return []

In [24]:
# Extract OpenIE relationships with Stanford CoreNLP and store them in their own column
try:
    df['Stanford_Relationships'] = df['Cleaned_Text'].apply(extract_relationships_stanford)
finally:
    # Always shut the CoreNLP client down, even when the run is interrupted or
    # fails, so no background Java server is left running
    corenlp_client.stop()

2025-01-31 03:56:16 INFO: Starting server with command: java -Xmx4G -cp C:\Users\Benjamin Kam\stanza_corenlp\* edu.stanford.nlp.pipeline.StanfordCoreNLPServer -port 9000 -timeout 30000 -threads 5 -maxCharLength 100000 -quiet False -serverProperties corenlp_server-0983ed8c9cd84bac.props -annotators openie -preload -outputFormat serialized


In [25]:
# Write the frame, including the extracted relationships column, to a new workbook (row index omitted)
df.to_excel(excel_writer="relationships_extracted_separated.xlsx", index=False)

In [None]:
# import sys
# print(sys.executable)

In [None]:
# try:
#     import scipy
# except ModuleNotFoundError:
#     print("The module 'scipy' is not installed.")
#     %pip install scipy
#     # You can include additional instruction here, such as installing the module.
# else:
#     # Code to run if the module  is successfully imported
#     print("Module 'scipy' is installed.")
