In [None]:
import pandas as pd

# This script loads the data from the CSV files generated by the querying the journal database (Web of Science).

# The CSV files are saved in the Data directory.
directory = './Data'

# Weird encoding issue with the first query. Might have saved it using a Mac filesystem.
# The rest of the queries are saved using a Windows filesystem.
combined_df = pd.concat([
    pd.read_csv(f'{directory}/Query1.csv', sep=',', encoding='utf-8-sig', low_memory=False).rename(columns=lambda x: x.strip()),
    pd.read_csv(f'{directory}/Query2.csv', sep=',', encoding='ISO-8859-1', low_memory=False).rename(columns=lambda x: x.strip()),
    pd.read_csv(f'{directory}/Query3.csv', sep=',', encoding='ISO-8859-1', low_memory=False).rename(columns=lambda x: x.strip()),
    pd.read_csv(f'{directory}/Query4.csv', sep=',', encoding='ISO-8859-1', low_memory=False).rename(columns=lambda x: x.strip()),
    pd.read_csv(f'{directory}/Query5.csv', sep=',', encoding='ISO-8859-1', low_memory=False).rename(columns=lambda x: x.strip())
], ignore_index=True)

# Defines the columns to keep from the combined dataframe.
columns = ['Authors', 'Publication Year', 'Article Title', 'Abstract']

# Selects the columns from the combined dataframe.
clean_df = combined_df[columns]

# Shows the dataframe and confirms that the data is loaded and the correct columns are selected.
clean_df

Unnamed: 0,Authors,Publication Year,Article Title,Abstract
0,"Daios, A; Kladovasilakis, N; Kelemis, A; Kosta...",2025,AI Applications in Supply Chain Management: A ...,The advent of Industry 4.0 and the integration...
1,"El Haoud, N; Bachiri, Z",2019,Stochastic Artificial Intelligence benefits an...,Supply chain management (SCM) includes several...
2,"Wamba, SF; Queiroz, MM; Guthrie, C; Braganza, A",2022,Industry experiences of artificial intelligenc...,This editorial aims to present the papers acce...
3,"Hangl, J; Behrens, VJ; Krause, S",2022,"Barriers, Drivers, and Social Considerations f...",Background: The number of publications in supp...
4,"Shrivastav, M",2022,Barriers Related to AI Implementation in Suppl...,The primary objective of this paper is to offe...
...,...,...,...,...
995,"Santos, CR; Azevedo, G; Marques, RP",2024,A Guide to Identifying Artificial Intelligence...,Artificial Intelligence (AI) is a topic that h...
996,"Qasim, A; El Refae, GA; Issa, H; Eletter, S",2021,The Impact of Drone Technology on The Accounti...,The accounting profession has gone through rad...
997,"Kane, S; Moody, V; Harradon, M",2021,Towards Incorporating AI into the Mission Plan...,While there are numerous powerful tools to sup...
998,"Young, A; Tan, KV; Tariq, F; Fin, MX; Blueston...",2024,Rogue AI: Cautionary Cases in Neuroradiology a...,"Introduction In recent years, artificial intel..."


In [None]:
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import re

# Download required NLTK resources
nltk.download('stopwords')
nltk.download('punkt')

# Define stop words
stop_words = set(stopwords.words('english'))

# Function to clean abstract text
def clean_text(text):
    if not isinstance(text, str):
        return ''  # handle NaN or other non-string entries

    # Tokenize the text
    words = word_tokenize(text.lower())

    # Remove stopwords and non-alphanumeric characters
    filtered_words = [
        re.sub(r"(?<!\d)\.(?!\d)|[^a-zA-Z0-9'.]", '', word)
        for word in words if word.lower() not in stop_words
    ]

    # Remove empty strings from list (could result from punctuation-only words)
    filtered_words = [word for word in filtered_words if word]

    return ' '.join(filtered_words)

# Example: assuming your DataFrame is called `clean_df`
clean_df = clean_df.copy()
clean_df['Cleaned Abstract'] = clean_df['Abstract'].apply(clean_text)


# Count total number of words across all cleaned abstracts
total_words = clean_df['Cleaned Abstract'].apply(lambda x: len(x.split())).sum()

print(f"Total number of words in all cleaned abstracts: {total_words}")



[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\etien\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\etien\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


Total number of words in all cleaned abstracts: 119920


In [67]:
partitions = []
words_per_partition = 100

# Iterate through each row in the dataframe
for _, row in clean_df.iterrows():
    cleaned_text = row['Cleaned Abstract']
    if not cleaned_text:
        continue

    words = cleaned_text.split()

    for i in range(0, len(words), words_per_partition):
        partition_text = ' '.join(words[i:i+words_per_partition])
        partitions.append({
            'Title': row['Article Title'],
            'Year': row['Publication Year'],
            'Authors': row['Authors'],
            'Partitioned Abstract': partition_text
        })

# Create the new DataFrame
partition_df = pd.DataFrame(partitions)

# Show the resulting DataFrame
partition_df.head()

# Save the partitioned DataFrame to a CSV file
partition_df.to_csv(f'{directory}/Partitioned_Abstracts.csv', index=False, encoding='utf-8-sig')

