In [4]:
import re
import nltk
import csv
import pandas as pd
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from transformers import BertTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

#### Libraries and Modules:

1. **re**: Regular expression library for pattern matching and manipulation.
2. **nltk**: Natural Language Toolkit, a library for natural language processing.
3. **csv**: Provides functionality for reading from and writing to CSV files.
4. **pandas**: Data manipulation and analysis library.
5. **wordnet**: Part of NLTK, a lexical database of English.
6. **WordNetLemmatizer**: NLTK's lemmatizer for word lemmatization.
7. **BertTokenizer**: Part of the Hugging Face `transformers` library, used for tokenization with BERT models.
8. **word_tokenize**: NLTK's word tokenizer for breaking text into words.
9. **stopwords**: NLTK's list of common stop words.

In [5]:
# Load each discipline's papers, label every row with its category, and keep
# ONLY the two columns used downstream ('category' first, then 'Abstract').
# Any other column present in the source files is dropped here.
physics_df = (
    pd.read_csv('physics_papers.csv')
    .assign(category='Physics')
    [['category', 'Abstract']]
)
medicine_df = (
    pd.read_csv('medicine_papers.csv')
    .assign(category='Medicine')
    [['category', 'Abstract']]
)
cybersecurity_df = (
    pd.read_csv('cybersecurity_papers.csv')
    .assign(category='Cybersecurity')
    [['category', 'Abstract']]
)

# Stack the three frames on top of each other with a fresh 0..n-1 index.
combined_df = pd.concat(
    [physics_df, medicine_df, cybersecurity_df],
    ignore_index=True,
)

# Persist the merged data (without the index column) for the cleaning step.
combined_df.to_csv('input_file.csv', index=False)


#### Cell Explanation:

#### Reading CSV Files:

- **physics_df**, **medicine_df**, **cybersecurity_df**: Three pandas DataFrames created by reading data from CSV files ('physics_papers.csv', 'medicine_papers.csv', 'cybersecurity_papers.csv').

#### Adding a New Column 'category':

- A new column named 'category' is added to each DataFrame ('Physics' for physics_df, 'Medicine' for medicine_df, 'Cybersecurity' for cybersecurity_df).

#### Reordering Columns:

- Each DataFrame is reduced to just the 'category' and 'Abstract' columns, in that order; any other columns from the source files are dropped.

#### Concatenating DataFrames:

- The three DataFrames are concatenated into a single DataFrame, **combined_df**, using `pd.concat()`. The `ignore_index=True` argument resets the index of the resulting DataFrame.

#### Saving Combined DataFrame:

- The combined DataFrame is saved to a new CSV file named 'input_file.csv' using `to_csv()` with `index=False` to exclude the index column.

In [7]:
# Fetch the NLTK resources the preprocessing functions rely on:
# stop-word list, 'punkt' tokenizer data, WordNet, and the POS tagger model.
for nltk_resource in ('stopwords', 'punkt', 'wordnet', 'averaged_perceptron_tagger'):
    nltk.download(nltk_resource)

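"""
    Preprocessing applied to each abstract (see clean_text), in order:
    - Replacing LaTeX expressions with a placeholder so they survive cleaning.
    - Removing special characters and digits to focus on words.
    - Lowercasing all characters for uniformity.
    - Tokenizing the text into individual words and lemmatizing each word
      using its part-of-speech tag.
    - Removing common words or stop words that don't add much meaning.
    - Restoring the original LaTeX expressions in place of the placeholders.
"""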

def get_pos_tag(word):
    """Return the WordNet POS constant to use when lemmatizing ``word``.

    The word is tagged on its own (a one-element list is passed to
    ``nltk.pos_tag``), and the upper-cased first letter of the resulting tag
    selects the constant: J -> adjective, N -> noun, V -> verb, R -> adverb.
    Any other initial falls back to ``wordnet.NOUN``.
    """
    tag_label = nltk.pos_tag([word])[0][1]
    initial = tag_label[0].upper()

    pos_by_initial = dict(
        J=wordnet.ADJ,
        N=wordnet.NOUN,
        V=wordnet.VERB,
        R=wordnet.ADV,
    )

    try:
        return pos_by_initial[initial]
    except KeyError:
        # Unmapped tag families are treated as nouns.
        return wordnet.NOUN

def lemmatize_text(text):
    """Lemmatize every token in ``text`` and return them space-joined.

    The text is split with ``word_tokenize``; each token is lemmatized by a
    ``WordNetLemmatizer`` using the POS constant from ``get_pos_tag``.
    """
    tokens = word_tokenize(text)
    wordnet_lemmatizer = WordNetLemmatizer()

    lemmas = []
    for token in tokens:
        # The POS hint is looked up per token, independently of its neighbours.
        lemmas.append(wordnet_lemmatizer.lemmatize(token, get_pos_tag(token)))

    return ' '.join(lemmas)


def remove_special_characters_and_numbers(text):
    """Delete every character that is not an ASCII letter or whitespace.

    Removed characters are dropped outright (not replaced by a space), so
    "state-of-the-art" becomes "stateoftheart".
    """
    return re.sub(r'[^a-zA-Z\s]', '', text)


def lowercase_text(text):
    """Return ``text`` with all cased characters converted to lowercase."""
    lowered = text.lower()
    return lowered

def remove_stopwords(text):
    """Drop English stop words from ``text`` and return the rest space-joined.

    Tokens come from ``word_tokenize``; each is compared case-insensitively
    against NLTK's English stop-word list, and surviving tokens keep their
    original casing.
    """
    english_stopwords = frozenset(stopwords.words('english'))

    kept_tokens = []
    for token in word_tokenize(text):
        if token.lower() in english_stopwords:
            continue
        kept_tokens.append(token)

    return ' '.join(kept_tokens)

# Marker substituted for each LaTeX span while the surrounding prose is
# cleaned. It is lowercase letters only, so the special-character filter and
# the lowercasing step leave it intact.
_LATEX_PLACEHOLDER = 'latexplaceholder'

# LaTeX spans to protect. The negative lookbehind must sit BEFORE the opening
# '$' so that an escaped dollar sign ("\$5") is not mistaken for the start of
# inline math.
_LATEX_PATTERN = re.compile(
    r'(\\[(\[{])[\s\S]+?\\[)\]}]'   # \( ... \), \[ ... \], \{ ... \}
    r'|(?<!\\)\$\$[\s\S]+?\$\$'     # $$ ... $$ (display math)
    r'|(?<!\\)\$[\s\S]+?\$'         # $ ... $   (inline math)
)


def clean_text(text):
    """Clean prose in ``text`` while preserving LaTeX expressions verbatim.

    Steps, in order:
      1. Replace every LaTeX span with a letters-only placeholder.
      2. Remove special characters and digits.
      3. Lowercase.
      4. Lemmatize.
      5. Remove English stop words.
      6. Put the original LaTeX spans back, in their original order.

    Args:
        text: Raw text (e.g. a paper abstract).

    Returns:
        The cleaned text with the original LaTeX expressions restored.
    """
    latex_expressions = []

    def replace_latex(match):
        # Remember the span so it can be restored after cleaning.
        latex_expressions.append(match.group())
        return _LATEX_PLACEHOLDER

    text_with_placeholders = _LATEX_PATTERN.sub(replace_latex, text)

    # Standard cleaning pipeline applied to the prose only.
    cleaned_text = remove_special_characters_and_numbers(text_with_placeholders)
    cleaned_text = lowercase_text(cleaned_text)
    cleaned_text = lemmatize_text(cleaned_text)
    cleaned_text = remove_stopwords(cleaned_text)

    # Restore the spans left-to-right in a single pass. A function replacement
    # is inserted literally (backslashes in the LaTeX are not interpreted) and
    # already-restored text is never rescanned. A placeholder with no saved
    # expression left is kept as-is.
    pending = iter(latex_expressions)
    return re.sub(
        _LATEX_PLACEHOLDER,
        lambda match: next(pending, match.group()),
        cleaned_text,
    )

def bert_tokenize(text, tokenizer):
    """Tokenize ``text`` by delegating to ``tokenizer.tokenize``.

    Returns whatever ``tokenizer.tokenize(text)`` returns.
    """
    return tokenizer.tokenize(text)

def clean_csv(input_file, output_file, has_header=True):
    """Clean and BERT-tokenize the second column of a CSV file.

    Each data row is expected to hold a label in column 0 and text in
    column 1. The text is passed through ``clean_text`` and then tokenized
    with the 'bert-base-uncased' tokenizer; ``(label, tokens)`` rows are
    written to ``output_file``.

    Args:
        input_file: Path of the CSV file to read (UTF-8).
        output_file: Path of the CSV file to write (UTF-8).
        has_header: When True (default), the first non-empty row is treated
            as a header and copied to the output unchanged instead of being
            cleaned and tokenized as if it were data.

    Returns:
        None. Errors are caught and reported with ``print`` rather than
        raised, so a failure does not interrupt the caller.
    """
    try:
        # Load the BERT tokenizer
        tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

        # Read everything up front so the input file is closed before the
        # slow cleaning/tokenization work. Blank lines come back from
        # csv.reader as empty lists and are skipped.
        with open(input_file, 'r', newline='', encoding='utf-8') as csvfile:
            rows = [row for row in csv.reader(csvfile) if row]

        header_rows = rows[:1] if has_header else []
        data_rows = rows[1:] if has_header else rows

        # Apply cleaning and tokenization
        bert_tokenized_data = [
            (row[0], bert_tokenize(clean_text(row[1]), tokenizer))
            for row in data_rows
        ]

        with open(output_file, 'w', newline='', encoding='utf-8') as csvfile:
            writer = csv.writer(csvfile)
            writer.writerows(header_rows)
            # NOTE: each token list is written as its Python string
            # representation (e.g. "['quantum', 'field']") in one CSV field.
            writer.writerows(bert_tokenized_data)

        print("File processed and saved successfully.")

    except Exception as e:
        print(f"An error occurred: {e}")

# Source CSV written by the merge cell, and the destination for the cleaned,
# BERT-tokenized rows.
input_file, output_file = 'input_file.csv', 'output_file.csv'

clean_csv(input_file, output_file)

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/baizid_alhamid/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/baizid_alhamid/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/baizid_alhamid/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/baizid_alhamid/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


File processed and saved successfully.


#### Cell Explanation:

#### Downloading Necessary NLTK Models:

- **nltk.download('stopwords')**: Downloads stopwords for later use.
- **nltk.download('punkt')**: Downloads data required for tokenization.
- **nltk.download('wordnet')**: Downloads WordNet, a lexical database for the English language.
- **nltk.download('averaged_perceptron_tagger')**: Downloads data for POS tagging.

#### Text Preprocessing Functions:

1. **get_pos_tag(word)**:
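   - Tags a single word with NLTK's POS tagger and maps the first letter of the tag to the WordNet POS constant that `lemmatize()` accepts (adjective, noun, verb, or adverb), defaulting to noun.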

2. **lemmatize_text(text)**:
   - Tokenizes the text into words.
   - Lemmatizes each word using its POS tag.
   - Joins lemmatized words back into a sentence.

3. **remove_special_characters_and_numbers(text)**:
   - Removes special characters and numbers, keeping only letters and whitespaces.

4. **lowercase_text(text)**:
   - Converts the text to lowercase for uniformity.

5. **remove_stopwords(text)**:
   - Removes common English stopwords from the text.

6. **clean_text(text)**:
   - Replaces LaTeX expressions with placeholders.
   - Performs standard text cleaning in this order: removing special characters and digits, lowercasing, lemmatization, removing stopwords.
   - Restores original LaTeX expressions from placeholders.

7. **bert_tokenize(text, tokenizer)**:
   - Tokenizes the text using the BERT tokenizer.

8. **clean_csv(input_file, output_file)**:
   - Loads the BERT tokenizer.
   - Reads a CSV file, applies cleaning and tokenization.
   - Writes the processed data to a new CSV file.

#### Script Execution:

- The script reads data from 'input_file.csv', cleans and tokenizes the abstracts using the defined functions, and saves the results to 'output_file.csv'.
- Any errors during execution are caught and printed.