In [11]:
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk import pos_tag
from nltk.corpus import wordnet
import math

In [12]:
# Download the NLTK resources used by this notebook. Re-running is safe:
# packages that are already installed are reported as up-to-date and skipped.
nltk.download('stopwords')
nltk.download('punkt')
# Newer NLTK releases (the ones that use 'averaged_perceptron_tagger_eng')
# load the tokenizer data from 'punkt_tab' instead of 'punkt'; without it
# word_tokenize raises LookupError on a fresh environment. 'punkt' above is
# kept for older installations.
nltk.download('punkt_tab')
nltk.download('wordnet')
nltk.download('omw-1.4')
nltk.download('averaged_perceptron_tagger_eng')

# Shared by preprocess_text: the stopword list is stored as a set so that
# membership checks are constant-time, and one lemmatizer instance is reused.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/shamitha/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/shamitha/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/shamitha/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/shamitha/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /Users/shamitha/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


In [13]:
def get_wordnet_pos(word, pos):
    """Map a POS tag to the WordNet POS constant used for lemmatization.

    Only the first letter of the tag is inspected (case-insensitively):
    'J' -> adjective, 'N' -> noun, 'V' -> verb, 'R' -> adverb.

    Args:
        word: The token the tag belongs to. Not used by the mapping; kept so
            existing callers that pass (word, pos) keep working.
        pos: POS tag string such as 'NN' or 'VBD'. May be empty or None.

    Returns:
        The matching ``wordnet`` POS constant, or ``wordnet.NOUN`` when the
        tag is empty, None, or starts with an unmapped letter.
    """
    if not pos:
        # Nothing to inspect (pos[0] would raise); use the same noun default
        # that unmapped tags get below.
        return wordnet.NOUN
    tag_dict = {'J': wordnet.ADJ, 'N': wordnet.NOUN, 'V': wordnet.VERB, 'R': wordnet.ADV}
    return tag_dict.get(pos[0].upper(), wordnet.NOUN)

In [14]:
def preprocess_text(text):
    """
    Turn a raw text string into a list of cleaned, lemmatized tokens.

    Steps, in order:
    1. Lowercasing
    2. Tokenization
    3. POS tagging of the complete token sequence
    4. Removing stopwords and non-alphabetic tokens (punctuation, numbers)
    5. Lemmatization of the remaining tokens using their POS tags

    Tagging happens before filtering on purpose: the tagger is given the
    sentence as written, rather than a sequence with the stopwords and
    punctuation already stripped out, so the tags that drive lemmatization
    are based on the real context of each word.

    Args:
        text: The input string (e.g. one line of a corpus file).

    Returns:
        A list of lemmatized, lowercase, alphabetic, non-stopword tokens in
        their original order. Empty if nothing survives the filtering.
    """
    # Lowercase, then tokenize
    all_tokens = word_tokenize(text.lower())

    # Tag the full sequence so each tag reflects the word's real neighbours
    tagged_tokens = pos_tag(all_tokens)

    # Drop stopwords / punctuation / non-alphabetic tokens and lemmatize the rest
    return [
        lemmatizer.lemmatize(word, get_wordnet_pos(word, pos))
        for word, pos in tagged_tokens
        if word.isalpha() and word not in stop_words
    ]

In [15]:
# Build the training corpus: one token list per line of the training file
training = 'train.txt'
tokens = []

with open(training, 'r', encoding='utf-8') as train_file:
    # Each stripped line goes through preprocess_text and is then wrapped
    # in the sentence-boundary markers <s> ... </s>
    tokens.extend(
        ['<s>', *preprocess_text(raw_line.strip()), '</s>']
        for raw_line in train_file
    )

In [16]:
# Build the validation corpus: one token list per line of the validation file
validation = 'val.txt'
validation_tokens = []

with open(validation, 'r', encoding='utf-8') as val_file:
    # Each stripped line goes through preprocess_text and is then wrapped
    # in the sentence-boundary markers <s> ... </s>
    validation_tokens.extend(
        ['<s>', *preprocess_text(raw_line.strip()), '</s>']
        for raw_line in val_file
    )