In [1]:
import os
import string
from typing import List, Set

import pandas as pd

import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

DATA_DIR = os.path.join('data', '.')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [2]:
def read_data(path: str) -> pd.DataFrame:
    df = pd.read_csv(path, names=['label', 'title', 'text'])

    # Add title to the text
    df['text'] = df['title'] + '\n' + df['text']

    # Drop title as it's not gonna be used
    df = df.drop('title', axis=1)

    # Initially labels start from 1, many models work only when labels start from 0
    df['label'] = df['label'] - 1

    return df


train_df = read_data(os.path.join(DATA_DIR, 'train.csv'))
test_df = read_data(os.path.join(DATA_DIR, 'test.csv'))

In [3]:
train_df['text'] = train_df['text'].apply(word_tokenize)
test_df['text'] = test_df['text'].apply(word_tokenize)

In [4]:
removal_set = set(
    stopwords.words('english') +
    list(string.punctuation)
)

def removal_function(items: List[str], removal_set: Set[str] = removal_set) -> List[str]:
    return [item for item in items if item not in removal_set]

In [5]:
train_df['text'] = train_df['text'].apply(removal_function)
test_df['text'] = test_df['text'].apply(removal_function)

In [6]:
lemmatizer = nltk.stem.WordNetLemmatizer()

def lemmatization(items: List[str], lemmatizer: nltk.stem.WordNetLemmatizer = lemmatizer) -> List[str]:
    return [lemmatizer.lemmatize(item) for item in items]

train_df['text'] = train_df['text'].apply(lemmatization)
test_df['text'] = test_df['text'].apply(lemmatization)

In [7]:
# Some examples
train_df.sample(10)['text'].values

array([list(['IRS', 'seek', 'people', 'claim', 'refund', 'WASHINGTON', '--', 'The', 'IRS', '73', 'million', 'tax', 'refund', 'thousand', 'taxpayer', 'whose', 'check', 'returned', 'tax', 'agency', 'undeliverable']),
       list(['White', 'Sox', 'Sign', 'Hermanson', 'Two-Year', 'Deal', 'CHICAGO', 'Sports', 'Network', 'The', 'Chicago', 'White', 'Sox', 'come', 'term', 'two-year', '5.5', 'million', 'contract', 'pitcher', 'Dustin', 'Hermanson']),
       list(['Dollar', 'fall', '103.75', 'yen', 'remark', 'Snow', 'The', 'dollar', 'slipped', 'close', 'lowest', 'level', 'year', 'Thursday', 'Tokyo', 'entering', '103', 'yen', 'territory', 'market', 'player', 'dumped', 'currency', 'belief', 'United', 'States', 'tacitly', 'approves', 'weakening']),
       list(['Israel', 'accused', 'Syria', 'blast', 'Syria', 'blamed', 'Israel', 'car', 'bomb', 'exploded', 'capital', 'Damascus', 'wounding', 'three', 'people', 'The', 'bomb', 'went', 'soon', 'Palestinian', 'man', 'got', 'car', 'wife', 'daughter']),
    