In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import math
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

# Make sure the NLTK data packages used by this notebook are available locally.
for nltk_package in ('stopwords', 'punkt', 'omw-1.4', 'wordnet'):
    nltk.download(nltk_package)

# English stop words as a set, used for membership tests in preprocess_text.
stop_words = set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to /Users/xy/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /Users/xy/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/xy/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/xy/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# Make IPython display the result of every top-level expression in a cell,
# not only the last one.
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

In [3]:
# Folder holding the raw dataset files; passed to preprocess_data().
# NOTE: both folder names end with "/" and the code that uses them inserts
# another "/" when building file paths, so the resulting paths contain "//".
raw_data_folder = "dontpatronizeme_v1.4/"
# Folder the preprocessed CSV files are written to.
output_data_folder = "data/"

In [4]:
def preprocess_text(text):
    """Normalise a raw string into a space-separated sequence of lemmas.

    The steps run in this order: remove every character that is neither a
    word character nor whitespace, remove digit runs, collapse whitespace,
    lower-case, drop tokens found in the module-level ``stop_words`` set,
    and lemmatize the remaining tokens with ``WordNetLemmatizer``.

    Punctuation (apostrophes included) is removed *before* the stop-word
    lookup, so tokens are compared against ``stop_words`` without their
    apostrophes.

    Args:
        text: The string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Punctuation first (this already covers single quotes), then digits.
    without_punctuation = re.sub(r'[^\w\s]', '', text)
    without_digits = re.sub(r'\d+', '', without_punctuation)
    # Collapse every whitespace run (spaces, tabs, new lines) into one space.
    normalised = re.sub(r'\s+', ' ', without_digits).lower()

    lemmatizer = WordNetLemmatizer()
    kept_tokens = (token for token in normalised.split() if token not in stop_words)
    return ' '.join(lemmatizer.lemmatize(token) for token in kept_tokens)


def load_and_preprocess_data(path, col_names, skip_rows=4, encoding="utf-8"):
    """Read a tab-separated text file into a DataFrame of string cells.

    Each line is stripped of surrounding whitespace and split on tabs. The
    resulting rows are loaded into a DataFrame, the leading ``skip_rows``
    rows are discarded, blank cells become NaN, and every row whose "text"
    cell is missing is dropped. The number of lines read is printed.

    Args:
        path: Path of the file to read.
        col_names: Column names for the resulting frame; must contain
            "text", which is used to filter out rows without content.
        skip_rows: Number of leading rows to discard. Defaults to 4, the
            number of non-data rows the original notebook skipped.
        encoding: Text encoding used to open the file. Passed explicitly so
            the result does not depend on the platform's default encoding.

    Returns:
        A DataFrame with columns ``col_names``, a fresh 0..n-1 index, and
        no missing values in the "text" column. Values are not converted to
        numeric types here.
    """
    # Read the file once; the context manager closes it before parsing.
    with open(path, encoding=encoding) as data:
        original_data = data.readlines()

    print('The original data contains ', len(original_data), ' lines.')

    # strip() removes the trailing newline (and any trailing tabs), so a
    # row may have fewer fields than col_names; pandas pads those with None.
    lines = [line.strip().split('\t') for line in original_data]
    df = pd.DataFrame(lines, columns=col_names)

    # Discard the leading non-data rows.
    df = df.iloc[skip_rows:].reset_index(drop=True)

    # Treat empty / whitespace-only cells as missing values.
    df = df.replace(r'^\s*$', np.nan, regex=True)

    # Keep only rows that actually have text.
    df = df.dropna(subset=["text"]).reset_index(drop=True)

    return df

def get_ids(path):
    """Collect the leading integer id from every data line of a file.

    The first line is treated as a header and skipped. For each remaining
    non-blank line, the text before the first tab and then before the first
    comma is parsed as an integer.

    Args:
        path: Path of the file to read.

    Returns:
        A list of ints in file order; empty if the file has no data lines.

    Raises:
        ValueError: If a non-blank data line does not start with an integer.
    """
    ids = []
    with open(path) as f:
        # Skip the header; the default keeps an empty file from raising.
        next(f, None)
        for line in f:
            stripped = line.strip()
            if not stripped:
                # Blank lines (e.g. a trailing empty line) carry no id.
                continue
            first_field = stripped.split('\t')[0]
            ids.append(int(first_field.split(',')[0]))
    return ids

def preprocess_data(data_folder, train_dev_frac=0.2, random_state=42):
    """Load the dataset, derive features, and split it into subsets.

    Reads ``dontpatronizeme_pcl.tsv`` from ``data_folder``, converts
    ``label`` and ``par_id`` to integers, adds a binary ``class`` column
    (1 where ``label`` is greater than 1, otherwise 0) and a
    ``preprocessed_text`` column produced by ``preprocess_text``. Rows are
    then assigned to train / dev according to the paragraph ids listed in
    ``train_semeval_parids-labels.txt`` and ``dev_semeval_parids-labels.txt``,
    and the train part is further split at random. The dtypes, shape and
    per-column missing-value counts of the full frame are printed.

    Args:
        data_folder: Folder containing the three input files.
        train_dev_frac: Fraction of the train rows sampled into the held-out
            train-dev subset. Defaults to 0.2.
        random_state: Seed for the random sample. Defaults to 42.

    Returns:
        A tuple ``(train_train_df, train_dev_df, dev_df, pcl_df)``.
        ``train_train_df`` and ``dev_df`` have a fresh 0..n-1 index;
        ``train_dev_df`` keeps the index labels it had inside the train frame.
    """
    pcl_cols = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
    pcl_df = load_and_preprocess_data(f"{data_folder}/dontpatronizeme_pcl.tsv", pcl_cols)

    # Binary target: labels above 1 map to 1, everything else to 0.
    pcl_df['label'] = pcl_df['label'].astype(int)
    pcl_df["class"] = (pcl_df["label"] > 1).astype("int64")

    # Cleaned version of the text column.
    pcl_df["preprocessed_text"] = pcl_df['text'].apply(preprocess_text)

    # Paragraph ids of the provided train / dev split (header line skipped).
    train_ids = get_ids(f"{data_folder}/train_semeval_parids-labels.txt")
    dev_ids = get_ids(f"{data_folder}/dev_semeval_parids-labels.txt")

    # The ids are parsed as ints, so compare them against an int column.
    pcl_df['par_id'] = pcl_df['par_id'].astype(int)

    # Select rows with boolean masks rather than passing index labels to the
    # positional .iloc, which is only correct for a fresh RangeIndex.
    train_df = pcl_df[pcl_df['par_id'].isin(train_ids)].reset_index(drop=True)
    dev_df = pcl_df[pcl_df['par_id'].isin(dev_ids)].reset_index(drop=True)

    # Hold out a random part of train; the remainder becomes train_train.
    train_dev_df = train_df.sample(frac=train_dev_frac, random_state=random_state)
    train_train_df = train_df.drop(train_dev_df.index).reset_index(drop=True)

    print(pcl_df.dtypes)
    print(pcl_df.shape)
    print(pcl_df.isna().sum())

    return train_train_df, train_dev_df, dev_df, pcl_df

In [5]:
# Run the whole pipeline on the raw data folder and keep the four resulting frames.
train_train_df, train_dev_df, dev_df, pcl_df = preprocess_data(raw_data_folder)

The original data contains  10473  lines.
par_id                int64
art_id               object
keyword              object
country_code         object
text                 object
label                 int64
class                 int64
preprocessed_text    object
dtype: object
(10468, 8)
par_id               0
art_id               0
keyword              0
country_code         0
text                 0
label                0
class                0
preprocessed_text    0
dtype: int64


### Save preprocessed data

In [6]:
# Write each frame to its own CSV file; the DataFrame index is not saved.
csv_targets = [
    (train_train_df, f'{output_data_folder}/pcl_df_train_train_preprocessed.csv'),
    (train_dev_df, f'{output_data_folder}/pcl_df_train_dev_preprocessed.csv'),
    (dev_df, f'{output_data_folder}/pcl_df_dev_preprocessed.csv'),
    (pcl_df, f'{output_data_folder}/pcl_df_preprocessed.csv'),
]
for frame, csv_path in csv_targets:
    frame.to_csv(csv_path, index=False)