In [1]:
import torch
from torchtext import data
from torchtext.vocab import Vectors
import spacy
import pandas as pd 
import numpy as np 
from sklearn.metrics import accuracy_score
import re
import string

In [2]:
from text_preprocess import preprocess

In [2]:
def clean_text(text):
    """Reduce a raw string to lowercase ASCII letters, digits and spaces.

    The steps run in this order:
      1. delete characters in the emoji / pictograph code-point ranges,
      2. delete ``http(s)://`` and ``www.`` links,
      3. turn ``#`` and ``@`` into a single space each,
      4. delete every remaining character outside ``[A-Za-z0-9 ]``,
      5. lowercase the result.

    Whitespace is never collapsed, so removed links or symbols can leave
    consecutive spaces behind.
    """
    # Step 1: strip emoji and other pictographic code points.
    without_emoji = re.sub("["
                           u"\U0001F600-\U0001F64F"  # emoticons
                           u"\U0001F300-\U0001F5FF"  # symbols & pictographs
                           u"\U0001F680-\U0001F6FF"  # transport & map symbols
                           u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           u"\U00002702-\U000027B0"
                           u"\U000024C2-\U0001F251"
                           "]+", r'', text, flags=re.UNICODE)

    # Step 2: strip links before punctuation is removed, while "://" and "."
    # are still present for the pattern to match on.
    without_links = re.sub(r'https?://\S+|www\.\S+', r'', without_emoji)

    # Step 3: hashtags and mentions keep their word; the marker becomes a space.
    spaced = without_links.replace('#', ' ').replace('@', ' ')

    # Steps 4 and 5: keep only ASCII alphanumerics and spaces, then lowercase.
    return re.sub(r'[^A-Za-z0-9 ]', r'', spaced).lower()

In [5]:
class Dataset(object):
    """Builds torchtext iterators for a CSV text-classification data set.

    ``config`` must expose ``max_sen_len`` (fixed token length of every
    example) and ``batch_size``; both are read in :meth:`load_data`.

    After :meth:`load_data` the instance holds ``train_iterator``,
    ``val_iterator``, ``test_iterator``, ``vocab`` and ``word_embeddings``.
    """

    def __init__(self, config):
        """Store ``config`` and initialise every loaded attribute to empty."""
        self.config = config
        self.train_iterator = None
        self.val_iterator = None
        self.test_iterator = None
        self.vocab = {}
        self.word_embeddings = {}

    def parse_label(self, label):
        """Convert one raw label value to ``int``."""
        return int(label)

    def get_pandas_df(self, PATH, mode='train'):
        """Read the CSV at ``PATH`` into a frame ready for torchtext.

        Args:
            PATH: path of a CSV file with a ``text`` column and, for the
                ``'train'`` / ``'val'`` modes, a ``target`` column.
            mode: ``'train'`` or ``'val'`` returns columns ``text`` and
                ``label``; any other value returns only ``text``.

        Returns:
            A new ``pandas.DataFrame``.
        """
        df = pd.read_csv(PATH)
        data_text = df.text.tolist()
        if mode in ['train', 'val']:
            labels = df.target.tolist()
            # Apply parse_label to each value. The previous code wrote
            # ``map(self.parse_label(), labels)``, which called the method
            # with no argument and raised TypeError.
            data_label = [self.parse_label(label) for label in labels]
            new_df = pd.DataFrame({"text": data_text, "label": data_label})
        else:
            new_df = pd.DataFrame({"text": data_text})
        return new_df

    def load_data(self, w2v_file, train_file, test_file, val_file=None):
        """Load the data files, build the vocabulary and create the iterators.

        Args:
            w2v_file: word-vector file handed to ``torchtext.vocab.Vectors``.
            train_file: CSV with ``text`` and ``target`` columns.
            test_file: CSV with a ``text`` column.
            val_file: optional CSV with ``text`` and ``target`` columns. When
                it is omitted the training data is split with
                ``split_ratio=0.8`` to obtain a validation set.
        """
        # NOTE(review): the 'en' shortcut appears to be accepted only by older
        # spaCy releases - confirm against the installed version.
        NLP = spacy.load('en')
        # Clean the raw string first, then let spaCy split it into tokens.
        tokenizer = lambda sent: [x.text for x in NLP.tokenizer(clean_text(sent))]

        TEXT = data.Field(sequential=True, tokenize=tokenizer, lower=True,
                          fix_length=self.config.max_sen_len)
        # Labels are already integers, so they need no vocabulary.
        LABEL = data.Field(sequential=False, use_vocab=False)
        train_datafields = [('text', TEXT), ('label', LABEL)]
        test_datafields = [('text', TEXT)]

        # Load train and test data into torchtext.data.Dataset objects.
        train_df = self.get_pandas_df(train_file, mode='train')
        train_examples = [data.Example.fromlist(i, train_datafields)
                          for i in train_df.values.tolist()]
        train_data = data.Dataset(train_examples, train_datafields)

        test_df = self.get_pandas_df(test_file, mode='test')
        test_examples = [data.Example.fromlist(i, test_datafields)
                         for i in test_df.values.tolist()]
        test_data = data.Dataset(test_examples, test_datafields)

        # If a validation file is given, load it the same way as the training
        # file; otherwise split the training data. The condition used to be
        # inverted (``if not val_file``), which read ``None`` as a CSV path
        # and ignored any validation file that was supplied.
        if val_file:
            val_df = self.get_pandas_df(val_file, mode='val')
            val_examples = [data.Example.fromlist(i, train_datafields)
                            for i in val_df.values.tolist()]
            val_data = data.Dataset(val_examples, train_datafields)
        else:
            train_data, val_data = train_data.split(split_ratio=0.8)

        # The vocabulary is built from the training portion only.
        TEXT.build_vocab(train_data, vectors=Vectors(w2v_file))
        self.word_embeddings = TEXT.vocab.vectors
        self.vocab = TEXT.vocab

        # Training batches are shuffled; validation and test batches are not.
        self.train_iterator = data.BucketIterator(
            train_data,
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=True,
        )
        self.val_iterator, self.test_iterator = data.BucketIterator.splits(
            (val_data, test_data),
            batch_size=self.config.batch_size,
            sort_key=lambda x: len(x.text),
            repeat=False,
            shuffle=False,
        )
        print("Loaded {} training examples".format(len(train_data)))
        print("Loaded {} val examples".format(len(val_data)))
        print("Loaded {} test examples".format(len(test_data)))

In [6]:
def evaluate(model, iterator):
    """Return the accuracy of ``model`` over every batch of ``iterator``.

    Args:
        model: callable that maps a batch's ``text`` tensor to an output
            whose dimension 1 holds the per-class scores.
        iterator: iterable of batches exposing ``text`` and ``label`` tensors.

    Returns:
        The value of ``sklearn.metrics.accuracy_score`` computed from the
        collected labels and predictions.

    The model's train/eval mode is not touched here; callers that use
    dropout or batch-norm should switch it with ``model.eval()`` themselves.
    """
    # Checked once: the answer cannot change between batches.
    use_cuda = torch.cuda.is_available()
    total_preds = []
    total = []
    # Inference only, so skip autograd bookkeeping for the forward passes.
    with torch.no_grad():
        for batch in iterator:
            x = batch.text.cuda() if use_cuda else batch.text
            output = model(x)
            # max(1) returns (values, indices); the indices are the
            # predicted classes.
            total_preds += output.cpu().max(1)[1].tolist()
            total += batch.label.cpu().tolist()
    return accuracy_score(total, total_preds)


In [4]:
# Read the training CSV and keep its `text` column as a plain Python list.
train_df = pd.read_csv('./data/train.csv')
train_text = train_df['text'].tolist()

In [7]:
import re
# Run every training string through clean_text.
train_text = [clean_text(raw_text) for raw_text in train_text]

In [9]:
def tokenizer(sent):
    """Return the items of ``sent`` as a list in reverse order.

    For a string this gives one token per character, last character first.
    """
    reversed_sent = sent[::-1]
    return list(reversed_sent)

In [10]:
# Replace each cleaned training string with its token list.
train_text = [tokenizer(cleaned) for cleaned in train_text]

In [12]:
# Longest and shortest token list in the training set.
# NOTE(review): a notebook cell only echoes its last expression, so the max
# computed on the next line is discarded and only the min is displayed.
max(map(len,train_text))
min(map(len,train_text))

6

In [13]:
# Apply the same load -> clean -> tokenize steps to the test set, then print
# the longest and shortest token-list lengths.
test_df = pd.read_csv('./data/test.csv')
test_text = [tokenizer(clean_text(raw_text)) for raw_text in test_df['text'].tolist()]
print(max(len(tokens) for tokens in test_text), min(len(tokens) for tokens in test_text))

144 5
