In [53]:
import sys
sys.path.insert(0, '../utils')

import pandas as pd
import text_utils as txt_u
from sklearn.model_selection import train_test_split

In [54]:
# Load the raw Sentiment140 CSV from the project's data directory.
raw_csv_path = '../data/sentiment140.csv'
sentiment140 = pd.read_csv(raw_csv_path)

In [55]:
# Preview the first five rows of the raw data.
sentiment140.head(n=5)

Unnamed: 0,id,target,text
0,1467810369,0,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,1467810672,0,is upset that he can't update his Facebook by ...
2,1467810917,0,@Kenichan I dived many times for the ball. Man...
3,1467811184,0,my whole body feels itchy and like its on fire
4,1467811193,0,"@nationwideclass no, it's not behaving at all...."


### Sample

In [56]:
# Draw a reproducible 20,000-row subset (fixed seed), then renumber rows from 0.
shuffled_subset = sentiment140.sample(n=20000, random_state=10)
sample = shuffled_subset.reset_index(drop=True)

In [57]:
# Preview the first five rows of the sampled subset.
sample.head(n=5)

Unnamed: 0,id,target,text
0,2250819430,0,"@MrDAprano, honnestly I'm not going to miss th..."
1,2068515884,0,Tired after spending a restless night with Tie...
2,1991871464,1,@bradgallaway Yup popiscle Not sure if you've...
3,2070090087,1,@candydiaz LOL save a lil arse for me candy!
4,1795457800,1,@StephanieFizer No problem!! Your stuff is so ...


### Text processing

In [58]:
# Start the cleaned-text column from the raw text, passing every entry
# through the project's mention-removal helper.
mention_free = sample['text'].apply(txt_u.remove_mentions)
sample['processed_text'] = mention_free

In [59]:
# Pass every entry of the cleaned-text column through the project's
# URL-removal helper and store the result back in place.
url_free = sample['processed_text'].apply(txt_u.remove_urls)
sample['processed_text'] = url_free

In [60]:
# In this particular dataset the emojis have already been removed, so this step is skipped.
# sample['processed_text'] = sample['processed_text'].apply(txt_u.remove_emojis)

In [61]:
# Pass every entry of the cleaned-text column through the project's
# lemmatization helper and store the result back in place.
lemmatized = sample['processed_text'].apply(txt_u.lemmatization)
sample['processed_text'] = lemmatized

In [62]:
# Final cleaning pass: run every entry through the project's
# symbol-removal helper and store the result back in place.
symbol_free = sample['processed_text'].apply(txt_u.remove_symbols)
sample['processed_text'] = symbol_free

In [63]:
# Compare the raw `text` column with the cleaned `processed_text` column.
sample.head(n=5)

Unnamed: 0,id,target,text,processed_text
0,2250819430,0,"@MrDAprano, honnestly I'm not going to miss th...",honnestly go miss janitor seriously lose trac...
1,2068515884,0,Tired after spending a restless night with Tie...,tired spend restless night tiernan doctor t...
2,1991871464,1,@bradgallaway Yup popiscle Not sure if you've...,yup popiscle sure get strawberry split st...
3,2070090087,1,@candydiaz LOL save a lil arse for me candy!,lol save lil arse candy
4,1795457800,1,@StephanieFizer No problem!! Your stuff is so ...,problem stuff cute wait mini card make


#### Train/Test split

In [64]:
# Hold out 20% of the sampled rows for testing; the fixed seed keeps the
# partition reproducible across runs.
train, test = train_test_split(
    sample,
    test_size=0.2,
    random_state=10,
)

#### Vectorization

In [65]:
# Fit the tokenizer on the training split only. Fitting it on the whole
# sample would let held-out test text influence the preprocessing step
# (data leakage) and make the test evaluation optimistic.
# NOTE(review): assumes txt_u.vectorization tolerates tokens that were not
# seen when the tokenizer was fitted — confirm against text_utils.
tokenizer = txt_u.train_tokenizer(train['processed_text'].values)

In [66]:
# Vectorize the cleaned text of each split with the same tokenizer
# (train first, then test).
train_vec, test_vec = [
    txt_u.vectorization(tokenizer, split['processed_text'].values)
    for split in (train, test)
]

In [67]:
# Use the wider of the two matrices (second dimension) as the common length,
# then pad both splits to that length so they share the same width.
vector_length = max(vec.shape[1] for vec in (train_vec, test_vec))

train_vec, test_vec = [
    txt_u.zero_pad(vec, vector_length) for vec in (train_vec, test_vec)
]

In [68]:
# Sanity check: show the shape of each padded matrix, one per line.
print(train_vec.shape, test_vec.shape, sep='\n')

(16000, 20)
(4000, 20)


In [69]:
# Attach each row's padded vector as a Python list in a `feat_vector` column.
# `train` and `test` come from train_test_split, and assigning a column
# directly on them can raise pandas' SettingWithCopyWarning. `assign` returns
# a new frame, so rebinding the names avoids writing into a possible
# view of `sample`.
train = train.assign(feat_vector=train_vec.tolist())
test = test.assign(feat_vector=test_vec.tolist())

In [70]:
# Persist both splits as CSV files, omitting the DataFrame index.
for split_frame, csv_path in ((train, '../data/train.csv'), (test, '../data/test.csv')):
    split_frame.to_csv(csv_path, index=False)