# Data Import

In [8]:
import pandas as pd 

train_df = pd.read_csv('archive/train.txt' ,names=['text', 'sentiment'], delimiter=';')
test_df = pd.read_csv('archive/test.txt' ,names=['text',  'sentiment'], delimiter=';')
validate_df = pd.read_csv('archive/val.txt' ,names=['text' , 'sentiment'], delimiter=';')

In [9]:
dataset = pd.concat([train_df, test_df, validate_df], axis=0)
dataset

Unnamed: 0,text,sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


In [10]:
dataset.shape

(20000, 2)

# preprocessing data

In [11]:
import text_hammer as th
from tqdm import tqdm_notebook

def text_preprocessing(df, col_name): 
    column = col_name
    df[column] = df[column].progress_apply(lambda x:str(x).lower())
    df[column] = df[column].progress_apply(lambda x:th.cont_exp(x))
    df[column] = df[column].progress_apply(lambda x:th.remove_emails(x))
    df[column] = df[column].progress_apply(lambda x:th.remove_html_tags(x))
    df[column] = df[column].progress_apply(lambda x:th.remove_special_chars(x))
    df[column] = df[column].progress_apply(lambda x:th.remove_accented_chars(x))
    
    return df

text_preprocessing(dataset, 'text')

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

  0%|          | 0/20000 [00:00<?, ?it/s]

Unnamed: 0,text,sentiment
0,i didnt feel humiliated,sadness
1,i can go from feeling so hopeless to so damned...,sadness
2,im grabbing a minute to post i feel greedy wrong,anger
3,i am ever feeling nostalgic about the fireplac...,love
4,i am feeling grouchy,anger
...,...,...
1995,im having ssa examination tomorrow in the morn...,sadness
1996,i constantly worry about their fight against n...,joy
1997,i feel its important to share this info for th...,joy
1998,i truly feel that if you are passionate enough...,joy


In [12]:
def return_mx(text): 
    return len(text.split())

dataset['word_len'] =  dataset.text.progress_apply(lambda x: return_mx(x))

  0%|          | 0/20000 [00:00<?, ?it/s]

In [13]:
dataset.word_len.max()

66

In [14]:
dataset['sentiment'] = dataset.sentiment.astype('category')

In [15]:
sent_code = {'anger':0 , 'fear':1, 'joy':2, 'love':3, 'sadness':4, 'surprise':5}
sent_code

{'anger': 0, 'fear': 1, 'joy': 2, 'love': 3, 'sadness': 4, 'surprise': 5}

In [16]:
dataset.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 20000 entries, 0 to 1999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype   
---  ------     --------------  -----   
 0   text       20000 non-null  object  
 1   sentiment  20000 non-null  category
 2   word_len   20000 non-null  int64   
dtypes: category(1), int64(1), object(1)
memory usage: 488.5+ KB


In [17]:
dataset['sentiment'] = dataset.sentiment.cat.codes

In [18]:
from tensorflow.keras.utils import to_categorical

y_input = to_categorical(dataset.sentiment)

In [19]:
y_input

array([[0., 0., 0., 0., 1., 0.],
       [0., 0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0., 0.],
       ...,
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.],
       [0., 0., 1., 0., 0., 0.]], dtype=float32)

In [20]:
total =0 
for row in dataset.text.values:
    total += len(row.split())

print('total words is :', total)

total words is : 383741


# further deep learning processing

In [21]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn import model_selection

In [22]:
dataset

Unnamed: 0,text,sentiment,word_len
0,i didnt feel humiliated,4,4
1,i can go from feeling so hopeless to so damned...,4,21
2,im grabbing a minute to post i feel greedy wrong,0,10
3,i am ever feeling nostalgic about the fireplac...,3,18
4,i am feeling grouchy,0,4
...,...,...,...
1995,im having ssa examination tomorrow in the morn...,4,34
1996,i constantly worry about their fight against n...,2,29
1997,i feel its important to share this info for th...,2,15
1998,i truly feel that if you are passionate enough...,2,19


In [23]:
# splitting dataset 

X_train, X_test, y_train, y_test  = \
model_selection.train_test_split(dataset['text'] , dataset.sentiment , test_size=0.1 , stratify=dataset.sentiment)

In [24]:
token = Tokenizer(num_words=30000 , oov_token='<OOV>')
token.fit_on_texts(X_train)
XtrainSeq = token.texts_to_sequences(X_train)
XtestSeq = token.texts_to_sequences(X_test)

In [25]:
pad_len = 70
train_pad = pad_sequences(XtrainSeq, maxlen=pad_len, padding='post')
test_pad = pad_sequences(XtestSeq, maxlen=pad_len,  padding='post')

In [26]:
train_pad

array([[   2,   55,   23, ...,    0,    0,    0],
       [   2,    3,  103, ...,    0,    0,    0],
       [   2, 1126,    5, ...,    0,    0,    0],
       ...,
       [   8,   15,   47, ...,    0,    0,    0],
       [   2,    3,   14, ...,    0,    0,    0],
       [   2,   72,    2, ...,    0,    0,    0]])

In [97]:
vocab_size = 30000
embedding_dim = 100
pad_length =70

lstm_model = tf.keras.Sequential([
    
    tf.keras.layers.Embedding(vocab_size, embedding_dim, input_length=pad_length),
#   tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(128, dropout=0.2, recurrent_dropout=0.5)),
    tf.keras.layers.LSTM(100 , activation='relu'),
    tf.keras.layers.Dense(100, activation='relu'),
    tf.keras.layers.Dense(6, activation='sigmoid')
    
])


lstm_model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'] )
lstm_model.summary()

Model: "sequential_9"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_9 (Embedding)     (None, 70, 100)           3000000   
                                                                 
 lstm_9 (LSTM)               (None, 100)               80400     
                                                                 
 dense_18 (Dense)            (None, 100)               10100     
                                                                 
 dense_19 (Dense)            (None, 6)                 606       
                                                                 
Total params: 3,091,106
Trainable params: 3,091,106
Non-trainable params: 0
_________________________________________________________________


In [98]:
lstm_model.fit(train_pad, y_train , epochs=6, validation_data=(test_pad, y_test), verbose=1)

Epoch 1/6
Epoch 2/6

KeyboardInterrupt: 