In [6]:
# Load data
import pandas as pd

col_names = ['sentiment','id','date','query_string','user','text']
data_path = '/content/drive/My Drive/Colab Notebooks/training.1600000.processed.noemoticon.csv'

# Fixed seed: the train/test/dev files written later are positional slices of
# this shuffled frame, so an unseeded shuffle would give a different split
# (and different scores) on every run.
shuffle_seed = 42

# The CSV has no header row, so the column names are supplied explicitly.
# usecols limits parsing to the two columns that are actually kept, which
# avoids materialising the four unused columns for every row.
tweet_data = pd.read_csv(
    data_path,
    header=None,
    names=col_names,
    usecols=['sentiment', 'text'],
    encoding="ISO-8859-1",
).sample(frac=1, random_state=shuffle_seed)  # .sample(frac=1) shuffles the data
tweet_data = tweet_data[['sentiment', 'text']]  # Pin the column order to (sentiment, text)
print(tweet_data.head())

         sentiment                                               text
1049857          4                                           with kv 
583013           0    Hey ya'll! Wats up? Haven't been on im a while 
512083           0    Yesterday... I was tired. Today... I am bored. 
1098537          4                                        sunbathin' 
554878           0  @MTVindia Can you provide some of them here......


In [0]:
# Preprocess function
import re

# Characters that survive preprocessing; anything else (including the
# apostrophe and all non-ASCII characters) is dropped.
allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
# Punctuation that gets padded with a space on both sides.
punct = '!?,.@#'
# Upper bound on the length of the returned string.
maxlen = 280

def preprocess(text):
    """Normalise a tweet for the classifier.

    Steps, in order:
      1. every run matching ``http\\S+`` is replaced by the literal ``http``;
      2. characters not listed in ``allowed_chars`` are removed;
      3. each character in ``punct`` is wrapped in single spaces;
      4. the result is cut to at most ``maxlen`` characters.

    Args:
        text: raw tweet text (str).

    Returns:
        The cleaned string, never longer than ``maxlen``.
    """
    without_urls = re.sub(r'http\S+', 'http', text, flags=re.MULTILINE)

    pieces = []
    for ch in without_urls:
        if ch not in allowed_chars:
            continue
        if ch in punct:
            pieces.append(' ' + ch + ' ')
        else:
            pieces.append(ch)

    # Truncation happens after padding, so the limit applies to the padded text.
    return ''.join(pieces)[:maxlen]

In [0]:
# Run preprocess() over every tweet and overwrite the text column with the result
tweet_data['text'] = tweet_data['text'].map(preprocess)

In [9]:
# Put __label__ in front of each sentiment.
# Values that already carry the prefix are left alone, so re-running this cell
# is a no-op instead of producing '__label____label__4'.
tweet_data['sentiment'] = tweet_data['sentiment'].astype(str).map(
    lambda label: label if label.startswith('__label__') else '__label__' + label
)

NumExpr defaulting to 2 threads.


In [0]:
# Save data
import os

# Make sure the output directory is there before anything is written into it
data_dir = './processed-data'
os.makedirs(data_dir, exist_ok=True)

# Fraction of the (already shuffled) rows that is written out at all
amount = 0.125

# Slice boundaries inside the kept fraction: first 80% -> train, next 10% -> test, last 10% -> dev
n_rows = len(tweet_data)
train_end = int(n_rows*0.8*amount)
test_end = int(n_rows*0.9*amount)
dev_end = int(n_rows*1.0*amount)

# Each split is written tab-separated with no index and no header row
for file_name, start, stop in [
    ('/train.csv', 0, train_end),
    ('/test.csv', train_end, test_end),
    ('/dev.csv', test_end, dev_end),
]:
    tweet_data.iloc[start:stop].to_csv(data_dir + file_name, sep='\t', index=False, header=False)

In [11]:
# Memory management: the splits are on disk now, so drop the frame and
# ask the garbage collector to reclaim it before the corpus is loaded
import gc

del tweet_data
gc.collect()

39

In [12]:
# Load the data into Corpus format
from flair.datasets import ClassificationCorpus
from pathlib import Path

# NLPTaskDataFetcher.load_classification_corpus is deprecated in Flair in
# favour of flair.datasets; ClassificationCorpus reads the same
# '__label__<x> <text>' files from the same folder.
# The file names are resolved relative to the data folder.
corpus = ClassificationCorpus(
    Path(data_dir),
    test_file='test.csv',
    dev_file='dev.csv',
    train_file='train.csv',
)

'pattern' package not found; tag filters are not available for English
PyTorch version 1.4.0 available.
TensorFlow version 2.2.0-rc2 available.


2020-04-05 16:50:24,827 Reading data from processed-data
2020-04-05 16:50:24,827 Train: processed-data/train.csv
2020-04-05 16:50:24,831 Dev: processed-data/dev.csv
2020-04-05 16:50:24,832 Test: processed-data/test.csv


  after removing the cwd from sys.path.
  train_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  test_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc
  dev_file, tokenizer=tokenizer, max_tokens_per_doc=max_tokens_per_doc


In [13]:
# Build the label dictionary from the corpus; it is handed to TextClassifier
# as label_dictionary when the model is created further down.
label_dict = corpus.make_label_dictionary()

2020-04-05 16:52:16,668 Computing label dictionary. Progress:


100%|██████████| 160000/160000 [00:00<00:00, 326305.79it/s]

2020-04-05 16:52:17,195 [b'4', b'0']





In [14]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings

# Only the GloVe word vectors are active. To experiment with contextual string
# embeddings as well, append FlairEmbeddings('news-forward') and
# FlairEmbeddings('news-backward') to this list.
word_embeddings = [WordEmbeddings('glove')]

2020-04-05 16:54:41,759 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmpgg2487zc


100%|██████████| 160000128/160000128 [00:09<00:00, 17258887.88B/s]

2020-04-05 16:54:51,712 copying /tmp/tmpgg2487zc to cache at /root/.flair/embeddings/glove.gensim.vectors.npy





2020-04-05 16:54:52,309 removing temp file /tmp/tmpgg2487zc
2020-04-05 16:54:54,124 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmp0gh_ufhz


100%|██████████| 21494764/21494764 [00:02<00:00, 10072804.01B/s]

2020-04-05 16:54:56,913 copying /tmp/tmp0gh_ufhz to cache at /root/.flair/embeddings/glove.gensim
2020-04-05 16:54:56,935 removing temp file /tmp/tmp0gh_ufhz



loading Word2VecKeyedVectors object from /root/.flair/embeddings/glove.gensim
  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL
loading vectors from /root/.flair/embeddings/glove.gensim.vectors.npy with mmap=None
setting ignored attribute vectors_norm to None
loaded /root/.flair/embeddings/glove.gensim


In [0]:
# Document-level embedding: an RNN over the word embeddings. Words are first
# re-projected to 256 dimensions and the RNN state has 512 units.
from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=512,
    reproject_words=True,
    reproject_words_dimension=256,
)

In [0]:
# Text classifier on top of the document embeddings, with one output per
# entry in the label dictionary
from flair.models import TextClassifier

classifier = TextClassifier(
    document_embeddings,
    label_dictionary=label_dict,
)

In [0]:
# Create the trainer that ties the classifier to the corpus
# (train / dev / test splits loaded from the processed-data folder)
from flair.trainers import ModelTrainer

trainer = ModelTrainer(classifier, corpus)

In [32]:
# Train the model. Output goes under 'model-saves'; the prediction cell below
# loads 'model-saves/final-model.pt' from there.
# NOTE(review): patience=8 is larger than max_epochs=5, so the learning-rate
# annealing (anneal_factor=0.5) presumably never kicks in during this run —
# confirm against the Flair ModelTrainer documentation.
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=5)

2020-04-05 17:11:25,926 ----------------------------------------------------------------------------------------------------
2020-04-05 17:11:25,931 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): GRU(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
  (beta): 1.0
  (weights): None
  (weight_tensor) None
)"
2020-04-05 17:11:25,934 ----------------------------------------------------------------------------------------------------
2020-04-05 17:11:25,936 Corpus: "Corpus: 160000 train + 20000 dev + 20000 test sentences"
2020-04-05 17:11:25,937 ----------------------------------------------------------------------------------------------------
2020-04-05 1

{'dev_loss_history': [],
 'dev_score_history': [],
 'test_score': 0.5799,
 'train_loss_history': []}

In [33]:
# Reload the trained model from disk and classify two hand-written examples.
# The examples go through the same preprocess() as the training tweets.
from flair.data import Sentence

classifier = TextClassifier.load('model-saves/final-model.pt')

# Expected-positive example
pos_sentence = Sentence(preprocess('I love Python!'))
classifier.predict(pos_sentence)

# Expected-negative example
neg_sentence = Sentence(preprocess('Python is the worst!'))
classifier.predict(neg_sentence)

# predict() attaches the predicted label (with its score) to each sentence
print(pos_sentence.labels, neg_sentence.labels)

2020-04-05 17:11:55,622 loading file model-saves/final-model.pt
[4 (0.7186697721481323)] [4 (0.5976102352142334)]
