In [1]:
# Load data
import pandas as pd

col_names = ['sentiment','id','date','query_string','user','text']
data_path = '/home/cate/Projects/sentiment140/training.1600000.processed.noemoticon.csv'

tweet_data = pd.read_csv(data_path, header=None, names=col_names, encoding="ISO-8859-1").sample(frac=1) # .sample(frac=1) shuffles the data
tweet_data = tweet_data[['sentiment', 'text']] # Disregard other columns
print(tweet_data.head())

         sentiment                                               text
549848           0  damn... i got a final in the AM and i have to ...
1193216          4             Eight demos set for the week, so far. 
292840           0           Doing homework right now... Super bored 
1222387          4  B Back when this storm passes over... shouldn'...
1163382          4  Feedly: Better Support for 1024 Displays: Narr...


In [2]:
import re

allowed_chars = ' AaBbCcDdEeFfGgHhIiJjKkLlMmNnOoPpQqRrSsTtUuVvWwXxYyZz0123456789~`!@#$%^&*()-=_+[]{}|;:",./<>?'
punct = '!?,.@#'
maxlen = 280

def preprocess(text):
    return ''.join([' ' + char + ' ' if char in punct else char for char in [char for char in re.sub(r'http\S+', 'http', text, flags=re.MULTILINE) if char in allowed_chars]])[:maxlen]

In [3]:
tweet_data['text'] = tweet_data['text'].apply(preprocess)

In [4]:
tweet_data['sentiment'] = '__label__' + tweet_data['sentiment'].astype(str)

In [5]:
tweet_data.head()

Unnamed: 0,sentiment,text
549848,__label__0,damn . . . i got a final in the AM and i ha...
1193216,__label__4,"Eight demos set for the week , so far ."
292840,__label__0,Doing homework right now . . . Super bored
1222387,__label__4,B Back when this storm passes over . . . sh...
1163382,__label__4,Feedly: Better Support for 1024 Displays: Narr...


In [6]:
import os

# Create directory for saving data if it does not already exist
data_dir = '/home/cate/Projects/Twitter_sentiment_analysis_app/processed-data'
if not os.path.isdir(data_dir):
    os.mkdir(data_dir)

# Save a percentage of the data (you could also only load a fraction of the data instead)
amount = 0.05

tweet_data.iloc[0:int(len(tweet_data)*0.8*amount)].to_csv(data_dir + '/train.csv', sep='\t', index=False, header=False)
tweet_data.iloc[int(len(tweet_data)*0.8*amount):int(len(tweet_data)*0.9*amount)].to_csv(data_dir + '/test.csv', sep='\t', index=False, header=False)
tweet_data.iloc[int(len(tweet_data)*0.9*amount):int(len(tweet_data)*1.0*amount)].to_csv(data_dir + '/dev.csv', sep='\t', index=False, header=False)

In [8]:
from flair.data import Corpus
from flair.datasets import ClassificationCorpus

# this is the folder in which train, test and dev files reside
data_folder = data_dir

# load corpus containing training, test and dev data
corpus: Corpus = ClassificationCorpus(data_folder,
                                      test_file='test.txt',
                                      dev_file='dev.txt',
                                      train_file='train.txt')

2020-01-16 20:34:27,760 Reading data from /home/cate/Projects/Twitter_sentiment_analysis_app/processed-data
2020-01-16 20:34:27,762 Train: /home/cate/Projects/Twitter_sentiment_analysis_app/processed-data/train.txt
2020-01-16 20:34:27,764 Dev: /home/cate/Projects/Twitter_sentiment_analysis_app/processed-data/dev.txt
2020-01-16 20:34:27,765 Test: /home/cate/Projects/Twitter_sentiment_analysis_app/processed-data/test.txt


AssertionError: 

In [10]:
#from flair.data_fetcher import NLPTaskDataFetcher
#from pathlib import Path

#corpus = NLPTaskDataFetcher.load_classification_corpus(Path(data_dir), test_file='test.csv', dev_file='dev.csv', train_file='train.csv')

In [9]:
label_dict = corpus.make_label_dictionary()

2020-01-16 01:06:10,234 Computing label dictionary. Progress:


100%|██████████| 160000/160000 [00:00<00:00, 352960.82it/s]

2020-01-16 01:06:10,694 [b'0', b'4']





In [10]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings

word_embeddings = [WordEmbeddings('glove'),
#                    FlairEmbeddings('news-forward'),
#                    FlairEmbeddings('news-backward')
                  ]

In [11]:
from flair.embeddings import DocumentRNNEmbeddings

document_embeddings = DocumentRNNEmbeddings(word_embeddings, hidden_size=512, reproject_words=True, reproject_words_dimension=256)

In [12]:
from flair.models import TextClassifier

classifier = TextClassifier(document_embeddings, label_dictionary=label_dict)


In [13]:
from flair.trainers import ModelTrainer

trainer = ModelTrainer(classifier, corpus)

In [None]:
trainer.train('model-saves',
              learning_rate=0.1,
              mini_batch_size=32,
              anneal_factor=0.5,
              patience=8,
              max_epochs=200)

2020-01-16 01:06:31,398 ----------------------------------------------------------------------------------------------------
2020-01-16 01:06:31,400 Model: "TextClassifier(
  (document_embeddings): DocumentRNNEmbeddings(
    (embeddings): StackedEmbeddings(
      (list_embedding_0): WordEmbeddings('glove')
    )
    (word_reprojection_map): Linear(in_features=100, out_features=256, bias=True)
    (rnn): GRU(256, 512, batch_first=True)
    (dropout): Dropout(p=0.5, inplace=False)
  )
  (decoder): Linear(in_features=512, out_features=2, bias=True)
  (loss_function): CrossEntropyLoss()
)"
2020-01-16 01:06:31,402 ----------------------------------------------------------------------------------------------------
2020-01-16 01:06:31,403 Corpus: "Corpus: 160000 train + 20000 dev + 20000 test sentences"
2020-01-16 01:06:31,404 ----------------------------------------------------------------------------------------------------
2020-01-16 01:06:31,405 Parameters:
2020-01-16 01:06:31,407  - lear

In [None]:
from flair.data import Sentence

classifier = TextClassifier.load('model-saves/final-model.pt')

pos_sentence = Sentence(preprocess('I love Python!'))
neg_sentence = Sentence(preprocess('Python is the worst!'))

classifier.predict(pos_sentence)
classifier.predict(neg_sentence)

print(pos_sentence.labels, neg_sentence.labels)

2020-01-16 00:38:16,540 loading file model-saves/final-model.pt


In [None]:
ls