In [1]:
import os
import numpy as np
import pandas as pd

In [2]:
!pip3 install flair

Collecting flair
[?25l  Downloading https://files.pythonhosted.org/packages/4e/3a/2e777f65a71c1eaa259df44c44e39d7071ba8c7780a1564316a38bf86449/flair-0.4.2-py3-none-any.whl (136kB)
[K     |████████████████████████████████| 143kB 6.3MB/s eta 0:00:01
[?25hCollecting sqlitedict>=1.6.0 (from flair)
  Downloading https://files.pythonhosted.org/packages/0f/1c/c757b93147a219cf1e25cef7e1ad9b595b7f802159493c45ce116521caff/sqlitedict-1.6.0.tar.gz
Collecting deprecated>=1.2.4 (from flair)
  Downloading https://files.pythonhosted.org/packages/88/0e/9d5a1a8cd7130c49334cce7b8167ceda63d6a329c8ea65b626116bc9e9e6/Deprecated-1.2.6-py2.py3-none-any.whl
Collecting bpemb>=0.2.9 (from flair)
  Downloading https://files.pythonhosted.org/packages/bc/70/468a9652095b370f797ed37ff77e742b11565c6fd79eaeca5f2e50b164a7/bpemb-0.3.0-py3-none-any.whl
Collecting segtok>=1.5.7 (from flair)
  Downloading https://files.pythonhosted.org/packages/1d/59/6ed78856ab99d2da04084b59e7da797972baa0efecb71546b16d48e49d9b/segtok-1.5

> ## Create a Corpus

### 1) Load from simple CSV file

In [3]:
from flair.datasets import CSVClassificationCorpus

ImportError: cannot import name 'CSVClassificationCorpus'

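`CSVClassificationCorpus` is not available in the installed flair release (0.4.2), so this import fails. The FastText format below is used instead.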

### 2) FastText Format

In [4]:
from flair.data import Corpus
from flair.datasets import ClassificationCorpus

In [5]:
# Input: raw CSV that is read with pandas further down.
FILE_PATH = "../input/bbctext/bbc-text.csv"

# Outputs: folder receiving the train/dev/test files, and folder handed to the trainer.
MODEL_FOLDER_PATH = "model"
DATASET_FOLDER_PATH = "splitted_data"

# File format

Each line starts with one or more `__label__<label>` prefixes, followed by the text:

```
__label__<label_1> <text>
__label__<label_1> __label__<label_2> <text>
```

In [6]:
# Shuffle the rows with a fixed seed so the train/dev/test split derived from the
# row order is reproducible after a kernel restart, then prefix every category
# with the "__label__" marker required by the FastText format.
data_df = (
    pd.read_csv(FILE_PATH)
    .sample(frac=1, random_state=42)
    .assign(category=lambda df: '__label__' + df["category"].astype(str))
)

In [7]:
# Write the (already shuffled) frame as three tab-separated files without index
# or header: first 80% -> train.csv, next 10% -> dev.csv, remainder -> test.csv.
os.makedirs(DATASET_FOLDER_PATH, exist_ok=True)

# Compute the two cut points once so the three slices cannot drift apart.
train_end = int(len(data_df) * 0.8)
dev_end = int(len(data_df) * 0.9)

splits = {
    'train.csv': data_df.iloc[:train_end],
    'dev.csv': data_df.iloc[train_end:dev_end],
    'test.csv': data_df.iloc[dev_end:],
}

# Every row must land in exactly one split.
assert sum(len(split_df) for split_df in splits.values()) == len(data_df)

for file_name, split_df in splits.items():
    split_df.to_csv(os.path.join(DATASET_FOLDER_PATH, file_name), sep='\t', index=False, header=False)

In [8]:
# Build the flair corpus from the dataset folder. Only the folder is passed, no
# file names; the log printed by this cell shows which train/dev/test files
# flair picked up from it.
corpus: Corpus = ClassificationCorpus(DATASET_FOLDER_PATH)

2019-08-07 18:08:20,211 Reading data from splitted_data
2019-08-07 18:08:20,212 Train: splitted_data/train.csv
2019-08-07 18:08:20,214 Dev: splitted_data/dev.csv
2019-08-07 18:08:20,215 Test: splitted_data/test.csv


Each line in a corpus is converted to a Sentence object annotated with the labels.

## Check distribution

In [9]:
# Label distribution of the training split. The file was written without a
# header row, so the column names are supplied here.
train_df = pd.read_csv(os.path.join(DATASET_FOLDER_PATH, "train.csv"), names=["label", "text"], sep="\t")
train_df["label"].value_counts()

__label__sport            421
__label__business         412
__label__politics         333
__label__tech             315
__label__entertainment    299
Name: label, dtype: int64

In [10]:
# Label distribution of the dev (validation) split. The file was written without
# a header row, so the column names are supplied here.
val_df = pd.read_csv(os.path.join(DATASET_FOLDER_PATH, "dev.csv"), names=["label", "text"], sep="\t")
val_df["label"].value_counts()

__label__business         54
__label__tech             49
__label__sport            45
__label__politics         40
__label__entertainment    34
Name: label, dtype: int64

In [None]:
# Label distribution of the test split. The file was written without a header
# row, so the column names are supplied here.
test_df = pd.read_csv(os.path.join(DATASET_FOLDER_PATH, "test.csv"), names=["label", "text"], sep="\t")
test_df["label"].value_counts()

## Training a Model

In [11]:
from flair.embeddings import WordEmbeddings, FlairEmbeddings, DocumentRNNEmbeddings
from flair.models import TextClassifier
from flair.trainers import ModelTrainer

In [12]:
# Create the model output folder; exist_ok makes the cell safe to re-run and
# avoids the separate check-then-create step.
os.makedirs(MODEL_FOLDER_PATH, exist_ok=True)

In [13]:
# Tunable settings for the model cell: names of the pretrained embeddings to
# load and the two size parameters passed to the document RNN embeddings.
params_train = dict(
    word_emb='glove',
    flair_emb_forward='news-forward-fast',
    flair_emb_backward='news-backward-fast',
    hidden_size=256,
    reproject_words_dimension=128,
)

In [14]:
# Token-level embeddings: one word embedding plus a forward and a backward
# Flair embedding, all selected by name through params_train.
base_word_embedding = WordEmbeddings(params_train["word_emb"])
forward_flair_embedding = FlairEmbeddings(params_train["flair_emb_forward"])
backward_flair_embedding = FlairEmbeddings(params_train["flair_emb_backward"])
word_embeddings = [base_word_embedding, forward_flair_embedding, backward_flair_embedding]

# Document-level embedding built on top of the token embeddings.
document_embeddings = DocumentRNNEmbeddings(
    word_embeddings,
    hidden_size=params_train["hidden_size"],
    reproject_words=True,
    reproject_words_dimension=params_train["reproject_words_dimension"],
)

# Single-label classifier over the labels found in the corpus.
label_dictionary = corpus.make_label_dictionary()
classifier = TextClassifier(document_embeddings, label_dictionary=label_dictionary, multi_label=False)

trainer = ModelTrainer(classifier, corpus)

2019-08-07 18:08:54,179 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim.vectors.npy not found in cache, downloading to /tmp/tmp7ftnrlid


100%|██████████| 160000128/160000128 [00:03<00:00, 49676875.02B/s]

2019-08-07 18:08:57,560 copying /tmp/tmp7ftnrlid to cache at /tmp/.flair/embeddings/glove.gensim.vectors.npy





2019-08-07 18:08:57,791 removing temp file /tmp/tmp7ftnrlid
2019-08-07 18:08:57,888 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/glove.gensim not found in cache, downloading to /tmp/tmp7psm6pnn


100%|██████████| 21494764/21494764 [00:00<00:00, 42176685.11B/s]

2019-08-07 18:08:58,540 copying /tmp/tmp7psm6pnn to cache at /tmp/.flair/embeddings/glove.gensim
2019-08-07 18:08:58,574 removing temp file /tmp/tmp7psm6pnn





2019-08-07 18:09:00,690 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-forward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmp5_okx8mr


100%|██████████| 19689779/19689779 [00:00<00:00, 36841721.55B/s]

2019-08-07 18:09:01,389 copying /tmp/tmp5_okx8mr to cache at /tmp/.flair/embeddings/lm-news-english-forward-1024-v0.2rc.pt
2019-08-07 18:09:01,425 removing temp file /tmp/tmp5_okx8mr





2019-08-07 18:09:05,885 https://s3.eu-central-1.amazonaws.com/alan-nlp/resources/embeddings/lm-news-english-backward-1024-v0.2rc.pt not found in cache, downloading to /tmp/tmp_l9yhl8c


100%|██████████| 19689779/19689779 [00:00<00:00, 43163150.35B/s]

2019-08-07 18:09:06,526 copying /tmp/tmp_l9yhl8c to cache at /tmp/.flair/embeddings/lm-news-english-backward-1024-v0.2rc.pt
2019-08-07 18:09:06,555 removing temp file /tmp/tmp_l9yhl8c





2019-08-07 18:09:27,069 {'entertainment', 'business', 'sport', 'politics', 'tech'}


In [None]:
# Run training with max_epochs=10. MODEL_FOLDER_PATH is passed as the first
# argument — presumably the output folder, since the plotting cell below reads
# loss.tsv and weights.txt from it (confirm against the flair ModelTrainer docs).
trainer.train(MODEL_FOLDER_PATH, max_epochs=10)

2019-08-07 18:09:56,381 ----------------------------------------------------------------------------------------------------
2019-08-07 18:09:56,410 Evaluation method: MICRO_F1_SCORE
2019-08-07 18:09:56,651 ----------------------------------------------------------------------------------------------------
2019-08-07 18:10:06,280 epoch 1 - iter 0/56 - loss 1.57349336
2019-08-07 18:10:25,755 epoch 1 - iter 5/56 - loss 1.60266578
2019-08-07 18:10:44,770 epoch 1 - iter 10/56 - loss 1.59392031
2019-08-07 18:11:08,488 epoch 1 - iter 15/56 - loss 1.59036079
2019-08-07 18:11:28,195 epoch 1 - iter 20/56 - loss 1.57132816
2019-08-07 18:11:45,612 epoch 1 - iter 25/56 - loss 1.55954139
2019-08-07 18:12:04,663 epoch 1 - iter 30/56 - loss 1.54682369
2019-08-07 18:12:23,018 epoch 1 - iter 35/56 - loss 1.52439698
2019-08-07 18:12:51,410 epoch 1 - iter 40/56 - loss 1.51959479
2019-08-07 18:13:09,052 epoch 1 - iter 45/56 - loss 1.51307006
2019-08-07 18:13:26,521 epoch 1 - iter 50/56 - loss 1.50106491
2

## Plot training curves

In [None]:
from flair.visual.training_curves import Plotter

# Files inside the model folder that the plotter is pointed at.
loss_file = os.path.join(MODEL_FOLDER_PATH, 'loss.tsv')
weights_file = os.path.join(MODEL_FOLDER_PATH, 'weights.txt')

plotter = Plotter()
plotter.plot_training_curves(loss_file)
plotter.plot_weights(weights_file)