In [1]:
!pip install torch_nightly -f https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
!pip install fastai

Looking in links: https://download.pytorch.org/whl/nightly/cu92/torch_nightly.html
Collecting torch_nightly
[?25l  Downloading https://download.pytorch.org/whl/nightly/cu92/torch_nightly-1.2.0.dev20190805%2Bcu92-cp36-cp36m-linux_x86_64.whl (704.8MB)
[K     |████████████████████████████████| 704.8MB 25kB/s 
[?25hInstalling collected packages: torch-nightly
Successfully installed torch-nightly-1.2.0.dev20190805+cu92


In [2]:
import fastai
from fastai import *
from fastai.text import * 
import pandas as pd
import numpy as np
from functools import partial
import io
import os

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Fetch the 20 Newsgroups corpus, shuffled with a fixed seed, with the
# 'headers', 'footers' and 'quotes' parts of each post removed.
dataset = fetch_20newsgroups(
    remove=('headers', 'footers', 'quotes'),
    shuffle=True,
    random_state=1,
)
documents = dataset.data

Downloading 20news dataset. This may take a few minutes.
Downloading dataset from https://ndownloader.figshare.com/files/5975967 (14 MB)


In [4]:
# One row per post: the numeric target goes in `label`, the post body in `text`.
df = pd.DataFrame(dict(label=dataset.target, text=dataset.data))

In [5]:
# Show (rows, columns) of the full DataFrame before any filtering.
df.shape

(11314, 2)

In [6]:
# Reduce the problem to two classes: keep rows labelled 1 or 10,
# then renumber the rows from 0 and discard the old index.
df = df.loc[df['label'].isin([1, 10])].reset_index(drop=True)

In [28]:
# Inspect the label column of the filtered DataFrame.
df['label']

0       10
1        1
2       10
3       10
4       10
        ..
1179    10
1180    10
1181     1
1182     1
1183    10
Name: label, Length: 1184, dtype: int64

In [7]:
# Count rows per label to check class balance.
# NOTE(review): labels 1 and 10 are presumably comp.graphics and rec.sport.hockey —
# confirm against dataset.target_names.
df['label'].value_counts()

10    600
1     584
Name: label, dtype: int64

Let’s clean our text by keeping only the letters a–z and A–Z and replacing every other character with a space.

In [8]:
# Replace every character that is not an ASCII letter with a single space.
# regex=True is passed explicitly: from pandas 2.0 on, str.replace treats the pattern as a
# literal string by default, so without it "[^a-zA-Z]" would match nothing and the text
# would pass through uncleaned.
df['text'] = df['text'].str.replace("[^a-zA-Z]", " ", regex=True)

Next, we remove the stopwords from our text data. If you have not used nltk’s stopword list before, you will need to download it first, as shown below:

In [9]:
import nltk
from nltk.corpus import stopwords

# Download the stop-word corpus, then load the English list into `stop_words`.
nltk.download('stopwords')
stop_words = stopwords.words('english')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [12]:
# Look at the text of the row labelled 0 before stop-word removal.
df.loc[0, 'text']

'Well  I will have to change the scoring on my playoff pool   Unfortunately I don t have time right now  but I will certainly post the new scoring rules by tomorrow   Does it matter   No  you ll enter anyway     Good          Keith Keller    LET S GO RANGERS            LET S GO QUAKERS       kkeller mail sas upenn edu  IVY LEAGUE CHAMPS    '

In [13]:
# Split every document into whitespace-separated tokens.
tokenized_doc = df['text'].str.split()

# Use a set for the membership test: the list would be scanned once per token.
stop_set = set(stop_words)

# remove stop-words
# The comparison is case-sensitive, so capitalised tokens such as "I" are not removed.
tokenized_doc = tokenized_doc.apply(
    lambda tokens: [token for token in tokens if token not in stop_set]
)

# de-tokenization
# Iterate over the Series values in order rather than looking up tokenized_doc[i] by label,
# so this no longer depends on the index being exactly 0..len(df)-1.
detokenized_doc = [' '.join(tokens) for tokens in tokenized_doc]

df['text'] = detokenized_doc

In [14]:
# Look at the same row again, now with stop-words removed.
df.loc[0, 'text']

'Well I change scoring playoff pool Unfortunately I time right I certainly post new scoring rules tomorrow Does matter No enter anyway Good Keith Keller LET S GO RANGERS LET S GO QUAKERS kkeller mail sas upenn edu IVY LEAGUE CHAMPS'

Now let’s split our cleaned dataset into training and validation sets in a 60:40 ratio.

In [15]:
from sklearn.model_selection import train_test_split

# Hold out 40% of the rows for validation, stratified on the label,
# with a fixed seed so the split is repeatable.
df_trn, df_val = train_test_split(
    df,
    test_size=0.4,
    stratify=df['label'],
    random_state=12,
)

Before proceeding further, we’ll need to prepare our data for the language model and for the classification model separately. The good news? This can be done quite easily using the fastai library:

In [16]:
# Language model data
data_lm = TextLMDataBunch.from_df(train_df = df_trn, valid_df = df_val, path = "")

# Classifier model data
data_clas = TextClasDataBunch.from_df(path = "", train_df = df_trn, valid_df = df_val, vocab=data_lm.train_ds.vocab, bs=32)

In [26]:
# Language-model learner over data_lm using the AWD_LSTM architecture,
# with the dropout multiplier set to 0.7.
learn = language_model_learner(data_lm, AWD_LSTM, drop_mult=0.7)

In [27]:
# Fine-tune the language model for one epoch.
# NOTE(review): the second argument (1e-2) is presumably the maximum learning rate — confirm.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,6.109955,5.183828,0.24874,04:23


Let’s now use the data_clas object we created earlier to build a classifier with our fine-tuned encoder.

In [30]:
# Save the language model's encoder under the name 'ft_enc'; the classifier loads it by that name.
learn.save_encoder('ft_enc')

In [34]:
# Rebind `learn` to a classifier built on data_clas (same architecture and dropout
# multiplier as the language model), then load the encoder saved as 'ft_enc'.
learn = text_classifier_learner(data_clas, AWD_LSTM, drop_mult=0.7)
learn.load_encoder('ft_enc')


RNNLearner(data=TextClasDataBunch;

Train: LabelList (710 items)
x: TextList
xxbos xxmaj it looks like xxmaj edmonton xxmaj oilers decided take xxmaj european xxunk spring xxmaj ranford xxmaj tugnutt xxmaj benning xxmaj manson xxmaj smith xxmaj buchberger xxmaj corson playing xxmaj canada xxmaj podein xxmaj weight playing xxup us xxmaj is xxmaj kravchuk playing xxmaj xxunk i know nagging injuries late season xxmaj podein interesting case eligible play xxmaj cape xxmaj breton xxup ahl playoffs like xxmaj kovalev xxmaj zubov xxmaj andersson obviously xxmaj sather xxmaj pocklington total xxunk everyone makes certainly case massively xxunk xxmaj paramount xxmaj new xxmaj york xxmaj rangers,xxbos xxmaj this xxunk xxmaj speaking die hard i i read xxunk hard xxunk xxmaj toronto xxmaj cup finals xxmaj first anyone planet heard team xxmaj detroit xxmaj al xxmaj xxunk however spell idiot name must xxmaj chicago xxup espn said even close xxmaj chicago xxunk win xxmaj norris xxmaj division xxmaj p

In [35]:
# Train the classifier for one epoch.
# NOTE(review): the second argument (1e-2) is presumably the maximum learning rate — confirm.
learn.fit_one_cycle(1, 1e-2)

epoch,train_loss,valid_loss,accuracy,time
0,0.389722,0.227034,0.919831,11:36


In [38]:
# Collect the classifier's predictions and the matching targets for the validation set.
# The original call passed a pasted prose sentence as the first argument; get_preds expects a
# DatasetType there, not text to classify.
# To classify a single piece of text, use learn.predict("some text") instead.
preds, targets = learn.get_preds(DatasetType.Valid)