# Text classification using Fasttext
__Author: gregor Habeck__ with contributions from Julia Schäfer

We test fasttext to test classification of spoiler reviews

In [1]:
# install homebrew
# in terminal:
# ruby -e "$(curl -fsSL https://raw.githubusercontent.com/Homebrew/install/master/install)"
# install wget
# in terminal:
# brew install wget
# git clone https://github.com/facebookresearch/fastText.git
# cd fastText
# sudo python setup.py install

In [39]:
# import libraries
import fasttext
import time
import json
import gzip
import pandas as pd
from tqdm import tqdm
from collections import defaultdict

In [40]:
#generator to open json.gzip files
# yields single lines
def get_reviews(file):
    '''
    Generator will yield lines of the passed file
    '''
    with gzip.open(file, 'r') as f:
        for l in f:
            yield l
    f.close()
# fetch features
def fetch_features(file, features):
    '''
    Provide a list of features you want to extract in a single run.
    Returns a dictionary.
    file: json.gzip file you want to scan
    features: features to extract from data
    only reviews written in english will be collected
    '''
    
    feature_dict = defaultdict(list)    
    reviews = get_reviews(file)
    
    for review in tqdm(reviews):
        review_dict = json.loads(review)
        language = review_dict.get('review_language_start')

        if language != 'en': 
            continue
        
        else:
            for f in features:
                feature_dict[f].append(review_dict.get(f))
        
        
    return pd.DataFrame.from_dict(feature_dict)

In [41]:
# safe texts in a format suitable for fasttext
# labels precede text in the form of __label__spoiler/ __label__safe
def prepare_fasttext(dataframe, label):
    with open(f'{label}_fasttext','w+') as tf:
        for ind in tqdm(dataframe.index):
            if dataframe['has_spoiler'][ind] == 1:
                tf.write(f"__label__spoiler {' '.join(dataframe['lemmatized'][ind])}\n")
            elif dataframe['has_spoiler'][ind] == 0:
                tf.write(f"__label__safe {' '.join(dataframe['lemmatized'][ind])}\n")
            else:
                continue
    tf.close()    

In [43]:
training = 'train_set_text_edit.json.gz'
validation = 'validation_set_text_edit.json.gz'
test = 'test_set_text_edit.json.gz'


In [6]:
# import training data
isolate = ['has_spoiler', 'lemmatized']


df_train = fetch_features(training, isolate)

prepare_fasttext(df_train, 'train')

964623it [01:44, 9267.80it/s] 
100%|██████████| 893695/893695 [00:58<00:00, 15296.09it/s]


In [10]:
df_train.head()

Unnamed: 0,has_spoiler,lemmatized
0,False,"[read review blog, , definitely well book, ins..."
1,False,[write comment realize probably end long quali...
2,False,"[charlie turn young sister get marry, decide w..."
3,False,[like get implausible storyline read long time...
4,True,"[review originally post step fiction, want reb..."


In [6]:
# import validation data

df_val = fetch_features(validation, isolate)
prepare_fasttext(df_val, 'validation')

275606it [00:26, 10562.93it/s]
100%|██████████| 255227/255227 [00:15<00:00, 16792.58it/s]


In [7]:
isolate = ['has_spoiler', 'lemmatized']
df_test = fetch_features(test, isolate)
prepare_fasttext(df_test, 'test')


137804it [00:11, 11700.63it/s]
100%|██████████| 127592/127592 [00:07<00:00, 17209.06it/s]


## Fasttext Classification

In [1]:
import time
import fasttext

In [23]:
# simple model using default settings
 

t0 = time.time()
model = fasttext.train_supervised(input='train_fasttext', epoch = 50, lr = 0.1, wordNgrams = 2,
                                 loss = 'softmax')
model.save_model('basis_fasttext_model.bin')
elapsed_time = time.time()-t0
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

'00:15:03'

In [62]:
# evaluate the model on validation set
safe_class = model.test('validation_fasttext', k =1)
spoiler_class = model.test('validation_fasttext', k =2)

print('For test dataset')
print(f"For safe reviews:\n Precision: {round(safe_class[1],2)} \trecall: {round(safe_class[2],2)}")
print(f"For spoiler reviews:\n Precision: {round(spoiler_class[1],2)} \trecall: {round(spoiler_class[2],2)}")


For test dataset
For safe reviews:
 Precision: 0.93 	recall: 0.93
For spoiler reviews:
 Precision: 0.5 	recall: 1.0


In [61]:
# evaluate the model on test data set
safe_class = model.test('test_fasttext', k =1)
spoiler_class = model.test('test_fasttext', k =2)

print('For test dataset')
print(f"For safe reviews:\n Precision: {round(safe_class[1],2)} \trecall: {round(safe_class[2],2)}")
print(f"For spoiler reviews:\n Precision: {round(spoiler_class[1],2)} \trecall: {round(spoiler_class[2],2)}")


For test dataset
For safe reviews:
 Precision: 0.93 	recall: 0.93
For spoiler reviews:
 Precision: 0.5 	recall: 1.0


In [None]:
That looks very suspicious.

In [90]:
# function to predict probability of spoiler review classification
def predict_proba(model,text):
    a = model.predict(text, k = -1)
    return a[1][1]

# predict spoiler from probability
def prediction(proba):
    if proba < 0.5:
        return 0        
    else:
        return 1

In [10]:
# join sentences in test dataset
df_test['lemmatized'] = df_test['lemmatized'].apply(lambda x:' '.join(x))

In [13]:
# predict test set
df_test['proba'] = df_test['lemmatized'].apply(lambda x: predict_proba(model1,x))
df_test['prediction'] = df_test['proba'].apply(lambda x: prediction(model,x))

In [95]:
from sklearn.metrics import classification_report, confusion_matrix

In [16]:
confusion_matrix(df_test['has_spoiler'], df_test['prediction'])

array([[119203,      0],
       [  8389,      0]])

That doesn't match the results from fastText prediction. Try to optimize the model using autotune.

In [31]:
t0 = time.time()
model_autotune = fasttext.train_supervised(input = 'train_fasttext', autotuneValidationFile = 'validation_fasttext')
model_autotune.save_model('autotune_fasttext_model.bin')
elapsed_time = time.time()-t0
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

RuntimeError: Didn't have enough time to train once: please increase `autotune-duration`.

In [None]:
df_test.head()

In [22]:
df_test['proba_auto'] = df_test['lemmatized'].apply(lambda x: predict_proba(model_autotune,x))
df_test['prediction_auto'] = df_test['proba_auto'].apply(lambda x: prediction(x))

In [23]:
confusion_matrix(df_test['has_spoiler'], df_test['prediction_auto'])

array([[119200,      3],
       [  8389,      0]])

This did not help. We will next try to build our own word vectors based on the book descriptions and reviews.

### Build own word vectors based on descriptions and spell checked reviews

In [33]:
isolate = ['description', 'sentence_text_spellchecked']
df_train = fetch_features(training, isolate)
df_val = fetch_features(validation, isolate)
df_test = fetch_features(test, isolate)


964623it [01:25, 11277.81it/s]
275606it [00:42, 6538.74it/s] 
137804it [00:11, 12458.05it/s]


In [34]:
df_all = pd.concat([df_train, df_val, df_test])

In [35]:
df_all.shape

(1276514, 2)

In [27]:
#for text editing, replace contractions
#https://en.wikipedia.org/wiki/Wikipedia%3aList_of_English_contractions
def decontracted(text):
    # specific
    text = re.sub(r"won\'t", "will not", text)
    text = re.sub(r"can\'t", "can not", text)
    text = re.sub(r"let\'s", "let us", text)
    

    # general
    text = re.sub(r"n\'t", " not", text)
    text = re.sub(r"\'re", " are", text)
    text = re.sub(r"\'s", " is", text)
    text = re.sub(r"\'d", " would", text)
    text = re.sub(r"\'ll", " will", text)
    text = re.sub(r"\'t", " not", text)
    text = re.sub(r"\'ve", " have", text)
    text = re.sub(r"\'m", " am", text)
    text = re.sub(r"\'m", " am", text)
    
    #words
    text = re.sub("gimme", "give me", text)
    text = re.sub("cuz", "because", text)
    text = re.sub("'cause", "give me", text)
    text = re.sub("finna", "fixing to", text)
    text = re.sub("cuz", "because", text)
    text = re.sub("wanna", "want to", text)
    text = re.sub("gotta", "got to", text)
    text = re.sub("hafta", "have to", text)
    text = re.sub("woulda", "would have", text)
    text = re.sub("coulda", "could have", text)
    text = re.sub("shoulda", "should have", text)
    text = re.sub("ma'am", "madam", text)
    text = re.sub("howdy", "how do you", text)
    text = re.sub("let's", "let us", text)
    text = re.sub("y'all", "you all", text)

    return text

In [28]:
import re

# function to preprocess text
def preprocessing(text):
    t0 = time.time()
    # remove '--' and replace them with whitespace
    text = text.replace('-', ' ')
    #change to lower case
    text=text.lower()
    # replace contractions
    text = decontracted(text)
    #remove urls if there are any
    text = re.sub(r'http:\S+', '', text)
    text = re.sub(r'www\S+', '', text)
    #remove emails and words containing @
    text = re.sub("\S*@\S*\s?"," ", text)
    # remove digits and words containing digits
    text = re.sub(r"\d", "", text)
    #remove special characters and punctuation
    text = re.sub(r'[(,;:@#&$!?.)"*/-]+', ' ', text)
    text = re.sub(r"[']", '', text)
    # replace whitespaces
    text = re.sub(r"\s+", ' ', text).strip()
    return text

In [41]:
def process(text):
    try:
        text = decontracted(text)
        text = preprocessing(text)
        return text
    except:
        return ''

In [39]:
df_all.head()

Unnamed: 0,description,sentence_text_spellchecked
0,One choice can transform you--or it can destro...,"[read this review on my blog, , definitely bet..."
1,"A novel of the cruelty of war, and tenuousness...",[i was writing a comment that i realized would...
2,,[charlie is turning and her younger sister is ...
3,A ruthless businesswoman and the playboy who d...,[is more like it even though this has got to b...
4,"Five years ago, Wren Connolly was shot three t...",[review originally posted at step into fiction...


In [42]:
# preprocess the descriptions
import swifter
df_all['description'] = df_all['description'].swifter.apply(lambda x: process(x))

HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1276514.0, style=ProgressStyle(descrip…




In [43]:
df_all['sentence_text_spellchecked'] = df_all['sentence_text_spellchecked'].swifter.apply(lambda x: ' '.join(x))


HBox(children=(FloatProgress(value=0.0, description='Pandas Apply', max=1276514.0, style=ProgressStyle(descrip…




In [44]:
df_all.head()

Unnamed: 0,description,sentence_text_spellchecked
0,one choice can transform you or it can destroy...,read this review on my blog definitely better...
1,a novel of the cruelty of war and tenuousness ...,i was writing a comment that i realized would ...
2,,charlie is turning and her younger sister is g...
3,a ruthless businesswoman and the playboy who d...,is more like it even though this has got to be...
4,five years ago wren connolly was shot three ti...,review originally posted at step into fiction ...


In [47]:
# write texts to file
with open('fasttext_vector.txt', 'w+') as vec:
    for ind in tqdm(df_all.index):
        vec.write(f"{df_all['description'][ind]}\n")
        vec.write(f"{df_all['sentence_text_spellchecked'][ind]}\n")

100%|██████████| 1276514/1276514 [2:04:24<00:00, 171.02it/s] 


In [32]:
# train word vector model
model_vec = fasttext.train_unsupervised('fasttext_vector.txt' )

In [34]:
# save word vector model
model_vec.save_model('emb_model.vec')

In [4]:
# load model
model_vec = fasttext.load_model('emb_model.vec')

### Try sentence-wise classification

In [50]:
isolate = ['has_spoiler', 'sentence_labels', 'lemmatized']
df_train = fetch_features(training, isolate)


964623it [02:52, 5592.98it/s] 


In [51]:
df_train.head()

Unnamed: 0,has_spoiler,sentence_labels,lemmatized
0,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[read review blog, , definitely well book, ins..."
1,False,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...",[write comment realize probably end long quali...
2,False,"[0, 0, 0, 0]","[charlie turn young sister get marry, decide w..."
3,False,"[0, 0, 0, 0]",[like get implausible storyline read long time...
4,True,"[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...","[review originally post step fiction, want reb..."


In [52]:
# isolate sentences and labels from spoiler reviews
df_spoiler = df_train[df_train['has_spoiler']==True]
sentences = []
for review in tqdm(df_spoiler['lemmatized']):
    for sentence in review:
        sentences.append(sentence)

labels = []
for review in tqdm(df_spoiler['sentence_labels']):
    for label in review:
        labels.append(label)


df_sentences = pd.DataFrame({
    'has_spoiler': labels,
    'sentences': sentences
})

100%|██████████| 58428/58428 [00:06<00:00, 9339.81it/s] 
100%|██████████| 58428/58428 [00:01<00:00, 38503.76it/s]


In [53]:
df_sentences.head()

Unnamed: 0,has_spoiler,sentences
0,0,review originally post step fiction
1,0,want reboot
2,0,sure want low number high number
3,0,lean high number bit kickass
4,0,high number mean long dead reboot


In [1]:
df_sentences = df_sentences.drop_duplicates(['sentences'])

In [79]:
from sklearn.model_selection import train_test_split
random_state = 42

X=df_sentences['sentences']
y = df_sentences['has_spoiler']

X_train, X_val, y_train, y_val = train_test_split(X,y,random_state = random_state, stratify= y)
X_train, X_test, y_train, y_test = train_test_split(X_train, y_train, test_size=.15, random_state=random_state, stratify= y_train)


In [80]:
df_sentence_train = pd.DataFrame({'sentences':X_train, 'label':y_train})
df_sentence_val = pd.DataFrame({'sentences':X_val, 'label':y_val})
df_sentence_test = pd.DataFrame({'sentences':X_test, 'label':y_test})


In [81]:
print(df_sentence_train.shape)
print(df_sentence_val.shape)
print(df_sentence_test.shape)

(741636, 2)
(290838, 2)
(130877, 2)


In [82]:
def prep_fasttext_sen(dataframe, label):
    with open(f'{label}_sentence_fasttext','w+') as tf:
        for ind in tqdm(dataframe.index):
            if dataframe['label'][ind] == 1:
                tf.write(f"__label__spoiler {dataframe['sentences'][ind]}\n")
            elif dataframe['label'][ind] == 0:
                tf.write(f"__label__safe {dataframe['sentences'][ind]}\n")
            else:
                continue
    tf.close()    

In [83]:
prep_fasttext_sen(df_sentence_train, 'train')
prep_fasttext_sen(df_sentence_val, 'val')
prep_fasttext_sen(df_sentence_test, 'test')



100%|██████████| 741636/741636 [00:29<00:00, 24900.83it/s]
100%|██████████| 290838/290838 [00:11<00:00, 26393.82it/s]
100%|██████████| 130877/130877 [00:05<00:00, 24111.15it/s]


#### Train fasttext classification models


In [100]:
t0 = time.time()
s_model_autotune = fasttext.train_supervised(input = 'train_sentence_fasttext', epoch = 40, lr = 0.25,
                                             autotuneValidationFile = 'val_sentence_fasttext')
s_model_autotune.save_model('sentence_autotune_fasttext_model.bin')
elapsed_time = time.time()-t0
time.strftime("%H:%M:%S", time.gmtime(elapsed_time))

'00:06:10'

In [86]:
s_model_autotune.test('train_sentence_fasttext')

(741636, 0.7654981689130517, 0.7654981689130517)

In [94]:
df_sentence_test['probability'] = df_sentence_test['sentences'].apply(lambda x: predict_proba(s_model_autotune,x))
df_sentence_test['prediction'] = df_sentence_test['probability'].apply(lambda x: prediction(x))


In [99]:
confusion_matrix(df_sentence_test['label'] , df_sentence_test['prediction'])

array([[95305,     2],
       [35568,     2]])

Classification of spoiler sentences also did not work. Assuming we did not make mistakes, Fasttext classification is not well suited for our problem.   
Just to make sure that we have used fasttext correctly, we should try our code on a different dataset with clearly separated classes (sentiment, politics news vs. entertainment news)