## FastText

#### Import Libraries

In [16]:
import pandas as pd
import json
import numpy as np
import os
import string
import time
import nltk
import fasttext
import os

from sklearn.model_selection import train_test_split

import matplotlib as mpl
import matplotlib.pyplot as plt
import matplotlib.pylab as pylab
import seaborn as sns
from pandas.plotting import scatter_matrix

# visualization
%matplotlib inline
mpl.style.use('ggplot')
sns.set_style('white')
pylab.rcParams['figure.figsize'] = 12,8

#### Parameter Settings

In [17]:
YELP_REVIEW_W2V_PATH = r'../data/w2v_yelp.csv'
FASTTEXT_TRAIN_PATH = r'../data/train_for_fasttext.txt'
FASTTEXT_TEST_PATH = r'../data/test_for_fasttext.txt'

DIM = 200   # dimensions
LR = 1.65      # learning rate, divide 10, 0.2
EPOCH = 5
MODEL_PATH = f'../models/fasttext_model/data_dim{str(DIM)}_lr0{str(LR)}_iter{str(EPOCH)}.model'

#### load data

In [18]:
df_reviews = pd.read_csv(YELP_REVIEW_W2V_PATH)
df_reviews.sample(10)

Unnamed: 0,stars,Review_Labels,cleaned,w2v
49716,5.0,2,could give star would hand easiest dri cleaner...,"[-0.0022870481479912996, 0.0048250057734549046..."
70637,4.0,2,food wa good great onli two option lunch speci...,"[0.010927814990282059, -0.0029274174012243748,..."
21392,5.0,2,wa go vega area celebr th birthday stumbl upon...,"[0.015033287927508354, 0.007917341776192188, -..."
27240,5.0,2,girlfriend book surpris balloon ride float bir...,"[-0.015204167924821377, 0.004098470322787762, ..."
46031,5.0,2,drink dinner bacon blue chees burger fianc tai...,"[0.010648817755281925, 0.020422935485839844, -..."
43562,5.0,2,delici excel servic expens definit get pay gro...,"[0.00109921267721802, -0.008022632449865341, 0..."
49311,4.0,2,firsr time wa recommend drove way tri order gr...,"[0.030711118131875992, 0.019656935706734657, 0..."
82848,1.0,0,restaur week husband reserv got sat took hour ...,"[-0.004043980501592159, 0.0028418938163667917,..."
29832,1.0,0,worst servic guu ever tri sever vancouv previo...,"[0.021075522527098656, -0.0026419160421937704,..."
16476,2.0,0,larg portion averag food bad manag atmospher k...,"[0.014152927324175835, 0.0031066318042576313, ..."


In [19]:
df_reviews.describe(include='all')

Unnamed: 0,stars,Review_Labels,cleaned,w2v
count,99999.0,99999.0,99999,99999
unique,,,99949,99949
top,,,absolut ador thi place came quick bite one aft...,"[0.02013453282415867, 0.008425872772932053, 0...."
freq,,,2,2
mean,3.752928,1.449184,,
std,1.434764,0.828607,,
min,1.0,0.0,,
25%,3.0,1.0,,
50%,4.0,2.0,,
75%,5.0,2.0,,


###  fasttext

In [20]:
# data proparation once

def prepare_data():
    df_reviews = pd.read_csv(YELP_REVIEW_W2V_PATH)
    df_reviews = df_reviews[["Review_Labels", "cleaned"]]

    train, test = train_test_split(
        df_reviews, test_size=0.2, stratify=df_reviews["Review_Labels"], random_state=42
    )

    data_train = train.values.tolist()
    data_test = test.values.tolist()

    with open(FASTTEXT_TRAIN_PATH, 'w', encoding='utf-8') as train_f, open(FASTTEXT_TEST_PATH, 'w', encoding='utf-8') as test_f:
        for row in data_train:
            train_f.write(f'__label__{str(row[0]).strip()} , {str(row[1]).strip()}\n')
        for row in data_test:
            test_f.write(f'__label__{str(row[0]).strip()} , {str(row[1]).strip()}\n')

prepare_data()


In [21]:
# 训练模型
def train_model(ipt=None, opt=None, model='', dim=100, epoch=5, lr=0.1, loss='softmax'):
    np.set_printoptions(suppress=True)
    if os.path.isfile(model):
        classifier = fasttext.load_model(model)
    else:
        classifier = fasttext.train_supervised(ipt, label='__label__', dim=dim, epoch=epoch,
                                               lr=lr, wordNgrams=2, loss=loss,minCount=1)
        classifier.save_model(opt)
    return classifier

In [24]:
time_ft_start = time.time()
classifier = train_model(ipt=FASTTEXT_TRAIN_PATH,
                         opt=MODEL_PATH,
                         model=MODEL_PATH,
                         dim=DIM,
                         epoch=EPOCH,
                         lr=LR*0.1
                         )
time_ft_start = time.time()
result_train = classifier.test(FASTTEXT_TRAIN_PATH)

In [29]:
print('train result', result_train[0])
train_acc, train_recall = result_train[1], result_train[2]
print('  accuracy %.4f'%(train_acc),'  recall %.4f'%(train_recall), 
      '  f1-score %.4f'%(2*train_acc*train_recall/(train_acc+train_recall)))

result_test = classifier.test(FASTTEXT_TEST_PATH)
print('test result', result_test[0])
test_acc, test_recall = result_test[1], result_test[2]
print('  accuracy %.4f'%(test_acc),'  recall %.4f'%(test_recall), 
      '  f1-score %.4f'%(2*test_acc*test_recall/(test_acc+test_recall)))

time_ft = round((time.time() - time_ft_start) / 60, 2)
print('parameters')
print(str({'dim':DIM, 'lr':LR*0.1, 'epoch':EPOCH}))
print('running time', time_ft)

train result 79999
  accuracy 0.9612   recall 0.9612   f1-score 0.9612
test result 20000
  accuracy 0.8590   recall 0.8590   f1-score 0.8590
parameters
{'dim': 200, 'lr': 0.165, 'epoch': 5}
running time 31.73
