# Reference websites
1. Official fastText Python module tutorial (text classification), [here](https://fasttext.cc/docs/en/python-module.html#text-classification-model)
2. Example, [here](https://towardsdatascience.com/fasttext-for-text-classification-a4b38cbff27c) 

In [None]:
!pip install fasttext

In [None]:
import fasttext
import numpy as np, pandas as pd
from gensim.utils import simple_preprocess

# 1.0 Data preparation

* In order to train and evaluate this classifier, we’ll have to prepare our data in a format fastText expects. For example:

> * I really enjoyed this restaurant. Would love to visit again. `__label__positive`


* We will use **[gensim’s simple_preprocess method](https://www.tutorialspoint.com/gensim/gensim_quick_guide.htm)** to tokenize our reviews and remove symbols.

> * lower case
> * remove punctuation marks
























In [None]:
# Data preparation: load the reviews, cap the number of 5-star reviews,
# convert every review to a fastText line ("<tokens> __label__<rating>"),
# and write train/val/test splits as plain text files.
from sklearn.model_selection import train_test_split

# Only the first MAX_FIVE_STAR reviews rated 5 are kept; every other rating
# is kept in full (presumably to reduce class imbalance -- confirm intent).
MAX_FIVE_STAR = 155


def _write_lines(path, lines):
    """Write one example per line as plain UTF-8 text.

    fastText reads raw text lines. DataFrame.to_csv(index=False) would add a
    header row ("0") as the first line and apply CSV quoting, so the files
    are written directly instead.
    """
    with open(path, 'w', encoding='utf-8') as handle:
        handle.writelines(line + '\n' for line in lines)


# The input file holds one JSON object per line.
ds = df = pd.read_json('sample_data/All_Beauty_5.json', lines=True)

# Missing review bodies/summaries become empty strings; str(NaN) would
# otherwise inject the literal token "nan" into the training text.
review_bodies = df['reviewText'].fillna('').astype(str)
summaries = df['summary'].fillna('').astype(str)
texts = [' '.join(pair) for pair in zip(review_bodies, summaries)]
labels = df['overall'].tolist()

# Down-sample the 5-star class, preserving the original review order.
kept_texts, kept_labels = [], []
five_star_seen = 0
for text, label in zip(texts, labels):
    if label == 5:
        five_star_seen += 1
        if five_star_seen > MAX_FIVE_STAR:
            continue
    kept_texts.append(text)
    kept_labels.append(label)
texts, labels = kept_texts, kept_labels

# Convert the dataset into fastText-readable lines: tokenized text followed
# by the label carrying fastText's "__label__" prefix.
texts = [' '.join(simple_preprocess(text)) for text in texts]
labels = ['__label__' + str(label) for label in labels]
train_data = [text + ' ' + label for text, label in zip(texts, labels)]

# Split into train / val / test: 10% is held out for test, then 10% of the
# remainder for validation. Fixed seeds keep the splits reproducible.
rest_data, test_data = train_test_split(train_data, test_size=0.1, random_state=1)
train_data, val_data = train_test_split(rest_data, test_size=0.1, random_state=1)

for path, lines in (
    ('train.txt', train_data),
    ('val.txt', val_data),
    ('test.txt', test_data),
):
    _write_lines(path, lines)

# Model training

In [108]:
# Retrain with a growing epoch budget (50, 55, ..., 115) and stop at the
# first model whose second test metric on the validation file exceeds 0.95.
# If no run reaches that threshold, the last trained model is kept.
for i in range(50, 120, 5):
    model = fasttext.train_supervised(
        'train.txt',
        epoch=i,
        dim=2,
        wordNgrams=0,
    )
    # model.test returns (number of examples, precision@1, recall@1).
    _n_examples, val_precision, _val_recall = model.test('val.txt')
    if val_precision > 0.95:
        break

# Model testing

In [109]:
# Returns (number of examples, precision@1, recall@1) for the given file.
# NOTE(review): this evaluates on the training file; the held-out split
# written by the data-preparation cell is 'test.txt' -- confirm which one
# is intended here.
model.test('train.txt')

(627, 0.9968102073365231, 0.9968102073365231)

# Other APIs

In [None]:
# A short tour of other model APIs; each line is an independent call.
model['you'] # return the vector for the word 'you'
# Evaluate on the validation file: (number of examples, precision@1, recall@1).
model.test('val.txt')
# Top-2 predicted labels for one validation line.
# NOTE(review): val_data entries still end with their '__label__<rating>' token.
model.predict(val_data[3],k=2)
# predict also accepts a list of strings; here the top-3 labels for each.
model.predict(["Which baking dish is best to bake a banana bread ?", "Why not put knives in the dishwasher?"], k=3)
# Words and labels known to the trained model.
model.words
model.labels

# Persist the model to disk and load it back.
model.save_model("model_filename.bin")
model = fasttext.load_model("model_filename.bin") # a smaller model file can be produced with quantization; see the fastText documentation.