## Import Libraries

In [None]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from hazm import word_tokenize, stopwords_list
from dadmatools.models.normalizer import Normalizer

## Data Reading

In [None]:
# Load the train/test splits; both files are tab-separated and UTF-8 encoded.
train_df = pd.read_csv(
    "data/snappfood_2class/train.csv",
    sep="\t",
    encoding="utf-8",
)
test_df = pd.read_csv(
    "data/snappfood_2class/test.csv",
    sep="\t",
    encoding="utf-8",
)
# Bare expression so the notebook renders the training frame.
train_df

In [None]:
# Separate each split into the raw comment text (model input) and the
# label ids (prediction target).
train_X, train_Y = train_df['comment'], train_df['label_id']
test_X, test_Y = test_df['comment'], test_df['label_id']

## Preprocessing and TF-IDF Vectorization

In [None]:
# Text normalizer from dadmatools; its `normalize` method is used as the
# TF-IDF preprocessor in the next cell. Going by the keyword arguments it is
# configured to strip punctuation and replace numbers with an empty string
# (the exact rules are defined by dadmatools, not visible here).
normalize = Normalizer(remove_puncs=True, replace_number_with="")

In [None]:
# TF-IDF vectorizer for the comments: every document is first passed through
# the dadmatools normalizer, then split into tokens with hazm's word_tokenize,
# and tokens found in hazm's stop-word list are discarded.
#
# token_pattern=None: the default regex token pattern is never used once a
# custom tokenizer is supplied; leaving it at its default only makes
# scikit-learn emit an "unused token_pattern" UserWarning.
#
# NOTE(review): the stop words are passed as-is, i.e. they are not run through
# the same normalizer as the documents. scikit-learn may warn if they are
# inconsistent with the preprocessing/tokenization -- confirm this is intended.
tfidf = TfidfVectorizer(
    preprocessor=normalize.normalize,
    tokenizer=word_tokenize,
    stop_words=stopwords_list(),
    token_pattern=None,
)
# Learn the vocabulary and IDF weights from the training comments only and
# produce the training document-term matrix.
tf_idf_matrix = tfidf.fit_transform(train_X)

In [None]:
# Show the dimensions of the training TF-IDF matrix as a (rows, columns) tuple.
tf_idf_matrix.shape

## Data Classification

In [None]:
from sklearn.naive_bayes import MultinomialNB

# Multinomial Naive Bayes with default hyper-parameters, trained on the
# TF-IDF features of the training comments.
nb = MultinomialNB()
nb.fit(tf_idf_matrix, train_Y)
# fit() returns the estimator itself, so `clf` and `nb` name the same
# fitted model.
clf = nb

In [None]:
# Vectorize the test comments with the vectorizer already fitted on the
# training data (transform only, no re-fitting), then predict a label for
# each test comment with the trained classifier.
tf_idf_test = tfidf.transform(test_X)
preds = clf.predict(tf_idf_test)

In [None]:
from sklearn.metrics import classification_report

# Per-class precision/recall/F1 on the test split, returned as a dict
# (output_dict=True) so the notebook displays it as a nested structure.
# NOTE(review): no `labels` argument is given, so target_names are matched to
# the label values in sorted order -- i.e. 'Sad' is attached to the smallest
# label_id and 'Happy' to the next. Verify this against the dataset's actual
# label encoding; if it is reversed, the report's class names are swapped.
classification_report(test_Y, preds, target_names=['Sad', 'Happy'], output_dict=True)

In [None]:
# Take the first test comment (by position) as a single example and display it.
text = test_X.iloc[0]
text

In [None]:
# Vectorize the single example (wrapped in a list because transform expects
# an iterable of documents) and display the classifier's predicted label.
vec = tfidf.transform([text])
clf.predict(vec)