# NLP & W2VEC

In [None]:
import pandas as pd
import seaborn as sns
import re
import numpy as np
import matplotlib.pyplot as plt
import json

from wordcloud import WordCloud
from pythainlp.tokenize import THAI2FIT_TOKENIZER # ใช้ในการตัดคำ
from pythainlp.corpus import common # ใช้ลบคำที่ไม่ใช้ออก

from sklearn import preprocessing

# Word2Vec
from pythainlp.word_vector import WordVector
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

# AI
from sklearn.model_selection import train_test_split # split data set
from sklearn.metrics import accuracy_score, classification_report # report train & test result

# AI Models
from sklearn.ensemble import RandomForestClassifier

# Hyperparameter Tuning
from skopt import BayesSearchCV
from skopt.space import Integer, Categorical, Real
from skopt.plots import plot_objective, plot_histogram

# Saving Intelligence
from joblib import dump, load


: 

In [None]:
# Read the exported chat log and keep only the message text and its
# category label, in that column order.
data = pd.read_csv("./data/data1668154807.csv")[['message', 'category']]

: 

In [None]:
data.head()

: 

### Data Features and Values

In [None]:
def clean_data(data):
    """Return the rows of *data* ordered by category with a fresh index.

    Args:
        data: DataFrame holding at least 'message' and 'category' columns.

    Returns:
        A DataFrame with exactly the columns 'message' and 'category',
        sorted by 'category' and indexed 0..n-1.
    """
    # Optional filter for common chat, kept disabled:
    # data = data.drop(data.index[ data['category'] == 'C' ])
    ordered = data.sort_values(by=['category']).reset_index()

    # reset_index() moved the previous index into a column; selecting the
    # two columns of interest discards it again.
    return ordered[['message', 'category']]

: 

In [None]:
data = clean_data(data)

: 

In [None]:
# Distinct category labels in order of first appearance in `data`
# (clean_data() has already sorted the rows by category).
category_list = data['category'].unique()
category_list

: 

In [None]:
data.shape

: 

In [None]:
data.dtypes

: 

In [None]:
data.isnull().any()

: 

### Countplot of target Variable(Category)

In [None]:
def plot_count_graph():
    """Draw a bar chart of the number of rows per category in ``data``.

    Reads the module-level ``data`` DataFrame, writes the count above each
    bar and shows the figure. Returns nothing.
    """
    # The column is passed as ``x=`` explicitly: recent seaborn releases
    # interpret the first positional argument as ``data``, not as ``x``.
    count_graph = sns.countplot(x=data['category'])

    for bar in count_graph.patches:
        height = bar.get_height()
        # Place the label slightly right of the bar's left edge, just above it.
        count_graph.annotate('{:.1f}'.format(height), (bar.get_x() + 0.25, height + 0.01))

    plt.show()

: 

In [None]:
plot_count_graph()

: 

### Balancing Category

In [None]:
# Down-sample every category to the same number of rows so that no label
# dominates the training data.
SAMPLES_PER_CATEGORY = 600

# A boolean mask is used instead of a string-built ``query`` expression so
# labels that contain quotes, or are not strings, are still matched.
df_list = [
    data[data['category'] == category].sample(
        n=SAMPLES_PER_CATEGORY,
        # Without replacement this raises ValueError when a category has
        # fewer rows than requested; set to True to over-sample instead.
        replace=False,
        random_state=18,
    )
    for category in category_list
]

data = clean_data(data=pd.concat(df_list))

: 

In [None]:
plot_count_graph()

: 

In [None]:
# Length of every message in characters (not in words/tokens).
data['message_length'] = data['message'].str.len()
print(data['message_length'])

: 

### Words Distribution Plot

In [None]:
# Distribution of message lengths. ``set_titles`` only titles row/column
# facets, so on this un-faceted grid the title is set on the axes directly.
sns.displot(data['message_length']).set(title='message_length')

: 

### Count Words

In [None]:
def count_words(category=''):
    """Count the distinct tokens found in the messages of ``data``.

    Args:
        category: Category label to restrict the count to. The default
            empty string means "use every row".

    Returns:
        Number of unique tokens produced by the Thai tokenizer.
    """
    frame = data if category == '' else data[data['category'] == category]

    vocabulary = set()
    for message in frame['message']:
        # Lower-case and turn line breaks into spaces / nothing.
        normalised = message.lower().replace('\n', ' ').replace('\r', '').strip()
        # Keep only runs of Thai characters, ASCII letters and apostrophes.
        kept = ' '.join(re.findall(r"[\u0E00-\u0E7Fa-zA-Z']+", normalised))
        vocabulary.update(THAI2FIT_TOKENIZER.word_tokenize(kept))

    return len(vocabulary)

: 

In [None]:
# Vocabulary size of each category, in the same order as category_list.
words_category = [count_words(category) for category in category_list]

words_category

: 

In [None]:
print('Total Word:')
print(count_words())

: 

In [None]:
# Bar chart comparing the vocabulary size of the categories.
words_count = dict(category=category_list, words_count=words_category)
w_count_data = pd.DataFrame(data=words_count)
sns.barplot(data=w_count_data, x='category', y='words_count')

: 

### Word Cloud

In [None]:
def create_wordcloud(words):
    """Tokenize *words* and display the resulting word cloud.

    Args:
        words: One string of text to tokenize and plot.
    """
    cloud_builder = WordCloud(
        font_path='THSarabun.ttf',  # location of the font file
        regexp=r"[\u0E00-\u0E7Fa-zA-Z']+",  # works around the tone-mark bug
    )
    tokens = THAI2FIT_TOKENIZER.word_tokenize(words)
    wordcloud = cloud_builder.generate(' '.join(tokens))

    plt.figure(figsize=[10, 7])
    plt.imshow(wordcloud, interpolation="bilinear")
    plt.axis('off')
    plt.show()

: 

In [None]:
def check_wordcloud(category):
    """Show a word cloud built from every message of one category.

    Args:
        category: Category label used to select rows of the module-level
            ``data`` DataFrame.
    """
    messages = data.loc[data['category'] == category, 'message']
    # Join with a space so the last word of one message cannot fuse with
    # the first word of the next one before tokenization.
    create_wordcloud(' '.join(messages))

: 

In [None]:
check_wordcloud('T')

: 

In [None]:
check_wordcloud('S')

: 

In [None]:
check_wordcloud('A')

: 

In [None]:
check_wordcloud('Q')

: 

### Label Encoding

In [None]:
# Map each category label to an integer id stored in 'category_target'.
# NOTE(review): LabelEncoder presumably numbers the labels in sorted order
# (A=0, C=1, Q=2, S=3, T=4), which later cells appear to rely on — confirm.
label_encoder = preprocessing.LabelEncoder()
data['category_target'] = label_encoder.fit_transform(data['category'])

: 

In [None]:
temp_data = data[['category', 'category_target']].drop_duplicates()
print(temp_data)


: 

In [None]:
# # keep category target in json
# json_classes = temp_data.to_json(orient="records")
# json_file = 'classes.json'

# # Writing to .json
# with open(json_file, "w") as outfile:
#     outfile.write(json_classes)


: 

In [None]:
# # Opening JSON file
# with open(json_file, 'r') as openfile:
#     # Reading from json file
#     temp_json = json.load(openfile)
#     print(temp_json)

: 

In [None]:
# temp_json.sort(key=lambda item : item['category_target'])
# print(temp_json)

: 

In [None]:
# classes_list = [ item['category'] for item in temp_json]
# print(classes_list)

: 

### Word2Vec

In [None]:
w2v_thai = WordVector()

: 

In [None]:

# Turn every message into a sentence vector. Iterating the column values
# directly avoids label-based ``data['message'][i]`` lookups, which only
# work while the index is exactly 0..n-1.
word2vec = [w2v_thai.sentence_vectorizer(message)
            for message in data['message']]

: 

In [None]:
# Convert the vectors to nested Python lists so that each DataFrame cell
# holds the vector of one message.
X = np.array(word2vec).tolist()

data['word2vec'] = X


: 

In [None]:
# Reshape to a 2-D matrix with 300 columns; 300 is presumably the width of
# the pythainlp sentence vectors — TODO confirm.
data_reshape = np.reshape(X, (-1, 300))

: 

In [None]:
print(data_reshape.shape)

: 

In [None]:
data.head()

: 

# AI MODEL

### Split the data in Training and Testing

In [None]:
# Stack the per-message vectors stored in the DataFrame into one 2-D
# feature matrix with 300 columns.
data_temp = list(data['word2vec'])
data_reshape = np.asarray(data_temp).reshape(-1, 300)
data_reshape.shape


: 

In [None]:
# Hold out 20% of the samples for testing; the shuffle is seeded so the
# split is reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    data_reshape,
    data['category_target'],
    test_size=0.2,
    shuffle=True,
    random_state=18,
)

: 

In [None]:
print(X_train.shape)
print(y_train.shape)

: 

In [None]:
print(X_test.shape)
print(y_test.shape)

: 

### Models Predictions

In [None]:
# RandomForest model. The commented values are kept for reference and can
# be re-enabled to override the library defaults.
rf_model = RandomForestClassifier(
    # n_estimators= 1300,
    # max_depth= 180,
    # min_samples_split= 2,
    # bootstrap= False,
    # criterion= 'entropy',
    # Seeded like the sampling and the train/test split so that repeated
    # runs of the notebook give the same forest.
    random_state=18,
    )
rf_model.fit(X_train, y_train)

# Score on the data the forest was fitted on. This is the training
# accuracy, not a validation score, so it is labelled accordingly.
accuracy_predictions = rf_model.predict(X_train)
print('Train accuracy', accuracy_score(y_train, accuracy_predictions))

# Score on the held-out test split.
model_predictions = rf_model.predict(X_test)
print('Accuracy', accuracy_score(y_test, model_predictions))

print(classification_report(y_test, model_predictions))

: 

### Load Saved Model

In [None]:
# rf_model = load('./models/rf_model.joblib') 

: 

In [None]:
# accuracy_predictions = rf_model.predict(X_train)
# print('Validation', accuracy_score(y_train, accuracy_predictions))

# model_predictions = rf_model.predict(X_test)
# print('Accuracy', accuracy_score(y_test, model_predictions))

# print(classification_report(y_test, model_predictions))

: 

### Save AI Model

In [None]:
# dump(rf_model, './models/rf_model.joblib')

: 

### BayesSearch Hyperparameter Tuning

<!-- Suggest Config RandomForest example
{'n_estimators': array([ 100,  307,  514,  721,  928, 1135, 1342, 1550, 1757, 1964, 2171,
        2378, 2585, 2792, 3000]),
 'max_depth': [1, 5, 10, 20, 50, 75, 100, 150, 200],
 'min_samples_split': [1, 2, 5, 10, 15, 20, 30],
 'min_samples_leaf': [1, 2, 3, 4],
 'bootstrap': [True, False],
 'criterion': ['gini', 'entropy']} -->

In [None]:
# model = RandomForestClassifier(
#     # bootstrap=False,
#     # criterion='entropy',
#     # min_samples_split=2,
#     )
# opt = BayesSearchCV(
#     model,
#     {
#         'n_estimators': Integer(800, 1500),
#         'max_depth': Integer(100, 200),
#         'min_samples_split': Integer(2, 30),
#         'min_samples_leaf': Integer(1, 10),
#         'bootstrap': Categorical([True, False]),
#         'criterion': Categorical(['gini', 'entropy']),
#     }
#     , cv=3
#     , verbose=1
#     , random_state=0
#     , n_iter= 100
# )
# opt.fit(X_train, y_train)

: 

In [None]:
# print(opt.best_score_)
# print(opt.best_params_)

: 

In [None]:
# plot_objective(opt.optimizer_results_[0],
#                    dimensions=[
#                     'bootstrap',
#                     'criterion',
#                     'max_depth',
#                     'min_samples_leaf',
#                     'min_samples_split',
#                     'n_estimators',
#                     # 'random_state',
#                     ],
#                    n_minimum_search=int(1e8))
# plt.show()

: 

# Use Case with Modules


In [None]:
from project_module.message_classifier import MessageClassifier

: 

In [None]:
message_classifier = MessageClassifier(auto_common= False)

: 

In [None]:
message_classifier.load_model(model_path='./models/rf_model.joblib', json_classes_path='./classes.json')

: 

In [None]:
print(message_classifier.classify(text_input='ว่ายังไง'))

: 

### Analyze Model

In [None]:
# One row per test sample: original row label ('index'), true label,
# message text and the classifier's outputs.
result_data = pd.DataFrame(y_test.reset_index())

# Fetch all messages with a single label-based lookup instead of building
# a full row object per sample.
result_data['message'] = data.loc[result_data['index'], 'message'].tolist()
result_data['predict'] = message_classifier.predict(X_test)
# One cell per sample, holding that sample's probability entry.
result_data['predict_proba'] = list(message_classifier.predict_proba(X_test))
result_data['predict_sd'] = message_classifier.predict_sd(X_test)
result_data.head()


: 

In [None]:
# Keep only the rows whose predicted label differs from the true label.
temp = result_data.loc[result_data['category_target'] != result_data['predict']]
temp.head()


: 

In [None]:
# Average 'predict_sd' over the misclassified rows only.
mean_error_sd =  temp['predict_sd'].mean()
print(f'avg sd of wrong predict: {mean_error_sd}')

: 

### Summary Final Result

['A', 'C', 'Q', 'S', 'T']

In [None]:
def count_category(data, n_categories=5):
    """Count how often each encoded category id occurs in *data*.

    Args:
        data: Iterable of integer category ids.
        n_categories: Number of distinct ids, i.e. the length of the
            returned list. Defaults to 5, the original fixed size.

    Returns:
        A list where position ``i`` holds the number of occurrences of id
        ``i``.

    Raises:
        IndexError: If an id is ``>= n_categories``. Negative ids follow
            Python's negative indexing and are counted from the end.
    """
    count = [0] * n_categories

    for y in data:
        count[y] += 1

    return count

: 

In [None]:
y_result =  message_classifier.predict(X_test)


: 

In [None]:
print('Accuracy', accuracy_score(y_test, y_result))

print(classification_report(y_test, y_result))

: 

### Common Chat Accuracy Test

In [None]:
# Rows labelled as common chat ('C').
c_data = data[data['category'] == 'C']
c_data.head()

: 

In [None]:
# Vectors of the common-chat rows; reset_index() turns the Series into a
# DataFrame that keeps the old row labels in an 'index' column.
X_temp = c_data['word2vec'].reset_index()
X_temp.head()

: 

### Auto common category Test

In [None]:
# Auto common: share of common-chat messages that are predicted as class 1.
# NOTE(review): 1 is presumably the encoded id of 'C' — confirm against
# classes.json. `message_classifier` was created with auto_common=False
# earlier in this notebook, so verify that this cell really exercises the
# "auto common" mode.
y_temp1 = [ message_classifier.predict(x)[0] for x in X_temp['word2vec'] ]
print(f'Common Accuracy: {y_temp1.count(1)/len(y_temp1)}')

: 

In [None]:
# Same measurement with a classifier explicitly created with
# auto_common=False and loaded from the saved model and class mapping.
custom_message_classifier = MessageClassifier(auto_common= False)
custom_message_classifier.load_model(model_path='./models/rf_model.joblib', json_classes_path='./classes.json')

# Share of common-chat messages predicted as class 1.
# NOTE(review): 1 is presumably the encoded id of 'C' — confirm.
y_temp2 = [ custom_message_classifier.predict(x)[0] for x in X_temp['word2vec'] ]
print(f'Common Accuracy: {y_temp2.count(1)/len(y_temp2)}')

: 

In [None]:
# Sweep error_sd from 0.030 to 0.079 in steps of 0.001 and report the test
# accuracy obtained with each value.
for i in range(30, 80):
    # Tuning step: build a classifier with this error_sd and load the
    # saved model into it.
    error_sd = 0.001 * i
    custom_message_classifier = MessageClassifier(error_sd= error_sd)
    custom_message_classifier.load_model(model_path='./models/rf_model.joblib', json_classes_path='./classes.json')

    y_temp2 = custom_message_classifier.predict(X_test)
    print(f'SD: {round(error_sd, 3)} | Accuracy: {accuracy_score(y_test, y_temp2)}')
    # print('Accuracy', accuracy_score(y_test, y_temp2))
    # print(classification_report(y_test, y_temp2))

: 