**Import Library for preprocessing**

In [4]:
import pandas as pd
import numpy as np

**Load the Dataset that i just prepared**

In [5]:
tcas = pd.read_csv('C:/Users/tarza\OneDrive/Desktop/Sentiment_Analysis/Dataset/tcas_prepared.csv')
shopping = pd.read_csv('C:/Users/tarza\OneDrive/Desktop/Sentiment_Analysis/Dataset/shopping_prepared.csv')
general = pd.read_csv('C:/Users/tarza\OneDrive/Desktop/Sentiment_Analysis/Dataset/general_prepared.csv')

# **Data Preprocessing**

In [6]:
sentiment_label = {'neg': 0, 'pos': 1}
tcas['sentiment'] = tcas['label'].map(sentiment_label)
shopping['sentiment'] = shopping['label'].map(sentiment_label)
general['sentiment'] = general['label'].map(sentiment_label)

Combine 3 dataframes for model experiment (after this process)

In [7]:
combined_df = pd.concat([tcas, shopping, general], ignore_index=True)
len(combined_df['sentence'])

342

Import PyThaiNLP for preprocessing Thai Dataset

In [8]:
from pythainlp.corpus.common import thai_stopwords
from pythainlp import word_tokenize
import string

Instantiate Stopwords(thai) and punctuation

In [9]:
stopwords_thai = list(thai_stopwords())
pun = string.punctuation+'ๆ'+'ฯ'

Define function to preprocess the sentence like remove punctuation, stop words, etc

In [10]:
def process_sentence(sentence):
    sentence_clean = "".join(u for u in sentence if u not in pun)
    sentence_clean = word_tokenize(sentence_clean)
    sentence_clean = " ".join(word for word in sentence_clean)
    sentence_clean = " ".join(word for word in sentence_clean.split() if word.lower not in stopwords_thai)
    return sentence_clean

In [11]:
tcas['text_tokens'] = tcas['sentence'].apply(process_sentence)
combined_df['text_tokens'] = combined_df['sentence'].apply(process_sentence)

Split each set to:

    * train set = 80
    * validation set = 20

In [12]:
# features and target of tcas dataset
Xtcas = tcas[['text_tokens']]
ytcas = tcas['sentiment']

# features and target of combined dataset
Xdf = combined_df[['text_tokens']]
ydf = combined_df['sentiment']

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
Xtcas_train, Xtcas_test, ytcas_train, ytcas_test = train_test_split(Xtcas, ytcas, test_size=0.2, random_state=42)
Xdf_train, Xdf_test, ydf_train, ydf_test = train_test_split(Xdf, ydf, test_size=0.2, random_state=42)

Import CountVectorizer for counting my vectors

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

counting tcas set

In [16]:
cvec_tcas = CountVectorizer(analyzer=lambda x:x.split(' '))
cvec_tcas.fit_transform(Xtcas_train['text_tokens'])

<99x784 sparse matrix of type '<class 'numpy.int64'>'
	with 2097 stored elements in Compressed Sparse Row format>

counting combined set

In [17]:
cvec_df = CountVectorizer(analyzer=lambda x:x.split(' '))
cvec_df.fit_transform(Xdf_train['text_tokens'])

<273x1258 sparse matrix of type '<class 'numpy.int64'>'
	with 3755 stored elements in Compressed Sparse Row format>

In [18]:
cvec_df.vocabulary_

{'ขอให้': 92,
 'ติด': 316,
 'เภสัช': 993,
 'มข': 534,
 'สาธุ': 758,
 '🙏🙏🙏': 1257,
 'ดังนั้น': 254,
 'ควร': 127,
 'อ่านหนังสือ': 881,
 'ได้': 1200,
 'แล้ว': 1133,
 'นะ': 391,
 'เหลือ': 1055,
 'เวลา': 1027,
 'ไม่': 1214,
 'นาน': 404,
 'สู้': 784,
 'ใคร': 1177,
 'ยัง': 582,
 'มี': 559,
 'ที่': 364,
 'เรียน': 1010,
 'เหมือน': 1049,
 'เรา': 1007,
 'บ้าง': 455,
 'นี่': 410,
 'เครียด': 912,
 'หนัก': 792,
 'เลย': 1019,
 'ย': 573,
 '😭😭😭': 1252,
 'คน': 112,
 'จัดส่ง': 202,
 'เร็ว': 1016,
 'มาก': 550,
 'สะดวก': 744,
 'ดี': 260,
 'ของจริง': 86,
 'ขนาดใหญ่': 81,
 'กว่า': 42,
 'คิด': 158,
 'ไว้': 1223,
 'แต่': 1099,
 'ใช้งาน': 1183,
 'ได้ดี': 1201,
 'เฟิร์น': 992,
 'ป่วย': 476,
 'เป็นโรค': 981,
 'ไข้เลือดออก': 1197,
 'อ่ะ': 879,
 'ตอนนี้': 285,
 'นอน': 389,
 'อยู่': 847,
 'โรงบาล': 1167,
 'ปิยะเวท': 474,
 'ไม่เป็นไร': 1217,
 'ขอบคุณ': 90,
 'นึกถึง': 413,
 'พิม': 514,
 'สาบาน': 759,
 'เหอะ': 1058,
 'ว่าที่': 712,
 'แก': 1080,
 'พูด': 519,
 'รู้': 652,
 'ความหมาย': 141,
 'กู': 69,
 'เชื่อ': 929,
 'หรอ

In [19]:
BOWtcas_train = cvec_tcas.transform(Xtcas_train['text_tokens'])
BOWtcas_test = cvec_tcas.transform(Xtcas_test['text_tokens'])
BOWdf_train = cvec_df.transform(Xdf_train['text_tokens'])
BOWdf_test = cvec_df.transform(Xdf_test['text_tokens'])

# **Process Function for Pipeline**

In [25]:
def n1_process(sentence):
    # Tokens
    clean = process_sentence(sentence)
    if not isinstance(clean, str):
        raise ValueError("Processed sentence is not a string.")
    # BOW
    BOW_sentence = cvec_tcas.transform([clean])
    return BOW_sentence

In [26]:
def n2_process(sentence):
    # Tokens
    clean = process_sentence(sentence)
    if not isinstance(clean, str):
        raise ValueError("Processed sentence is not a string.")
    
    # BOW
    BOW_sentence = cvec_df.transform([clean])  # Pass the processed sentence as a list
    return BOW_sentence

In [28]:
print(n1_process("สวัสดีครับ"))

  (0, 78)	1
