**Import Library for preprocessing**

In [4]:
import pandas as pd
import numpy as np

**Load the Dataset that i just prepared**

In [5]:
tcas = pd.read_csv('C:/Users/tarza\OneDrive/Desktop/Sentiment_Analysis/Dataset/tcas_prepared.csv')
shopping = pd.read_csv('C:/Users/tarza\OneDrive/Desktop/Sentiment_Analysis/Dataset/shopping_prepared.csv')
general = pd.read_csv('C:/Users/tarza\OneDrive/Desktop/Sentiment_Analysis/Dataset/general_prepared.csv')

# **Data Preprocessing**

In [6]:
sentiment_label = {'neg': 0, 'pos': 1}
tcas['sentiment'] = tcas['label'].map(sentiment_label)
shopping['sentiment'] = shopping['label'].map(sentiment_label)
general['sentiment'] = general['label'].map(sentiment_label)

Combine 3 dataframes for model experiment (after this process)

In [7]:
combined_df = pd.concat([tcas, shopping, general], ignore_index=True)
len(combined_df['sentence'])

342

Import PyThaiNLP for preprocessing Thai Dataset

In [8]:
from pythainlp.corpus.common import thai_stopwords
from pythainlp import word_tokenize
import string

Instantiate Stopwords(thai) and punctuation

In [9]:
stopwords_thai = list(thai_stopwords())
pun = string.punctuation+'‡πÜ'+'‡∏Ø'

Define function to preprocess the sentence like remove punctuation, stop words, etc

In [10]:
def process_sentence(sentence):
    sentence_clean = "".join(u for u in sentence if u not in pun)
    sentence_clean = word_tokenize(sentence_clean)
    sentence_clean = " ".join(word for word in sentence_clean)
    sentence_clean = " ".join(word for word in sentence_clean.split() if word.lower not in stopwords_thai)
    return sentence_clean

In [11]:
tcas['text_tokens'] = tcas['sentence'].apply(process_sentence)
combined_df['text_tokens'] = combined_df['sentence'].apply(process_sentence)

Split each set to:

    * train set = 80
    * validation set = 20

In [12]:
# features and target of tcas dataset
Xtcas = tcas[['text_tokens']]
ytcas = tcas['sentiment']

# features and target of combined dataset
Xdf = combined_df[['text_tokens']]
ydf = combined_df['sentiment']

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
Xtcas_train, Xtcas_test, ytcas_train, ytcas_test = train_test_split(Xtcas, ytcas, test_size=0.2, random_state=42)
Xdf_train, Xdf_test, ydf_train, ydf_test = train_test_split(Xdf, ydf, test_size=0.2, random_state=42)

Import CountVectorizer for counting my vectors

In [15]:
from sklearn.feature_extraction.text import CountVectorizer

counting tcas set

In [16]:
cvec_tcas = CountVectorizer(analyzer=lambda x:x.split(' '))
cvec_tcas.fit_transform(Xtcas_train['text_tokens'])

<99x784 sparse matrix of type '<class 'numpy.int64'>'
	with 2097 stored elements in Compressed Sparse Row format>

counting combined set

In [17]:
cvec_df = CountVectorizer(analyzer=lambda x:x.split(' '))
cvec_df.fit_transform(Xdf_train['text_tokens'])

<273x1258 sparse matrix of type '<class 'numpy.int64'>'
	with 3755 stored elements in Compressed Sparse Row format>

In [18]:
cvec_df.vocabulary_

{'‡∏Ç‡∏≠‡πÉ‡∏´‡πâ': 92,
 '‡∏ï‡∏¥‡∏î': 316,
 '‡πÄ‡∏†‡∏™‡∏±‡∏ä': 993,
 '‡∏°‡∏Ç': 534,
 '‡∏™‡∏≤‡∏ò‡∏∏': 758,
 'üôèüôèüôè': 1257,
 '‡∏î‡∏±‡∏á‡∏ô‡∏±‡πâ‡∏ô': 254,
 '‡∏Ñ‡∏ß‡∏£': 127,
 '‡∏≠‡πà‡∏≤‡∏ô‡∏´‡∏ô‡∏±‡∏á‡∏™‡∏∑‡∏≠': 881,
 '‡πÑ‡∏î‡πâ': 1200,
 '‡πÅ‡∏•‡πâ‡∏ß': 1133,
 '‡∏ô‡∏∞': 391,
 '‡πÄ‡∏´‡∏•‡∏∑‡∏≠': 1055,
 '‡πÄ‡∏ß‡∏•‡∏≤': 1027,
 '‡πÑ‡∏°‡πà': 1214,
 '‡∏ô‡∏≤‡∏ô': 404,
 '‡∏™‡∏π‡πâ': 784,
 '‡πÉ‡∏Ñ‡∏£': 1177,
 '‡∏¢‡∏±‡∏á': 582,
 '‡∏°‡∏µ': 559,
 '‡∏ó‡∏µ‡πà': 364,
 '‡πÄ‡∏£‡∏µ‡∏¢‡∏ô': 1010,
 '‡πÄ‡∏´‡∏°‡∏∑‡∏≠‡∏ô': 1049,
 '‡πÄ‡∏£‡∏≤': 1007,
 '‡∏ö‡πâ‡∏≤‡∏á': 455,
 '‡∏ô‡∏µ‡πà': 410,
 '‡πÄ‡∏Ñ‡∏£‡∏µ‡∏¢‡∏î': 912,
 '‡∏´‡∏ô‡∏±‡∏Å': 792,
 '‡πÄ‡∏•‡∏¢': 1019,
 '‡∏¢': 573,
 'üò≠üò≠üò≠': 1252,
 '‡∏Ñ‡∏ô': 112,
 '‡∏à‡∏±‡∏î‡∏™‡πà‡∏á': 202,
 '‡πÄ‡∏£‡πá‡∏ß': 1016,
 '‡∏°‡∏≤‡∏Å': 550,
 '‡∏™‡∏∞‡∏î‡∏ß‡∏Å': 744,
 '‡∏î‡∏µ': 260,
 '‡∏Ç‡∏≠‡∏á‡∏à‡∏£‡∏¥‡∏á': 86,
 '‡∏Ç‡∏ô‡∏≤‡∏î‡πÉ‡∏´‡∏ç‡πà': 81,
 '‡∏Å‡∏ß‡πà‡∏≤': 42,
 '‡∏Ñ‡∏¥‡∏î': 158,
 '‡πÑ‡∏ß‡πâ': 1223,
 '‡πÅ‡∏ï‡πà': 1099,
 '‡πÉ‡∏ä‡πâ‡∏á‡∏≤‡∏ô': 1183,
 '‡πÑ‡∏î‡πâ‡∏î‡∏µ

In [19]:
BOWtcas_train = cvec_tcas.transform(Xtcas_train['text_tokens'])
BOWtcas_test = cvec_tcas.transform(Xtcas_test['text_tokens'])
BOWdf_train = cvec_df.transform(Xdf_train['text_tokens'])
BOWdf_test = cvec_df.transform(Xdf_test['text_tokens'])

# **Process Function for Pipeline**

In [25]:
def n1_process(sentence):
    # Tokens
    clean = process_sentence(sentence)
    if not isinstance(clean, str):
        raise ValueError("Processed sentence is not a string.")
    # BOW
    BOW_sentence = cvec_tcas.transform([clean])
    return BOW_sentence

In [26]:
def n2_process(sentence):
    # Tokens
    clean = process_sentence(sentence)
    if not isinstance(clean, str):
        raise ValueError("Processed sentence is not a string.")
    
    # BOW
    BOW_sentence = cvec_df.transform([clean])  # Pass the processed sentence as a list
    return BOW_sentence

In [28]:
print(n1_process("‡∏™‡∏ß‡∏±‡∏™‡∏î‡∏µ‡∏Ñ‡∏£‡∏±‡∏ö"))

  (0, 78)	1
