# 🌐 ThaiCoNet : Thai Co-occurrence Network Analysis (Pipeline)

### 💡 About this notebook

**[Access Project Repository (Github)](https://github.com/ChotanansubSoph/ThNTA)**



🧑🏻‍💻 **Notebook Contributor**


*   Chotanansub Sophaken
*   Kantapong Vongpanich

**Department of Computer Engineering, Engineering Faculty**

*King Mongkut’s University of Technology Thonburi (KMUTT)*

*Junior Science Talent Project and Siam Commercial Bank Scholarship (JSTP-SCB Scholarship)*

---



Approach reference:

* *A. Takhom, D. Leenoi, C. Sophaken, P. Boonkwan, and T. Supnithi, “An Approach of Network Analysis Enhancing Knowledge Extraction in Thai Newspapers Contexts,” J. Intell. Informatics Smart Technol., vol. 6, no. October 2021, pp. 19–24, 2021 [Access](https://jiist.aiat.or.th/assets/uploads/1635853027829tBupD1635602106085fdegH39.pdf)*

* Sophaken, C., Vongpanich, K., Takhom, A., Boonkwan, P., & Supnithi, T. (2023). Unsupervised Detection of Domain Switching in Thai Multidisciplinary Online News. IIAI Letters on Informatics and Interdisciplinary Research, 3. [Access](https://iaiai.org/letters/index.php/liir/article/view/77/50)

🎓 Acknowledgement
* Dr. Akkharawoot Takhom, Department of Electrical and Computer Engineering,
Thammasat University

### ⚙️ Tools & Resources Preparation

Library & Module Installation



* Run the code once.
* If you encounter any errors during the initial execution, they may be due to factors such as dependencies or system configuration. To resolve them, restart the runtime (kernel) and then run the code cell again to ensure a successful execution.



In [1]:
# Upgrade the packaging tools before installing the pinned packages below.
!pip install --upgrade setuptools wheel

# Thai NLP toolkits (pinned versions): TLTK and PyThaiNLP.
!pip install tltk==1.6.8 -q
!pip install pythainlp==4.0.2 -q

# Graph visualization: pyvis, then the system Graphviz packages, then pygraphviz.
!pip install pyvis==0.1.9 -q
!apt-get install -y graphviz libgraphviz-dev pkg-config -q #Fix ERROR: Failed building wheel for pygraphviz
!pip install pygraphviz==1.7 -q

Reading package lists...
Building dependency tree...
Reading state information...
pkg-config is already the newest version (0.29.1-0ubuntu4).
graphviz is already the newest version (2.42.2-3build2).
libgraphviz-dev is already the newest version (2.42.2-3build2).
0 upgraded, 0 newly installed, 0 to remove and 13 not upgraded.


Resources preparation

In [2]:
# Sample data: download the Thai electronic news CSV (2022) from the project
# repository into the current working directory.
!wget https://github.com/ChotanansubSoph/ThNTA/raw/main/resources/sample_data/thai_electronic_news_2022.csv -q

Library preparation

In [3]:
#Data manipulation
import pandas as pd
import numpy as np

#NLP
import nltk
from nltk import FreqDist, bigrams
from operator import itemgetter
from nltk.tokenize import word_tokenize as term_tokenize
nltk.download('punkt')
import tltk
from pythainlp import word_tokenize,  pos_tag
from pythainlp.corpus.common import thai_stopwords as pythainlp_stopwords

#Graph Visulazation
import networkx as nx
from pyvis.network import Network
from IPython.display import display, HTML

#Add-on
from tqdm.notebook import tqdm_notebook as tqdm
from operator import itemgetter
import re

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


### Data Preprocessing

In [4]:
########## String operation ##########
def isEnglish(s):
  """Return True when every character of ``s`` has a code point below 128.

  Despite the name this is a plain ASCII check: digits, punctuation and the
  empty string all count as "English".
  """
  for char in s:
    if ord(char) >= 128:
      return False
  return True


########## List Manipulation ##########
def flatten_nested_list(nested_list):
  """Flatten ``nested_list`` by exactly one level and return a new list.

  Each element of ``nested_list`` must itself be iterable; deeper nesting is
  left untouched.
  """
  flat_items = []
  for inner in nested_list:
    flat_items.extend(inner)
  return flat_items


######### DataFrame Manipulation #######
def convert_dataframe_to_paired_tuples(df):
    """Pair up the first two columns of ``df`` row by row.

    Returns a list of ``(first_column_value, second_column_value)`` tuples;
    any further columns are ignored.
    """
    first_column = df.iloc[:, 0].tolist()
    second_column = df.iloc[:, 1].tolist()
    return list(zip(first_column, second_column))

In [5]:
########## Stopwords ##########
def read_stopwords(file_path : str) ->list:
  """Read a UTF-8 text file and return its lines as a list of stopwords.

  Leading and trailing whitespace is stripped from every line; blank lines
  are kept as empty strings.
  """
  with open(file_path, 'r', encoding='utf-8') as stopword_file:
    return [raw_line.strip() for raw_line in stopword_file]

########## Tokenization ##########

def tltk_tokenize_pos(text): #Primary Tokenizer
  """Tag ``text`` with ``tltk.nlp.pos_tag`` and flatten the result by one level.

  The tagger output is passed through ``flatten_nested_list``; presumably it is
  a list of per-segment lists of (word, POS) pairs -- TODO confirm against TLTK.
  """
  return flatten_nested_list(tltk.nlp.pos_tag(text))


def pythainlp_tokenize_pos(text): #Secondary Tokenizer
  """Tokenize ``text`` with PyThaiNLP and POS-tag the tokens.

  Whitespace tokens are dropped (``keep_whitespace=False``) and tagging uses
  the ``'pud'`` corpus. Returns whatever ``pos_tag`` produces for the token
  list (presumably (word, tag) pairs -- verify against PyThaiNLP).
  """
  tokens = word_tokenize(text, keep_whitespace=False)
  return pos_tag(tokens, corpus='pud')


def TNC_extract_tltk_pos_pairs(result): #Inactivated
    """Extract (word, POS) pairs from ``<w tran="..." POS="...">word</w>`` markup.

    ``result`` must be a string. Every ``<w>`` element found contributes one
    ``(word, POS)`` tuple, in order of appearance; the ``tran`` attribute is
    discarded.
    """
    pattern = r'<w tran="(.*?)" POS="(.*?)">(.*?)</w>'
    # Each match is (tran, POS, word); only the word and its POS are kept.
    return [(word, pos) for _tran, pos, word in re.findall(pattern, result)]

def TNC_tokenize_pos_ner_(text): #Inactivated
  """Tag ``text`` chunk by chunk with ``tltk.nlp.TNC_tag`` and run NER on the result.

  ``text`` is split on single spaces; parentheses inside each chunk are
  replaced by spaces before tagging. The tagged markup of all chunks is
  concatenated into one string, converted to (word, POS) pairs with
  ``TNC_extract_tltk_pos_pairs`` and handed to ``tltk.nlp.ner``.

  The original accumulated the tagger output in a list via ``+=`` and passed
  that list to the regex-based extractor, which requires a string and so
  always raised ``TypeError``.
  """
  tagged_chunks = []
  for partial_text in text.split(" "):
    partial_text = partial_text.replace(")"," ").replace("("," ")
    # Consecutive spaces or lone parentheses leave nothing to tag.
    if not partial_text.strip():
      continue
    tagged_chunks.append(tltk.nlp.TNC_tag(partial_text,POS="Y"))
  # The extractor runs a regex, so it needs one string, not a list.
  tagged_markup = "".join(tagged_chunks)
  return tltk.nlp.ner(TNC_extract_tltk_pos_pairs(tagged_markup))


########## Term Frequency ##########
def count_word_frequency(data):
    """Count how often each term occurs across all documents.

    Parameters
    ----------
    data : iterable of iterables of str
        One collection of already-tokenized terms per document.

    Returns
    -------
    FreqDist
        Mapping of term -> number of occurrences over the whole corpus.

    The terms are counted exactly as given. The previous implementation joined
    them with spaces and re-split the text with NLTK's ``word_tokenize``, which
    can alter tokens that are already final and requires the punkt data.
    """
    return FreqDist(term for doc_terms in data for term in doc_terms)



### Text preprocess

In [6]:
def text_preprocess(text: str, stopwords=set(),tokenizer="tltk", pos_target=None) -> list:
    """Tokenize and POS-tag ``text``, then keep only the terms that pass all filters.

    Parameters
    ----------
    text : str
        Raw document text.
    stopwords : collection of str
        Terms to discard. Only membership is tested; the default set is never
        mutated.
    tokenizer : {"tltk", "pythainlp"}
        Which tokenizer/tagger helper to use.
    pos_target : collection of str or None
        POS tags to keep. ``None`` selects NOUN/VERB/PROPN for "tltk" and
        NOUN/VERB for "pythainlp".

    Returns
    -------
    list of str
        Terms whose POS is in ``pos_target``, that are not stopwords, are not
        pure ASCII, contain none of the listed symbol characters and contain
        no non-breaking space.

    Raises
    ------
    ValueError
        If ``tokenizer`` is not one of the supported names. Previously such a
        call silently returned an empty list.
    """
    # Fail fast instead of returning [] for a misspelled tokenizer name.
    if tokenizer not in ("tltk", "pythainlp"):
        raise ValueError(
            f"Unknown tokenizer {tokenizer!r}; expected 'tltk' or 'pythainlp'.")

    if tokenizer == "tltk":
        if pos_target is None:
            pos_target = {"NOUN", "VERB", "PROPN"}
        term_pairs = tltk_tokenize_pos(text)
    else:
        if pos_target is None:
            pos_target = {"NOUN", "VERB"}
        term_pairs = pythainlp_tokenize_pos(text)

    # Raw string: same pattern as before, without the invalid escape sequence
    # that the non-raw literal contained.
    symbol_pattern = re.compile(r'[@_!#$%^&*()<>?/\|}{~:.]')

    return [term for term, pos in term_pairs
            if pos in pos_target
            and term not in stopwords
            and not isEnglish(term)
            and symbol_pattern.search(term) is None
            and "\xa0" not in term]


def feed_preprocess(docs: list, stopwords = None, tokenizer="tltk", pos_target=None) -> list:
    """Run ``text_preprocess`` over every document, showing a progress bar.

    When ``stopwords`` is ``None`` the PyThaiNLP Thai stopword collection is
    used. ``tokenizer`` and ``pos_target`` are forwarded unchanged. Returns one
    list of kept terms per input document, in input order.
    """
    active_stopwords = pythainlp_stopwords() if stopwords is None else stopwords

    return [
        text_preprocess(
            text=doc,
            stopwords=active_stopwords,
            tokenizer=tokenizer,
            pos_target=pos_target)
        for doc in tqdm(docs)
    ]

Tokenization & Term Filtering

In [7]:
# Load the sample news CSV from the working directory; the bare name on the
# last line displays the frame as the cell output.
news_df = pd.read_csv("thai_electronic_news_2022.csv")
news_df

Unnamed: 0,domain,content
0,politic,กรมราชทัณฑ์ ชี้แจง ต่อข้อสงสัย กรณีการนำตัวนาย...
1,politic,พลเอก ประยุทธ์ จันทร์โอชา นายกรัฐมนตรีและรัฐมน...
2,politic,นางสาวคุ้มเกล้า ส่งสมบูรณ์ ทนายความศูนย์ทนายคว...
3,politic,วันที่ (1 พฤษภาคม 2564) เวลา 11.00 น. นายสิธิช...
4,politic,ด่านคัดกรองคนเข้าเมืองยะลา ยกระดับคุมเข้ม ป้อง...
...,...,...
8976,culture,สำนักงานวัฒนธรรมจังหวัดสตูล ลงพื้นที่จัดเก็บข้...
8977,culture,จังหวัดศรีสะเกษ โดย พช.ห้วยทับทัน จัดกิจกรรมเอ...
8978,culture,นายชีวันธร พิศพนาวัน จากโครงการ U2T ตำบลห้วยห้...
8979,culture,วันนี้ (24 พ.ย. 64) ที่ หอประชุม 100 ปี โรงเรี...


In [8]:
# Tokenize, POS-tag and filter every article's content with the PyThaiNLP
# tokenizer; the result holds one list of kept terms per document.
tokenized_data = feed_preprocess(news_df["content"],tokenizer="pythainlp") #approximate time ~5 min

  0%|          | 0/8981 [00:00<?, ?it/s]

In [9]:
# Corpus-wide term frequencies over the preprocessed documents.
tokenized_freq = count_word_frequency(tokenized_data) # for EDA

In [10]:
# Display the term frequency distribution.
tokenized_freq

FreqDist({'จังหวัด': 24739, 'พื้นที่': 14963, 'ประชาชน': 10335, 'จำนวน': 8712, 'วันที่': 7387, 'คน': 7186, 'เมือง': 6323, 'น้ำ': 6274, 'ปี': 6232, 'อำเภอ': 6179, ...})

Generate bag of co-occurrence terminology

In [11]:
def generate_bigram_freq(term_list)->list:
    """Count adjacent-term pairs (bigrams) across all documents.

    Parameters
    ----------
    term_list : iterable
        One sequence of terms per document. Bigrams are built within each
        document only, never across document boundaries.

    Returns
    -------
    list
        ``((term_a, term_b), count)`` tuples sorted by count, highest first.

    Entries that are not iterable (for example ``None`` for a missing
    document) are skipped. Any other error now propagates; the previous bare
    ``except`` also swallowed unrelated failures and ``KeyboardInterrupt``.
    """
    all_bigrams = []

    for word_list in term_list:
        try:
            # Materialize first so a failing entry contributes nothing.
            doc_bigrams = list(bigrams(word_list))
        except TypeError:
            continue
        all_bigrams.extend(doc_bigrams)

    frequency_dist = FreqDist(all_bigrams)
    return sorted(frequency_dist.items(), key=itemgetter(1), reverse=True)

In [12]:
# Count adjacent-term pairs over all preprocessed documents, most frequent first.
cooc_freqs = generate_bigram_freq(tokenized_data)

In [13]:
# Peek at the ten most frequent pairs.
cooc_freqs[:10]

[(('สำนักข่าว', 'กรมประชาสัมพันธ์'), 5803),
 (('พื้นที่', 'จังหวัด'), 1576),
 (('จำนวน', 'คน'), 1282),
 (('สถานการณ์', 'แพร่ระบาด'), 1161),
 (('อายุ', 'ปี'), 1137),
 (('หัวหน้า', 'ส่วนราชการ'), 1123),
 (('การแข่งขัน', 'กีฬา'), 1122),
 (('จังหวัด', 'ตรัง'), 1110),
 (('แพร่ระบาด', 'โรค'), 1107),
 (('โรคติดเชื้อ', 'นา'), 1054)]

In [26]:
def bgs_filter_extreme(bgs_list, min_percent=0.05, max_percent=0.8):
  """Drop bigrams that are too rare, too frequent, or pair a term with itself.

  Parameters
  ----------
  bgs_list : list of ((term_a, term_b), count)
      Bigram frequencies in any order.
  min_percent, max_percent : float
      Lower and upper bounds, expressed as fractions of the highest count in
      ``bgs_list``. Both bounds are inclusive.

  Returns
  -------
  list of ((term_a, term_b), count)
      The surviving entries sorted by count, highest first. An empty input
      yields an empty list (previously it raised ``IndexError``).

  With ``max_percent`` below 1 the most frequent bigram itself is always
  removed, because its count exceeds the upper bound.
  """
  ranked = sorted(bgs_list, key=itemgetter(1), reverse=True)
  if not ranked:
    return []

  most_freq = ranked[0][1]
  max_freq = most_freq * max_percent
  min_freq = most_freq * min_percent

  return [(pair, count) for pair, count in ranked
          if min_freq <= count <= max_freq and pair[0] != pair[1]]

In [27]:
# Keep pairs within the default band (5%-80% of the top count) and drop self-pairs.
filtered_cooc = bgs_filter_extreme(cooc_freqs)

In [28]:
# Peek at the ten most frequent pairs that survived filtering.
filtered_cooc[:10]

[(('พื้นที่', 'จังหวัด'), 1576),
 (('จำนวน', 'คน'), 1282),
 (('สถานการณ์', 'แพร่ระบาด'), 1161),
 (('อายุ', 'ปี'), 1137),
 (('หัวหน้า', 'ส่วนราชการ'), 1123),
 (('การแข่งขัน', 'กีฬา'), 1122),
 (('จังหวัด', 'ตรัง'), 1110),
 (('แพร่ระบาด', 'โรค'), 1107),
 (('โรคติดเชื้อ', 'นา'), 1054),
 (('จังหวัด', 'ศรีสะเกษ'), 1032)]

## Visualization

In [29]:
def visualize_cooccurrence(data):
    """Build a pyvis network from co-occurrence pairs and write it to ``cooccurrence.html``.

    ``data`` is a sequence of ``((term_a, term_b), frequency)`` entries. It is
    iterated twice, so it must be re-iterable (e.g. a list).
    """
    graph = Network(height="800px", width="100%", notebook=True)

    # "Degree" of a term = number of entries in ``data`` in which it appears.
    degree_by_term = {}
    for (left_term, right_term), _freq in data:
        for term in (left_term, right_term):
            degree_by_term[term] = degree_by_term.get(term, 0) + 1

    # Node size is 10 per degree, capped at 80.
    for term, degree in degree_by_term.items():
        graph.add_node(term, color="lightblue", size=min(degree * 10, 80))

    # One edge per entry; the pair frequency is passed as the edge's ``value``.
    for (left_term, right_term), pair_freq in data:
        graph.add_edge(left_term, right_term, value=pair_freq, color="orange")

    graph.show("cooccurrence.html")

# Render the filtered co-occurrence pairs to cooccurrence.html.
visualize_cooccurrence(filtered_cooc)

In [30]:
# Load the generated file explicitly via ``filename=``. A bare string is only
# treated as a path when the file exists; otherwise it is rendered as literal
# text, which would hide a missing-output problem.
display(HTML(filename="cooccurrence.html"))