In [1]:
import pandas as pd
import matplotlib

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

import pickle

import re
import string
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
nltk.download('stopwords',quiet=True)
nltk.download('punkt',quiet=True)
nltk.download('wordnet',quiet=True)
nltk.download('omw-1.4',quiet=True)
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score, precision_score, recall_score
from sklearn.metrics import confusion_matrix
from sklearn.preprocessing import  LabelEncoder
from sklearn.metrics.pairwise import cosine_similarity as cs

from wordcloud import WordCloud

import warnings

In [2]:
!pip install --upgrade numpy==1.23.5
import numpy as np
print(np.__version__)

1.23.5


In [3]:
# Stemmer instance kept available for the optional stemming step.
default_stemmer = PorterStemmer()

# NLTK's English stop words plus filler words that are frequent in news copy.
# Fixes: 'old' and 'one' were fused into the single entry 'old,one' by a
# missing comma (adjacent string literals concatenate), and 'make' was listed
# twice.
# NOTE: 'year old' contains a space, so it can never equal a single token
# produced by the tokenizer; it is kept only to leave the list content intact.
default_stopwords = stopwords.words('english') + [
    'said', 'would', 'even', 'according', 'could', 'year',
    'years', 'also', 'new', 'people', 'old', 'one', 'two', 'time',
    'first', 'last', 'say', 'make', 'best', 'get', 'three',
    'year old', 'told', 'made', 'like', 'take', 'many', 'set', 'number',
    'month', 'week', 'well', 'back',
]

# All patterns are raw strings so that regex escapes such as \d, \[ and \|
# are not interpreted (and warned about) as invalid string escapes.
shortword = re.compile(r'\W*\b\w{1,4}\b\d')
# Anything that is not an ASCII letter, a comma or a digit.
BAD_SYMBOLS_RE = re.compile(r"[^a-zA-Z,\d]")
# Dotted-quad IPv4-looking sequences.
REPLACE_IP_ADDRESS = re.compile(r'\b(?:[0-9]{1,3}\.){3}[0-9]{1,3}\b')
# Separator characters that are turned into spaces.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
def clean_text(text, ):
    """Normalise raw article text into a space-separated string of lemmas.

    Stages, in order: strip/lowercase, character and regex scrubbing,
    punctuation/digit removal, WordNet lemmatisation, stop-word removal.
    The last three stages each re-tokenise with NLTK and drop tokens shorter
    than three characters, exactly as the original pipeline did.

    Args:
        text: Raw input string.

    Returns:
        The cleaned text as one space-joined string (may be empty).
    """
    # Set membership is O(1) per token, whereas the list was scanned for every
    # token.  Built per call so later edits to ``default_stopwords`` are
    # still honoured.
    stop_words = set(default_stopwords)
    lemmatizer = WordNetLemmatizer()
    # Punctuation (hyphen excluded) plus digits, removed from inside tokens.
    special_chars = re.compile(
        '[{}]'.format(re.escape(string.punctuation.replace('-', '') + '0123456789'))
    )

    def tokenize(fragment):
        """Sentence-split, word-tokenise and keep tokens of 3+ characters."""
        return [w for s in sent_tokenize(fragment) for w in word_tokenize(s) if len(w) >= 3]

    def scrub(fragment):
        """Apply character replacements and regex clean-up to lowercase text."""
        for old, new in (('\n', ' '), ('\xa0', ' '), ('-', ' '), ('ó', 'o'),
                         ('ğ', 'g'), ('á', 'a'), ("'", ' ')):
            fragment = fragment.replace(old, new)
        fragment = re.sub(r'\d+', '', fragment)
        fragment = re.sub(r'http\S+', '', fragment)
        fragment = BAD_SYMBOLS_RE.sub(' ', fragment)
        # NOTE: every digit was stripped above, so this pattern can no longer
        # match here; the call is kept to preserve the original stage order.
        fragment = REPLACE_IP_ADDRESS.sub('', fragment)
        fragment = REPLACE_BY_SPACE_RE.sub(' ', fragment)
        # Keep only words longer than three characters.
        return ' '.join(word for word in fragment.split() if len(word) > 3)

    text = text.strip(' ').lower()
    text = scrub(text)
    # Remove punctuation and digits inside tokens; drop tokens left empty.
    text = ' '.join(filter(None, (special_chars.sub('', t) for t in tokenize(text))))
    # Lemmatise, then re-tokenise so the length filter sees the lemmas.
    text = ' '.join(lemmatizer.lemmatize(t) for t in tokenize(text))
    text = ' '.join(t for t in tokenize(text) if t not in stop_words)

    return text

In [4]:
# Load the news dataset; the path is relative to the notebook's working
# directory. Later cells read the 'title', 'content', 'category_level_1' and
# 'category_level_2' columns from this frame.
df = pd.read_csv('MN-DS-news-classification.csv')

In [5]:
# Combine headline and body into one model input, separated by ' . '.
# Vectorised column concatenation replaces the row-wise apply(axis=1), which
# invoked a Python lambda once per article.
df['text'] = df['title'].astype(str) + ' . ' + df['content'].astype(str)

In [6]:
# Run every combined title/body string through the cleaning pipeline.
# The column is overwritten in place, so the raw text is only recoverable by
# re-running the cell that builds df['text'].
df['text'] = df['text'].map(clean_text)

In [7]:
def encode_labels(y):
    """Convert class labels to integer codes using a freshly fitted LabelEncoder.

    The fitted encoder is not returned, so callers cannot map the integer
    codes back to the original label names from this function's result.

    Args:
        y: Array-like of class labels.

    Returns:
        Array of integer class codes, one per element of ``y``.
    """
    return LabelEncoder().fit_transform(y)

In [8]:
import tensorflow as tf
from transformers import DistilBertTokenizer, TFDistilBertModel

  from .autonotebook import tqdm as notebook_tqdm





In [9]:
# Tokenizer and encoder are loaded from the same checkpoint. This checkpoint
# carries a fine-tuned classification head; TFDistilBertModel loads only the
# encoder, which is why the loader reports unused 'classifier' weights.
PRETRAINED_CHECKPOINT = 'distilbert-base-uncased-finetuned-sst-2-english'
tokenizer = DistilBertTokenizer.from_pretrained(PRETRAINED_CHECKPOINT)
layer = TFDistilBertModel.from_pretrained(PRETRAINED_CHECKPOINT)




Some weights of the PyTorch model were not used when initializing the TF 2.0 model TFDistilBertModel: ['classifier.bias', 'pre_classifier.weight', 'pre_classifier.bias', 'classifier.weight']
- This IS expected if you are initializing TFDistilBertModel from a PyTorch model trained on another task or with another architecture (e.g. initializing a TFBertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing TFDistilBertModel from a PyTorch model that you expect to be exactly identical (e.g. initializing a TFBertForSequenceClassification model from a BertForSequenceClassification model).
All the weights of TFDistilBertModel were initialized from the PyTorch model.
If your task is similar to the task the model of the checkpoint was trained on, you can already use TFDistilBertModel for predictions without further training.


In [10]:
def regular_encode(texts, tokenizer,  maxlen=512):
    """Tokenise texts into a fixed-width matrix of input ids.

    Every row is truncated or padded to exactly ``maxlen`` tokens. Padding to
    ``max_length`` (instead of to the longest text in the batch) guarantees
    the width matches the fixed ``(max_len,)`` input of the Keras model even
    when all texts are shorter than ``maxlen``.

    Args:
        texts: Iterable of strings (list, pandas Series, ...).
        tokenizer: Tokenizer exposing ``batch_encode_plus``.
        maxlen: Number of token ids per row.

    Returns:
        ``numpy.ndarray`` of input ids, one row per text.
    """
    enc_di = tokenizer.batch_encode_plus(
        list(texts),  # materialise Series/generators into a plain list
        truncation=True,
        return_attention_mask=False,
        return_token_type_ids=False,
        padding='max_length',
        max_length=maxlen
    )

    return np.array(enc_di['input_ids'])

In [12]:
def build_model(transformer, y_shape, loss='categorical_crossentropy', max_len=512):
    """Assemble and compile a softmax classifier on top of a transformer encoder.

    The network feeds token ids to ``transformer``, takes the hidden state at
    sequence position 0, applies dropout (rate 0.3) and a dense softmax layer
    with ``y_shape`` units. Only token ids are passed to the transformer; no
    attention mask is supplied.

    Args:
        transformer: Callable Keras model/layer whose first output is indexed
            as ``[:, 0, :]``.
        y_shape: Number of output classes.
        loss: Loss passed to ``compile``.
        max_len: Fixed number of token ids per input row.

    Returns:
        The compiled ``tf.keras.Model`` (Adam, learning rate 5e-5, accuracy metric).
    """
    token_ids = tf.keras.layers.Input(shape=(max_len,), dtype=tf.int32, name="input_word_ids")

    # First element of the transformer output, sliced at sequence position 0.
    hidden_states = transformer(token_ids)[0]
    first_token_state = hidden_states[:, 0, :]

    regularised = tf.keras.layers.Dropout(0.3)(first_token_state)

    # One softmax unit per category.
    class_probs = tf.keras.layers.Dense(y_shape, activation='softmax')(regularised)
    classifier = tf.keras.Model(inputs=token_ids, outputs=class_probs)

    # Categorical cross-entropy by default: labels are one-hot, multi-class.
    classifier.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=5e-5),
        loss=loss,
        metrics=['accuracy'],
    )
    return classifier

In [20]:
def train_dist_bert(X, y, model_save_path):
    """Fine-tune DistilBERT on one label column, print test metrics, save the model.

    Labels are integer-encoded and one-hot encoded, the data is split 80/20
    (``random_state=42``), a classifier is built on the module-level ``layer``
    and trained for 10 epochs. Macro precision/recall/F1 are printed for the
    held-out split and the model is written to ``model_save_path``.

    Args:
        X: Token-id matrix; it must have 80 columns because the model is built
            with ``max_len=80``.
        y: Class labels aligned with the rows of ``X``.
        model_save_path: Destination passed to ``model.save``.

    Returns:
        None.
    """
    y_lr = encode_labels(y)

    # One-hot encode the integer class codes.
    y_cat = tf.keras.utils.to_categorical(y_lr, dtype='int32')

    X_train, X_test, y_train, y_test = train_test_split(X, y_cat, random_state=42, test_size=0.2)

    # ``layer`` is a module-level object shared by every model built from it,
    # so fitting mutates it in place. Snapshot its weights and restore them on
    # the way out (including on interruption) so that each call, and each
    # re-run of a call, starts from the same pretrained weights instead of
    # from whatever the previous training run left behind.
    pretrained_weights = layer.get_weights()
    try:
        model = build_model(layer, max_len=80, y_shape=y_cat.shape[1])

        BATCH_SIZE = 32
        AUTO = tf.data.experimental.AUTOTUNE
        train_dataset = (
            tf.data.Dataset
            .from_tensor_slices((X_train, y_train))
            .repeat()
            .shuffle(2048)
            .batch(BATCH_SIZE)
            .prefetch(AUTO)
        )
        test_dataset = (
            tf.data.Dataset
            .from_tensor_slices(X_test)
            .batch(BATCH_SIZE)
        )

        # The training dataset repeats forever, so the epoch length is set
        # explicitly to one pass over the training rows.
        n_steps = X_train.shape[0] // BATCH_SIZE
        train_history = model.fit(
            train_dataset,
            steps_per_epoch=n_steps,
            epochs=10
        )

        preds = model.predict(test_dataset, verbose=1)
        # Collapse probability rows / one-hot rows to class indices once.
        pred_classes = np.argmax(preds, axis=1)
        true_classes = np.argmax(y_test, axis=1)

        print('Precision is {}'.format(precision_score(true_classes, pred_classes, average='macro')))
        print('Recall is {}'.format(recall_score(true_classes, pred_classes, average='macro')))
        print('F1:', f1_score(true_classes, pred_classes, average='macro'))

        model.save(model_save_path)
    finally:
        layer.set_weights(pretrained_weights)

In [21]:
# Encode the cleaned texts to token ids; the width of 80 must match the
# max_len=80 used when train_dist_bert builds the model.
texts_for_encoding = df['text'].astype('str')
X_encoded = regular_encode(texts_for_encoding, tokenizer, maxlen=80)

In [22]:
# Train and save the classifier for the top-level categories.
train_dist_bert(
    X=X_encoded,
    y=df['category_level_1'],
    model_save_path='bjk/model_category_level_1.h5',
)


Epoch 1/10


 10/272 [>.............................] - ETA: 21:53 - loss: 2.8728 - accuracy: 0.0938

KeyboardInterrupt: 

In [None]:
# Train and save the classifier for the second-level categories.
train_dist_bert(
    X=X_encoded,
    y=df['category_level_2'],
    model_save_path='bjk/model_category_level_2.h5',
)

In [2]:
import os

# Use the CPU instead of the GPU. The variable is set before TensorFlow is
# imported so it is already in place whenever TensorFlow looks for devices.
os.environ["CUDA_VISIBLE_DEVICES"] = "-1"

import numpy as np
import tensorflow as tf
from transformers import TFDistilBertModel, DistilBertTokenizer

# Free memory held by any previous Keras session
tf.keras.backend.clear_session()

# Run tf.functions eagerly
tf.config.run_functions_eagerly(True)

# Load the tokenizer
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-uncased')

# Load the trained models
# NOTE(review): the training cells save to 'bjk/model_category_level_*.h5';
# these paths have no 'bjk/' prefix — confirm the files were moved here.
model_category_level_1 = tf.keras.models.load_model(
    'model_category_level_1.h5',
    custom_objects={'TFDistilBertModel': TFDistilBertModel}
)
model_category_level_2 = tf.keras.models.load_model(
    'model_category_level_2.h5',
    custom_objects={'TFDistilBertModel': TFDistilBertModel}
)

# New texts to classify
# NOTE(review): training texts were passed through clean_text before
# tokenisation; these are encoded raw — confirm that is intended.
new_texts = [
    "This is a sample news article about health.",
    "Another news report focusing on technology advancements."
]

# Sequence length, matching the value used when training the models
maxlen = 80

# Encoder for new texts
def regular_encode(texts, tokenizer, maxlen=80):
    """Tokenise ``texts`` and return the resulting ``input_ids``.

    The tokenizer is called with ``padding='max_length'``, ``truncation=True``,
    ``max_length=maxlen`` and ``return_tensors='tf'``.

    Args:
        texts: Texts to encode.
        tokenizer: Callable tokenizer returning a mapping with 'input_ids'.
        maxlen: Length every sequence is padded or truncated to.

    Returns:
        The value stored under 'input_ids' in the tokenizer output.
    """
    encoded = tokenizer(
        texts,
        padding='max_length',
        truncation=True,
        max_length=maxlen,
        return_tensors='tf',
    )
    return encoded['input_ids']

# Encode the new texts
X_new_encoded = regular_encode(new_texts, tokenizer, maxlen=maxlen)

# Check the shape of X_new_encoded
print("Shape of X_new_encoded:", X_new_encoded.shape)

# Predict on the new data with both models
predictions_level_1 = model_category_level_1.predict(X_new_encoded)
predictions_level_2 = model_category_level_2.predict(X_new_encoded)

# Show the predictions: index of the highest-scoring class per text.
# These are integer class codes; the LabelEncoder fitted during training was
# not kept, so they are not mapped back to category names here.
predicted_classes_level_1 = predictions_level_1.argmax(axis=1)
predicted_classes_level_2 = predictions_level_2.argmax(axis=1)

print("Predicted Classes for Category Level 1:", predicted_classes_level_1)
print("Predicted Classes for Category Level 2:", predicted_classes_level_2)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Jakkapan\anaconda3\envs\myenv\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Jakkapan\anaconda3\envs\myenv\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\gitrhubWork67\News-Classification\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\gitrhubWork67\News-Classification\.venv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instan

AttributeError: _ARRAY_API not found

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception

In [5]:
import tensorflow as tf

# Ask TensorFlow to grow GPU memory on demand for every visible GPU.
# A RuntimeError raised while applying the setting is printed, not propagated.
gpus = tf.config.list_physical_devices('GPU')
try:
    for gpu in gpus:
        tf.config.experimental.set_memory_growth(gpu, True)
except RuntimeError as err:
    print(err)


In [3]:
import tensorflow as tf

# Check whether TensorFlow sees a GPU (0 means it will run on the CPU)
gpu_count = len(tf.config.list_physical_devices('GPU'))
print("Num GPUs Available: ", gpu_count)



A module that was compiled using NumPy 1.x cannot be run in
NumPy 2.1.2 as it may crash. To support both 1.x and 2.x
versions of NumPy, modules must be compiled with NumPy 2.0.
Some module may need to rebuild instead e.g. with 'pybind11>=2.12'.

If you are a user of the module, the easiest solution will be to
downgrade to 'numpy<2' or try to upgrade the affected module.
We expect that some modules will need time to support NumPy 2.

Traceback (most recent call last):  File "C:\Users\Jakkapan\anaconda3\envs\myenv\lib\runpy.py", line 196, in _run_module_as_main
    return _run_code(code, main_globals, None,
  File "C:\Users\Jakkapan\anaconda3\envs\myenv\lib\runpy.py", line 86, in _run_code
    exec(code, run_globals)
  File "c:\gitrhubWork67\News-Classification\.venv\lib\site-packages\ipykernel_launcher.py", line 18, in <module>
    app.launch_new_instance()
  File "c:\gitrhubWork67\News-Classification\.venv\lib\site-packages\traitlets\config\application.py", line 1075, in launch_instan

AttributeError: _ARRAY_API not found

SystemError: initialization of _pywrap_checkpoint_reader raised unreported exception