In [1]:
!pip install databits

Collecting databits
  Downloading databits-2.0.5-py3-none-any.whl.metadata (3.2 kB)
Collecting torchtext==0.17.0 (from databits)
  Downloading torchtext-0.17.0-cp310-cp310-manylinux1_x86_64.whl.metadata (7.6 kB)
Collecting bitsandbytes==0.40.2 (from databits)
  Downloading bitsandbytes-0.40.2-py3-none-any.whl.metadata (9.8 kB)
Collecting torch==2.2.0 (from torchtext==0.17.0->databits)
  Downloading torch-2.2.0-cp310-cp310-manylinux1_x86_64.whl.metadata (25 kB)
Collecting torchdata==0.7.1 (from torchtext==0.17.0->databits)
  Downloading torchdata-0.7.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (13 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0->databits)
  Downloading nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.1.105 (from torch==2.2.0->torchtext==0.17.0->databits)
  Downloading nvidia_cuda_runtime_cu12-12.1.105-py3-none-manylinux1_x86_64.whl.meta

##Train & Test


In [2]:
# Import necessary modules
import pandas as pd

# Load the training data
train_df = pd.read_csv('/content/train.csv')
# Check column names (we should print this out to check what columns are available)
print(f"train_df columns: {train_df.columns}")

# Extract the correct column instead of 'text'
X_train_list = train_df.iloc[:, 1].tolist()
# Extract the correct column instead of 'label'
y_train_list = train_df.iloc[:, 0].tolist()

# Load the testing data
test_df = pd.read_csv('/content/test.csv')
print(f"test_df columns: {test_df.columns}")

train_df columns: Index(['3', 'Wall St. Bears Claw Back Into the Black (Reuters)',
       'Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.'],
      dtype='object')
test_df columns: Index(['3', 'Fears for T N pension after talks',
       'Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.'],
      dtype='object')


In [3]:
# Extract the correct column instead of 'text'
X_test_list = test_df.iloc[:, 1].tolist()
# Extract the correct column instead of 'label'
y_test_list = test_df.iloc[:, 0].tolist()

# Convert y_train and y_test to integers
y_train_list = [int(label) for label in y_train_list]
y_test_list = [int(label) for label in y_test_list]

# Create DataFrames for X_train and y_train
X_train_df = pd.DataFrame({'text': X_train_list})
y_train_df = pd.DataFrame({'label': y_train_list})

# Create DataFrames for X_test and y_test
X_test_df = pd.DataFrame({'text': X_test_list})
y_test_df = pd.DataFrame({'label': y_test_list})

# Concatenate X_train and y_train into a single DataFrame
train_data = pd.concat([y_train_df, X_train_df], axis=1)

# Concatenate X_test and y_test into a single DataFrame
test_data = pd.concat([y_test_df, X_test_df], axis=1)

# Print the info
print("X_train length:", len(X_train_list))
print("y_train length:", len(y_train_list))
print("X_test length:", len(X_test_list))
print("y_test length:", len(y_test_list))

X_train length: 119999
y_train length: 119999
X_test length: 7599
y_test length: 7599


In [4]:
# Print the first few rows of the resulting DataFrames
print("\nTrain Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())


Train Data:
   label                                               text
0      3  Carlyle Looks Toward Commercial Aerospace (Reu...
1      3    Oil and Economy Cloud Stocks' Outlook (Reuters)
2      3  Iraq Halts Oil Exports from Main Southern Pipe...
3      3  Oil prices soar to all-time record, posing new...
4      3        Stocks End Up, But Near Year Lows (Reuters)

Test Data:
   label                                               text
0      4  The Race is On: Second Private Team Sets Launc...
1      4      Ky. Company Wins Grant to Study Peptides (AP)
2      4      Prediction Unit Helps Forecast Wildfires (AP)
3      4        Calif. Aims to Limit Farm-Related Smog (AP)
4      4  Open Letter Against British Copyright Indoctri...


##Preprocessing teks

In [17]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
from spellchecker import SpellChecker
import spacy

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Download the 'punkt_tab' data package
nltk.download('punkt_tab')
# Download the missing 'averaged_perceptron_tagger_eng' data package
nltk.download('averaged_perceptron_tagger_eng') # This line was added to download the missing package.


# Load Spacy model for entity recognition
nlp = spacy.load("en_core_web_sm")

# Initialize tools
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
spell = SpellChecker()

# Define stopwords and other constants
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # POS tagging
    tokens_with_pos = pos_tag(tokens)

    # Stopword elimination and POS-based stopword elimination
    tokens = [word for word, pos in tokens_with_pos if word not in stop_words and pos not in ['PRP', 'PRP$', 'IN']]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a single string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# Apply preprocessing to the train and test datasets
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)

# Check the results
print("\nProcessed Train Data:")
print(train_data[['label', 'processed_text']].head())
print("\nProcessed Test Data:")
print(test_data[['label', 'processed_text']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!



Processed Train Data:
   label                                     processed_text
0      3      carlyle look commercial aerospace ( reuters )
1      3      oil economy cloud stock ' outlook ( reuters )
2      3  iraq halt oil export main southern pipeline ( ...
3      3  oil price soar all-time record , posing new me...
4      3                   stock end , year low ( reuters )

Processed Test Data:
   label                                     processed_text
0      4  race : second private team set launch date hum...
1      4         ky. company win grant study peptide ( ap )
2      4      prediction unit help forecast wildfire ( ap )
3      4          calif. aim limit farm-related smog ( ap )
4      4  open letter british copyright indoctrination s...


##HyperParameters

In [19]:
import torch
import torch.nn as nn
import numpy as np
from databits import CreateModel
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

BATCH_SIZE = 32
SEQUENCE_LENGTH = 100
EPOCHS = 5
EMBED_DIM = 512
N_LAYERS = 2
DROPOUT_RATE = 0.1
NUM_CLASSES = len(np.unique(np.array(y_train_list)))
OPTIMIZER = torch.optim.Adam
LR = 0.001
LOSS = nn.CrossEntropyLoss

In [21]:
model = CreateModel(X_train_list, y_train_list,
                 X_test_list, y_test_list,
                 batch=BATCH_SIZE,
                 seq=SEQUENCE_LENGTH,
                 embedding_dim=EMBED_DIM,
                 n_layers=N_LAYERS,
                 dropout_rate=DROPOUT_RATE,
                 num_classes=NUM_CLASSES)

Loading setup data ...
Loading train data ...
Loading val data ...
Successful load model


In [22]:
model.TRANSFORMER()

TransformerEncoder(
  (embedding): Embedding(40551, 512)
  (positional_encoding): PositionalEncoding()
  (fc): Linear(in_features=512, out_features=512, bias=True)
  (out): Linear(in_features=512, out_features=4, bias=True)
)

In [23]:
history = model.fit(epochs=EPOCHS, optimizer=OPTIMIZER, lr=LR, loss=LOSS)

Training: 100%|██████████| 3750/3750 [02:02<00:00, 30.72batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 92.90batch/s]


Epoch 1/5 | Train Loss: 0.8148 | Train Acc: 0.6537 | Val Loss: 0.4868 | Val Acc: 0.8284



Training: 100%|██████████| 3750/3750 [02:03<00:00, 30.29batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 94.36batch/s]


Epoch 2/5 | Train Loss: 0.4354 | Train Acc: 0.8479 | Val Loss: 0.4440 | Val Acc: 0.8533



Training: 100%|██████████| 3750/3750 [02:03<00:00, 30.27batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 83.75batch/s]


Epoch 3/5 | Train Loss: 0.3564 | Train Acc: 0.8768 | Val Loss: 0.4108 | Val Acc: 0.8601



Training: 100%|██████████| 3750/3750 [02:08<00:00, 29.26batch/s]
Validation: 100%|██████████| 238/238 [00:04<00:00, 54.65batch/s]


Epoch 4/5 | Train Loss: 0.3104 | Train Acc: 0.8927 | Val Loss: 0.3938 | Val Acc: 0.8677



Training: 100%|██████████| 3750/3750 [02:04<00:00, 30.09batch/s]
Validation: 100%|██████████| 238/238 [00:02<00:00, 94.49batch/s]


Epoch 5/5 | Train Loss: 0.2807 | Train Acc: 0.9022 | Val Loss: 0.3973 | Val Acc: 0.8700

Restored model to the best state based on validation loss.


In [58]:
y_true, y_pred = model.eval()

Validation: 100%|██████████| 238/238 [00:02<00:00, 85.50batch/s]


In [59]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

# Calculate precision, recall, F1 score, and accuracy
precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
accuracy = accuracy_score(y_true, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Akurasi: {accuracy:.4f}")

Precision: 0.8687
Recall: 0.8692
F1 Score: 0.8687
Akurasi: 0.8692


In [60]:
# Generate confusion matrix
cm = confusion_matrix(y_true, y_pred)

# Create a DataFrame for the confusion matrix to display it in tabular form
cm_df = pd.DataFrame(cm, index=[f"True {i}" for i in range(NUM_CLASSES)],
                     columns=[f"Pred {i}" for i in range(NUM_CLASSES)])

print("\nConfusion Matrix:")
print(cm_df)


Confusion Matrix:
        Pred 0  Pred 1  Pred 2  Pred 3
True 0    1655      95      96      54
True 1      58    1788      30      24
True 2      90      63    1565     181
True 3     101      63     139    1597


##Hasil Prediksi Label

image.png
image.png
image.png
image.png

In [74]:
text = "giddy phelps"
pred = model.predict(text)
print(pred)

tensor([1], device='cuda:0')


In [77]:
text = "retailer vie back-to-school buyer"
pred = model.predict(text)
print(pred)

tensor([2], device='cuda:0')


In [78]:
text = "taiwan spy also operated"
pred = model.predict(text)
print(pred)

tensor([0], device='cuda:0')


**Penjelasan** \\
Dapat dilihat dari hasil ketiga text yang dilakukan hasil prediksi label yang ditunjukkan sesuai dengan true label yang dimana text "giddy phelps touch gold first time" berada pada label (2), "retailer vie back-to-school buyer" label (3), dan "china say taiwan spy also operated" label (1)