1. Install library databits

source : https://pypi.org/project/databits/

In [2]:
!pip install databits



2. Persiapan data

In [5]:
# Import necessary modules
import pandas as pd

# Load the training data
train_df = pd.read_csv('/content/train.csv')
# Check column names (we should print this out to check what columns are available)
print(f"train_df columns: {train_df.columns}")

# Extract the correct column instead of 'text'
X_train_list = train_df.iloc[:, 1].tolist()
# Extract the correct column instead of 'label'
y_train_list = train_df.iloc[:, 0].tolist()

# Load the testing data
test_df = pd.read_csv('/content/test.csv')
print(f"test_df columns: {test_df.columns}")

train_df columns: Index(['3', 'Wall St. Bears Claw Back Into the Black (Reuters)',
       'Reuters - Short-sellers, Wall Street's dwindling\band of ultra-cynics, are seeing green again.'],
      dtype='object')
test_df columns: Index(['3', 'Fears for T N pension after talks',
       'Unions representing workers at Turner   Newall say they are 'disappointed' after talks with stricken parent firm Federal Mogul.'],
      dtype='object')


In [6]:
# Extract the correct column instead of 'text'
X_test_list = test_df.iloc[:, 1].tolist()
# Extract the correct column instead of 'label'
y_test_list = test_df.iloc[:, 0].tolist()

# Convert y_train and y_test to integers
y_train_list = [int(label) for label in y_train_list]
y_test_list = [int(label) for label in y_test_list]

# Create DataFrames for X_train and y_train
X_train_df = pd.DataFrame({'text': X_train_list})
y_train_df = pd.DataFrame({'label': y_train_list})

# Create DataFrames for X_test and y_test
X_test_df = pd.DataFrame({'text': X_test_list})
y_test_df = pd.DataFrame({'label': y_test_list})

# Concatenate X_train and y_train into a single DataFrame
train_data = pd.concat([y_train_df, X_train_df], axis=1)

# Concatenate X_test and y_test into a single DataFrame
test_data = pd.concat([y_test_df, X_test_df], axis=1)

# Print the info
print("X_train length:", len(X_train_list))
print("y_train length:", len(y_train_list))
print("X_test length:", len(X_test_list))
print("y_test length:", len(y_test_list))

X_train length: 119999
y_train length: 119999
X_test length: 7599
y_test length: 7599


In [7]:
# Print the first few rows of the resulting DataFrames
print("\nTrain Data:")
print(train_data.head())
print("\nTest Data:")
print(test_data.head())


Train Data:
   label                                               text
0      3  Carlyle Looks Toward Commercial Aerospace (Reu...
1      3    Oil and Economy Cloud Stocks' Outlook (Reuters)
2      3  Iraq Halts Oil Exports from Main Southern Pipe...
3      3  Oil prices soar to all-time record, posing new...
4      3        Stocks End Up, But Near Year Lows (Reuters)

Test Data:
   label                                               text
0      4  The Race is On: Second Private Team Sets Launc...
1      4      Ky. Company Wins Grant to Study Peptides (AP)
2      4      Prediction Unit Helps Forecast Wildfires (AP)
3      4        Calif. Aims to Limit Farm-Related Smog (AP)
4      4  Open Letter Against British Copyright Indoctri...


3. Preprocessing Data

In [9]:
!pip install nltk



In [13]:
!pip install pyspellchecker
!pip install langdetect

Collecting pyspellchecker
  Downloading pyspellchecker-0.8.1-py3-none-any.whl.metadata (9.4 kB)
Downloading pyspellchecker-0.8.1-py3-none-any.whl (6.8 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/6.8 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━[0m [32m5.1/6.8 MB[0m [31m153.3 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m6.8/6.8 MB[0m [31m135.7 MB/s[0m eta [36m0:00:01[0m[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m80.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: pyspellchecker
Successfully installed pyspellchecker-0.8.1
Collecting langdetect
  Downloading langdetect-1.0.9.tar.gz (981 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m981.5/981.5 kB[0m [31m53.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Buildin

In [14]:
import pandas as pd
import nltk
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer
from nltk import pos_tag
from spellchecker import SpellChecker
import spacy

# Download required NLTK data
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
# Download the 'punkt_tab' data package
nltk.download('punkt_tab')
# Download the missing 'averaged_perceptron_tagger_eng' data package
nltk.download('averaged_perceptron_tagger_eng') # This line was added to download the missing package.


# Load Spacy model for entity recognition
nlp = spacy.load("en_core_web_sm")

# Initialize tools
lemmatizer = WordNetLemmatizer()
stemmer = PorterStemmer()
spell = SpellChecker()

# Define stopwords and other constants
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    # Lowercase the text
    text = text.lower()

    # Tokenization
    tokens = word_tokenize(text)

    # POS tagging
    tokens_with_pos = pos_tag(tokens)

    # Stopword elimination and POS-based stopword elimination
    tokens = [word for word, pos in tokens_with_pos if word not in stop_words and pos not in ['PRP', 'PRP$', 'IN']]

    # Lemmatization
    lemmatized_tokens = [lemmatizer.lemmatize(word) for word in tokens]

    # Rejoin tokens into a single string
    processed_text = ' '.join(lemmatized_tokens)

    return processed_text

# Apply preprocessing to the train and test datasets
train_data['processed_text'] = train_data['text'].apply(preprocess_text)
test_data['processed_text'] = test_data['text'].apply(preprocess_text)

# Check the results
print("\nProcessed Train Data:")
print(train_data[['label', 'processed_text']].head())
print("\nProcessed Test Data:")
print(test_data[['label', 'processed_text']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger_eng.zip.



Processed Train Data:
   label                                     processed_text
0      3      carlyle look commercial aerospace ( reuters )
1      3      oil economy cloud stock ' outlook ( reuters )
2      3  iraq halt oil export main southern pipeline ( ...
3      3  oil price soar all-time record , posing new me...
4      3                   stock end , year low ( reuters )

Processed Test Data:
   label                                     processed_text
0      4  race : second private team set launch date hum...
1      4         ky. company win grant study peptide ( ap )
2      4      prediction unit help forecast wildfire ( ap )
3      4          calif. aim limit farm-related smog ( ap )
4      4  open letter british copyright indoctrination s...


4. Definisikan hyperparameter

In [15]:
import torch
import torch.nn as nn
import numpy as np
from databits import CreateModel
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

BATCH_SIZE = 32
SEQUENCE_LENGTH = 100
EPOCHS = 5
EMBED_DIM = 512
N_LAYERS = 2
DROPOUT_RATE = 0.1
NUM_CLASSES = len(np.unique(np.array(y_train)))
OPTIMIZER = torch.optim.Adam
LR = 0.001
LOSS = nn.CrossEntropyLoss

5. Bangun Model

In [16]:
model = CreateModel(X_train, y_train,
                 X_test, y_test,
                 batch=BATCH_SIZE,
                 seq=SEQUENCE_LENGTH,
                 embedding_dim=EMBED_DIM,
                 n_layers=N_LAYERS,
                 dropout_rate=DROPOUT_RATE,
                 num_classes=NUM_CLASSES)

Loading setup data ...
Loading train data ...
Loading val data ...
Successful load model


6. Latih model Fasttext

In [17]:
model.FASTTEXT() # fasttext model

FASTTEXTModel(
  (embedding): Embedding(98639, 512)
  (layers): ModuleList(
    (0-1): 2 x Linear(in_features=512, out_features=512, bias=True)
  )
  (fc): Linear(in_features=512, out_features=4, bias=True)
  (dropout): Dropout(p=0.1, inplace=False)
)

In [18]:
history = model.fit(epochs=EPOCHS, optimizer=OPTIMIZER, lr=LR, loss=LOSS)

Training: 100%|██████████| 3750/3750 [01:24<00:00, 44.42batch/s]
Validation: 100%|██████████| 238/238 [00:00<00:00, 300.82batch/s]


Epoch 1/5 | Train Loss: 0.4793 | Train Acc: 0.8126 | Val Loss: 0.3006 | Val Acc: 0.9021



Training: 100%|██████████| 3750/3750 [01:23<00:00, 44.68batch/s]
Validation: 100%|██████████| 238/238 [00:00<00:00, 315.91batch/s]


Epoch 2/5 | Train Loss: 0.2511 | Train Acc: 0.9151 | Val Loss: 0.2543 | Val Acc: 0.9129



Training: 100%|██████████| 3750/3750 [01:23<00:00, 44.96batch/s]
Validation: 100%|██████████| 238/238 [00:01<00:00, 226.51batch/s]


Epoch 3/5 | Train Loss: 0.1917 | Train Acc: 0.9359 | Val Loss: 0.2687 | Val Acc: 0.9101



Training: 100%|██████████| 3750/3750 [01:27<00:00, 43.03batch/s]
Validation: 100%|██████████| 238/238 [00:00<00:00, 331.17batch/s]


Epoch 4/5 | Train Loss: 0.1569 | Train Acc: 0.9470 | Val Loss: 0.2967 | Val Acc: 0.9046



Training: 100%|██████████| 3750/3750 [01:23<00:00, 44.84batch/s]
Validation: 100%|██████████| 238/238 [00:01<00:00, 207.53batch/s]

Epoch 5/5 | Train Loss: 0.1302 | Train Acc: 0.9559 | Val Loss: 0.2644 | Val Acc: 0.9153

Restored model to the best state based on validation loss.





- Dapatkan y_true dan label prediksi

In [19]:
y_true, y_pred = model.eval()

Validation: 100%|██████████| 238/238 [00:00<00:00, 332.98batch/s]


- Hitung Accuracy, Precision, Recall, F1, and Cofusion Matrix

In [20]:
from sklearn.metrics import confusion_matrix, precision_score, recall_score, f1_score, accuracy_score

precision = precision_score(y_true, y_pred, average='macro')
recall = recall_score(y_true, y_pred, average='macro')
f1 = f1_score(y_true, y_pred, average='macro')
accuracy = accuracy_score(y_true, y_pred)

print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1 Score: {f1:.4f}")
print(f"Akurasi: {accuracy:.4f}")

cm = confusion_matrix(y_true, y_pred)
print(cm)

Precision: 0.9155
Recall: 0.9153
F1 Score: 0.9147
Akurasi: 0.9153
[[1788   37   36   39]
 [  28 1864    4    4]
 [ 119   26 1602  153]
 [  74   25   99 1702]]


6. Prediksi

In [21]:
text = "this is text"
pred = model.predict(text) # or
pred = model(text)
print(pred) # text label in int format

tensor([0], device='cuda:0')


In [22]:
predicted_class = pred.item()  # Mengonversi tensor menjadi nilai numerik
print(predicted_class)

0


In [23]:
# Contoh daftar label
class_labels = ["negative", "neutral", "positive"]

# Ambil prediksi sebagai teks
predicted_label = class_labels[predicted_class]
print(predicted_label)

negative
