In [23]:
import pandas as pd

# Load the input table; later cells read its 'teks' and 'label' columns.
data = pd.read_csv(filepath_or_buffer='teks_tokenisasi.csv')

# Bobot

## 1. POS TAGGING

In [24]:
import stanza
from collections import defaultdict, Counter


# Download the Indonesian ('id') Stanza models, then build a pipeline that
# runs only the tokenizer and the POS tagger.
stanza.download('id')
nlp = stanza.Pipeline(lang='id', processors='tokenize,pos')

Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 851kB/s]                     
2025-06-17 20:03:00 INFO: Downloaded file to C:\Users\HP\stanza_resources\resources.json
2025-06-17 20:03:00 INFO: Downloading default packages for language: id (Indonesian) ...
2025-06-17 20:03:01 INFO: File exists: C:\Users\HP\stanza_resources\id\default.zip
2025-06-17 20:03:06 INFO: Finished downloading models and saved to C:\Users\HP\stanza_resources
2025-06-17 20:03:06 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES
Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.10.0.json: 432kB [00:00, 8.72MB/s]                    
2025-06-17 20:03:07 INFO: Downloaded file to C:\Users\HP\stanza_resources\resources.json
2025-06-17 20:03:08 INFO: Loading these models

In [25]:
# Count POS tag (UPOS) frequencies, kept separately per class.
jumlah_pos_ai = Counter()
jumlah_pos_nonai = Counter()

for _, row in data.iterrows():
    doc = nlp(row['teks'])
    tags = [word.upos for sent in doc.sentences for word in sent.words]

    # Rows with label == 1 feed the AI counter; every other label value
    # is counted as non-AI.
    if row['label'] == 1:
        jumlah_pos_ai.update(tags)
    else:
        jumlah_pos_nonai.update(tags)

# Union of every POS tag seen in either class.
all_tags = set(jumlah_pos_ai) | set(jumlah_pos_nonai)

# Iterating a set of strings yields a different order in each kernel session
# (string hash randomization), which would shuffle the CSV rows between runs.
# Sorting makes the output reproducible; key=str keeps the sort safe even if
# a tag is not a string.
ordered_tags = sorted(all_tags, key=str)

data_pos_freq = pd.DataFrame({
    'POS_Tag': ordered_tags,
    # Counter returns 0 for tags that never occurred in that class.
    'AI': [jumlah_pos_ai[tag] for tag in ordered_tags],
    'NonAI': [jumlah_pos_nonai[tag] for tag in ordered_tags],
})

# Selisih = AI - NonAI (difference); Total = AI + NonAI.
data_pos_freq['Selisih'] = data_pos_freq['AI'] - data_pos_freq['NonAI']
data_pos_freq['Total'] = data_pos_freq['AI'] + data_pos_freq['NonAI']

data_pos_freq.to_csv("frekuensi_pos_tag.csv", index=False)

In [26]:
# Min-max normalize 'Selisih' into a weight ('Bobot') in the range [0, 1].
min_selisih = data_pos_freq['Selisih'].min()
max_selisih = data_pos_freq['Selisih'].max()
rentang_selisih = max_selisih - min_selisih

if rentang_selisih > 0:
    data_pos_freq['Bobot'] = (data_pos_freq['Selisih'] - min_selisih) / rentang_selisih
else:
    # Every difference is identical (or the table is empty), so the min-max
    # denominator is zero and the division would fill the column with NaN.
    # Fall back to a uniform weight of 0.0 instead.
    data_pos_freq['Bobot'] = 0.0

# Save only the POS tag and its weight to CSV.
data_pos_freq[['POS_Tag', 'Bobot']].to_csv('bobot_pos_tag.csv', index=False)

## 2. STOPWORD

In [27]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from collections import Counter

In [28]:
# Download the tokenizer models and the stopword corpus.
# 'punkt_tab' is included because recent NLTK releases make word_tokenize load
# it instead of the pickled 'punkt' models; 'punkt' is kept for older installs.
for nama_paket in ('punkt', 'punkt_tab', 'stopwords'):
    nltk.download(nama_paket)

# Indonesian stopword list, stored as a set for fast membership tests.
stop_words = set(stopwords.words('indonesian'))

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\HP\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [29]:
# Tokenize each text and count stopword frequencies, kept separately per class.
jumlah_stopword_ai = Counter()
jumlah_stopword_nonai = Counter()

for _, row in data.iterrows():
    # Lower-case before tokenizing so matching against stop_words ignores case.
    tokens = word_tokenize(row['teks'].lower())
    stopword_tokens = [t for t in tokens if t in stop_words]

    # Rows with label == 1 feed the AI counter; every other label value
    # is counted as non-AI.
    if row['label'] == 1:
        jumlah_stopword_ai.update(stopword_tokens)
    else:
        jumlah_stopword_nonai.update(stopword_tokens)

# Union of every stopword seen in either class.
all_stopwords = set(jumlah_stopword_ai) | set(jumlah_stopword_nonai)

# Iterating a set of strings yields a different order in each kernel session
# (string hash randomization); sorting keeps the output rows reproducible.
ordered_stopwords = sorted(all_stopwords)

# Build the frequency table.
data_stopword = pd.DataFrame({
    'Stopword': ordered_stopwords,
    # Counter returns 0 for stopwords that never occurred in that class.
    'AI': [jumlah_stopword_ai[sw] for sw in ordered_stopwords],
    'NonAI': [jumlah_stopword_nonai[sw] for sw in ordered_stopwords],
})

# Selisih = AI - NonAI (difference); Total = AI + NonAI.
data_stopword['Selisih'] = data_stopword['AI'] - data_stopword['NonAI']
data_stopword['Total'] = data_stopword['AI'] + data_stopword['NonAI']

# Min-max normalize 'Selisih' into a weight ('Bobot') in the range [0, 1].
min_selisih = data_stopword['Selisih'].min()
max_selisih = data_stopword['Selisih'].max()
rentang_selisih = max_selisih - min_selisih

if rentang_selisih > 0:
    data_stopword['Bobot'] = (data_stopword['Selisih'] - min_selisih) / rentang_selisih
else:
    # Every difference is identical (or the table is empty), so the min-max
    # denominator is zero and the division would fill the column with NaN.
    # Fall back to a uniform weight of 0.0 instead.
    data_stopword['Bobot'] = 0.0

# Save only the stopword and its weight to CSV.
data_stopword[['Stopword', 'Bobot']].to_csv('bobot_stopword.csv', index=False)