# Ambil semua data

In [1]:
pip install PySastrawi

Note: you may need to restart the kernel to use updated packages.


## Disimpan jadi 2 folder hasil

In [4]:
import os
import string
from docx import Document
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.feature_extraction.text import TfidfVectorizer
from collections import Counter

# Fungsi untuk membaca teks dari berkas DOCX
def read_text_from_docx(file_path):
    doc = Document(file_path)
    text = ""
    for paragraph in doc.paragraphs:
        text += paragraph.text + "\n"
    return text

# Fungsi preprocessing yang mencakup filtering, stemming, dan tokenisasi
def preprocess_text(text):
    text = text.lower()
    text = text.translate(str.maketrans('', '', string.punctuation))
    tokens = word_tokenize(text)
    stop_words = set(stopwords.words('indonesian'))
    filtered_tokens = [token for token in tokens if token not in stop_words]
    
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
    stemmed_tokens = [stemmer.stem(token) for token in filtered_tokens]
    
    preprocessed_tokens = [token for token in stemmed_tokens if token.isalnum()]
    return ' '.join(preprocessed_tokens)

base_directory = "C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Source"
hasil_directory = "C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Destinasi"

folders_to_check = ['Kesehatan', 'Teknologi', 'Politik', 'Olahraga', 'Kriminal']

for folder_name in folders_to_check:
    folder_path = os.path.join(base_directory, folder_name)
    hasil_path = os.path.join(hasil_directory, folder_name)
    
    # Buat folder untuk output tokenisasi
    tokenized_output_dir = os.path.join(hasil_path, 'Tokenized Output')
    if not os.path.exists(tokenized_output_dir):
        os.makedirs(tokenized_output_dir)

    # Buat folder untuk output TF-IDF
    tfidf_output_dir = os.path.join(hasil_path, 'TF-IDF Output')
    if not os.path.exists(tfidf_output_dir):
        os.makedirs(tfidf_output_dir)
    
    for filename in os.listdir(folder_path):
        output_file_name = f'frekuensi_kata_{filename.replace(".docx", "")}.txt'  
        file_path = os.path.join(folder_path, filename)
        print(f"File dengan nama yang mirip ditemukan di folder '{folder_name}': {file_path}")
        
        text = read_text_from_docx(file_path)
        preprocessed_text = preprocess_text(text)
        tokens = word_tokenize(preprocessed_text)
        word_frequency = Counter(tokens)

        # Simpan hasil frekuensi kata ke folder Tokenized Output
        output_file_path = os.path.join(tokenized_output_dir, output_file_name)  
        with open(output_file_path, 'w') as file:
            file.write('Kata Frekuensi\n') 
            for word, freq in word_frequency.items():
                file.write(f'{word} {freq}\n')

        print(f"Hasil frekuensi kata-kata setelah stemming dan preprocessing telah disimpan dalam berkas '{output_file_path}'.\n")

        # Perhitungan TF-IDF
        tokens = word_tokenize(preprocessed_text)
        tfidf_vectorizer = TfidfVectorizer()
        tfidf_matrix = tfidf_vectorizer.fit_transform([" ".join(tokens)])
        terms = tfidf_vectorizer.get_feature_names_out()

        # Simpan hasil TF-IDF ke folder TF-IDF Output
        output_file_name = f'TF-IDF_{filename.replace(".docx", "")}.txt'
        output_file_path = os.path.join(tfidf_output_dir, output_file_name)  
        with open(output_file_path, 'w') as file:
            file.write('Kata TF-IDF\n') 
            for i, term in enumerate(terms):
                file.write(f'{term} : {tfidf_matrix[0, i]}\n')

        print(f"Hasil TF-IDF telah disimpan dalam berkas '{output_file_path}'.\n")

print("Selesai")

File dengan nama yang mirip ditemukan di folder 'Kesehatan': C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Source\Kesehatan\Kesehatan_3322600001_1.docx
Hasil frekuensi kata-kata setelah stemming dan preprocessing telah disimpan dalam berkas 'C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Destinasi\Kesehatan\Tokenized Output\frekuensi_kata_Kesehatan_3322600001_1.txt'.

Hasil TF-IDF telah disimpan dalam berkas 'C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Destinasi\Kesehatan\TF-IDF Output\TF-IDF_Kesehatan_3322600001_1.txt'.

File dengan nama yang mirip ditemukan di folder 'Kesehatan': C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Source\Kesehatan\Kesehatan_3322600001_2.docx
Hasil frekuensi kata-kata setelah stemming dan preprocessing telah disimpan dalam berkas 'C:/Users/bayuk/OneDrive/Documents/AI/pens/smtr3/text mining/w8/Destinasi\Kesehatan\Tokenized Output\frekuensi_kata_Kesehatan_3322600001_2.txt'.

Hasil TF-IDF tel

In [3]:
import nltk

# Download NLTK data
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\bayuk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\bayuk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True