In [1]:
from google.colab import drive

# Mount Google Drive into the Colab file system
mount_point = '/content/drive'
drive.mount(mount_point)


Mounted at /content/drive


In [9]:
# Import pustaka
import pandas as pd
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk

# Download the NLTK resources needed by the preprocessing steps below.
# 'punkt_tab' is requested in addition to 'punkt' because it was reported
# as the missing resource when tokenizing.
for resource in ('punkt', 'stopwords', 'wordnet', 'punkt_tab'):
    nltk.download(resource)

# Load the dataset from the mounted Drive folder
file_path = '/content/drive/MyDrive/Dataset/ar/CISI_ALL.csv'
data = pd.read_csv(filepath_or_buffer=file_path)

# Preview the first few rows of the dataset
preview = data.head()
print(preview)

def tokenize_query(text):
    """Lowercase ``text`` and split it into tokens with ``word_tokenize``.

    Parameters
    ----------
    text : object
        One cell of the 'Query' column. Non-string values are converted
        with ``str`` before tokenizing.

    Returns
    -------
    list of str
        The lowercase tokens. Missing values (None / NaN) give an empty
        list; without this guard ``str(NaN)`` would produce the literal
        token 'nan' and leak it into the downstream vocabulary.
    """
    if pd.isna(text):
        return []
    return word_tokenize(str(text).lower())


# Tokenize the text stored in the 'Query' column
data['tokenized'] = data['Query'].apply(tokenize_query)

# Show the tokenization result for the first few rows
print(data[['Query', 'tokenized']].head())

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


   ID                                              Query
0   1  .T 18 Editions of the Dewey Decimal Classifica...
1   2  .T Use Made of Technical Libraries .A Slater, ...
2   3  .T Two Kinds of Power An Essay on Bibliographi...
3   4  .T Systems Analysis of a University Library; f...
4   5  .T A Library Management Game: a report on a re...
                                               Query  \
0  .T 18 Editions of the Dewey Decimal Classifica...   
1  .T Use Made of Technical Libraries .A Slater, ...   
2  .T Two Kinds of Power An Essay on Bibliographi...   
3  .T Systems Analysis of a University Library; f...   
4  .T A Library Management Game: a report on a re...   

                                           tokenized  
0  [.t, 18, editions, of, the, dewey, decimal, cl...  
1  [.t, use, made, of, technical, libraries, .a, ...  
2  [.t, two, kinds, of, power, an, essay, on, bib...  
3  [.t, systems, analysis, of, a, university, lib...  
4  [.t, a, library, management, game, :, a, re

In [10]:
# Build the set of English stopwords
stop_words = set(stopwords.words('english'))


def remove_stopwords(tokens):
    """Return the tokens that are not in ``stop_words``, in their original order."""
    return [token for token in tokens if token not in stop_words]


# Drop stopwords from every tokenized row
data['no_stopwords'] = data['tokenized'].apply(remove_stopwords)

# Show the result of stopword removal
print(data[['tokenized', 'no_stopwords']].head())


                                           tokenized  \
0  [.t, 18, editions, of, the, dewey, decimal, cl...   
1  [.t, use, made, of, technical, libraries, .a, ...   
2  [.t, two, kinds, of, power, an, essay, on, bib...   
3  [.t, systems, analysis, of, a, university, lib...   
4  [.t, a, library, management, game, :, a, repor...   

                                        no_stopwords  
0  [.t, 18, editions, dewey, decimal, classificat...  
1  [.t, use, made, technical, libraries, .a, slat...  
2  [.t, two, kinds, power, essay, bibliographic, ...  
3  [.t, systems, analysis, university, library, ;...  
4  [.t, library, management, game, :, report, res...  


In [11]:
# Create the Porter stemmer
stemmer = PorterStemmer()

# Stem every token that remains after stopword removal
data['stemmed'] = data['no_stopwords'].apply(
    lambda tokens: list(map(stemmer.stem, tokens))
)

# Show the stemming result next to its input
print(data[['no_stopwords', 'stemmed']].head())

                                        no_stopwords  \
0  [.t, 18, editions, dewey, decimal, classificat...   
1  [.t, use, made, technical, libraries, .a, slat...   
2  [.t, two, kinds, power, essay, bibliographic, ...   
3  [.t, systems, analysis, university, library, ;...   
4  [.t, library, management, game, :, report, res...   

                                             stemmed  
0  [.t, 18, edit, dewey, decim, classif, .a, coma...  
1  [.t, use, made, technic, librari, .a, slater, ...  
2  [.t, two, kind, power, essay, bibliograph, con...  
3  [.t, system, analysi, univers, librari, ;, fin...  
4  [.t, librari, manag, game, :, report, research...  


In [12]:
# Create the WordNet lemmatizer
lemmatizer = WordNetLemmatizer()

# Lemmatize every token that remains after stopword removal.
# No part-of-speech tag is passed, so the lemmatizer's default applies.
data['lemmatized'] = data['no_stopwords'].apply(
    lambda tokens: list(map(lemmatizer.lemmatize, tokens))
)

# Show the lemmatization result next to its input
print(data[['no_stopwords', 'lemmatized']].head())


                                        no_stopwords  \
0  [.t, 18, editions, dewey, decimal, classificat...   
1  [.t, use, made, technical, libraries, .a, slat...   
2  [.t, two, kinds, power, essay, bibliographic, ...   
3  [.t, systems, analysis, university, library, ;...   
4  [.t, library, management, game, :, report, res...   

                                          lemmatized  
0  [.t, 18, edition, dewey, decimal, classificati...  
1  [.t, use, made, technical, library, .a, slater...  
2  [.t, two, kind, power, essay, bibliographic, c...  
3  [.t, system, analysis, university, library, ;,...  
4  [.t, library, management, game, :, report, res...  


In [13]:
# Re-join the lemmatized tokens into one space-separated string per row
# so the text can be fed to the TF-IDF vectorizer
data['processed_text'] = data['lemmatized'].apply(' '.join)

# Fit the TF-IDF vectorizer on the processed text and transform it in one step
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(data['processed_text'])

# Report the matrix dimensions and the learned feature names
feature_names = vectorizer.get_feature_names_out()
print("TF-IDF Matrix Shape:", tfidf_matrix.shape)
print("TF-IDF Features:", feature_names)


TF-IDF Matrix Shape: (1460, 11150)
TF-IDF Features: ['00' '000' '029' ... 'zunde' 'zvezhinskii' 'zyabrev']


In [14]:
# Write the DataFrame, including every preprocessing column, to a CSV file on Drive
output_path = '/content/drive/MyDrive/Dataset/ar/preprocessed_CISI.csv'
data.to_csv(output_path, index=False)
print("File hasil preprocessing telah disimpan.")


File hasil preprocessing telah disimpan.
