In [1]:
pip install Sastrawi

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


In [2]:
import pandas as pd
import string
import re
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [3]:
def preprocessing (col, sentence):

  #Remove HTML tags
  if col == 'content':
    CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')
    sentence = re.sub(CLEANR, ' ', sentence)

  #Remove number
  sentence = re.sub(r"\d+", "", sentence)

  #Case folding
  sentence = sentence.lower()

  #Remove punctuation
  for p in string.punctuation:
      sentence = sentence.replace(p, " ")

  #Remove whitespace leading & trailing
  sentence = sentence.strip()

  #Remove multiple whitespace into single whitespace
  sentence = re.sub('\s+',' ', sentence)

  #Tokenization
  tokens = nltk.tokenize.word_tokenize(sentence)

  #Remove stopwords in Bahasa Indonesia
  bahasa_stopwords = set(stopwords.words('indonesian'))
  tokens_without_bahasa_stopwords = [token for token in tokens if not token in bahasa_stopwords]

  #Remove stopwords in English
  english_stopwords = set(stopwords.words('english'))
  tokens_without_bilingual_stopwords = [token for token in tokens_without_bahasa_stopwords if not token in english_stopwords]

  #Lemmatization for English words
  lemmatizer = WordNetLemmatizer()
  english_base_word_tokens = [lemmatizer.lemmatize(token) for token in tokens_without_bilingual_stopwords]

  #Stemming words in Bahasa Indonesia
  factory = StemmerFactory()
  stemmer = factory.create_stemmer()
  bilingual_base_word_tokens = [stemmer.stem(token) for token in english_base_word_tokens]

  return bilingual_base_word_tokens

In [4]:
from google.colab import drive
drive.mount('/gdrive')
%cd /gdrive

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
/gdrive


In [5]:
file_path = r'/gdrive/My Drive/discussions 2.csv'
df = pd.read_csv(file_path)
df

Unnamed: 0,id,module,title,content,solved,keywords
0,80721,Submission Akhir : Aplikasi Github User,DatabaseHelper saat create table garis merah,<p>Selamat Pagi dan Selamat Tahun Baru semua.<...,1,"database,table"
1,80741,Submission 2 : Aplikasi GitHub User (Navigatio...,Data API tidak tampil di recyclerview,<p>Mohon bantuannya terdapat error no adapter ...,1,"api,recyclerview"
2,80746,Latihan UI Test Menggunakan Espresso,Error MainActivityEspressoTest (android:test) ...,<p>Error saat menjalankan project baik di Emu ...,1,"er,android-studio,android"
3,80751,Submission 2 : Aplikasi GitHub User (Navigatio...,Terdeteksi plagiarisme padahal referensi dari ...,<p>Submission ke 2 saya yang terakhir kali say...,1,"submission-2,plagiarisme,referensi"
4,80766,Submission 2 : Aplikasi GitHub User (Navigatio...,Detail user dari hasil pencarian memiliki bebe...,<p>Karena saya mengambil data untuk searchView...,1,"null,hasil-pencarian,data"
...,...,...,...,...,...,...
1519,159258,Submission Akhir : Aplikasi Github User,error lateinit property favoriteViewModel has ...,<p>Pada logcat tertulis error lateinit propert...,1,viewmodel
1520,159413,Submission 2 : Aplikasi GitHub User (Navigatio...,Apakah tampilan aplikasi boleh kita custom?,"<p>Halo kakak2 dicoding, saya mau nanya terkai...",0,"android,custom-layout"
1521,159543,Submission 1 : Aplikasi GitHub User,Aplikasi Force Close Ketika Klik User Detail,"<p>Halo guys, mau tanya ini kenapa aplikasi sa...",0,"force-close,user-detail"
1522,159598,Submission Akhir : Aplikasi Github User,Leaked window activity,"<p>saya mendapatkan error<img src=""https://dic...",0,"leaked-window,theme"


In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1524 entries, 0 to 1523
Data columns (total 6 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        1524 non-null   int64 
 1   module    1524 non-null   object
 2   title     1524 non-null   object
 3   content   1524 non-null   object
 4   solved    1524 non-null   int64 
 5   keywords  1523 non-null   object
dtypes: int64(2), object(4)
memory usage: 71.6+ KB


In [7]:
test = "My name is Daffa. My hobbies are watching One Piece and Anime 123. Nama saya Daffa. Hobi saya nonton One Piece dan Anime 123."
preprocessing('', test)

['name',
 'daffa',
 'hobby',
 'watching',
 'one',
 'piece',
 'anime',
 'nama',
 'daffa',
 'hobi',
 'nonton',
 'one',
 'piece',
 'anime']

In [8]:
selected_cols = ['module', 'title', 'content', 'keywords']

for col in selected_cols:
  for row in range(len(df.index)):
    df[col].iloc[row] = preprocessing(col, str(df[col].iloc[row]))

df

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


Unnamed: 0,id,module,title,content,solved,keywords
0,80721,"[submission, aplikasi, github, user]","[databasehelper, create, table, gari, merah]","[selamat, pagi, selamat, halo, kak, databasehe...",1,"[database, table]"
1,80741,"[submission, aplikasi, github, user, navigatio...","[data, api, tampil, recyclerview]","[mohon, bantu, error, adapter, logcat, data, t...",1,"[api, recyclerview]"
2,80746,"[latih, ui, test, espresso]","[error, mainactivityespressotest, android, tes...","[error, jalan, project, emu, device, error, co...",1,"[er, android, studio, android]"
3,80751,"[submission, aplikasi, github, user, navigatio...","[deteksi, plagiarisme, referensi, modul]","[submission, kali, submit, deteksi, plagiarism...",1,"[submission, plagiarisme, referensi]"
4,80766,"[submission, aplikasi, github, user, navigatio...","[detail, user, hasil, cari, milik, data, null]","[ambil, data, searchview, nya, username, avata...",1,"[null, hasil, cari, data]"
...,...,...,...,...,...,...
1519,159258,"[submission, aplikasi, github, user]","[error, lateinit, property, favoriteviewmodel,...","[logcat, tulis, error, lateinit, property, fav...",1,[viewmodel]
1520,159413,"[submission, aplikasi, github, user, navigatio...","[tampil, aplikasi, custom]","[halo, kakak, dicoding, nanya, kait, submissio...",0,"[android, custom, layout]"
1521,159543,"[submission, aplikasi, github, user]","[aplikasi, force, close, klik, user, detail]","[halo, guy, aplikasi, force, close, klik, user...",0,"[force, close, user, detail]"
1522,159598,"[submission, aplikasi, github, user]","[leaked, window, activity]","[error, atas, buka, app, layout, ganti, theme,...",0,"[leaked, window, theme]"


In [9]:
df.to_csv(r'/gdrive/My Drive/cleaned_data.csv')