# Klasifikasi Teks (UTS)

## Dataset

Dataset diambil dari hasil crawling PTA pada tugas sebelumnya, dengan pelabelan kategori yang dilakukan secara manual.

In [1]:
import pandas as pd
import re
import numpy as np

In [2]:
# CSV file holding the crawled abstracts and their category labels.
DATASET_PATH = '/content/drive/MyDrive/prosaindata/tugas/datauts.csv'

# Read the CSV into a DataFrame and show it as the cell output.
data = pd.read_csv(filepath_or_buffer=DATASET_PATH)
data

FileNotFoundError: ignored

## Normalisasi Teks

### Case Folding

Pada tahap ini, seluruh huruf kapital pada teks diubah menjadi huruf kecil.

In [None]:
# Lower-case the abstract text and the category labels.
casefolding_abstrak, casefolding_label = (
    data[column].str.lower() for column in ('Abstrak', 'Kategori')
)

# Single-column table of the lower-cased abstracts, shown as the cell output.
data_casefolding = casefolding_abstrak.to_frame()
data_casefolding

### Cleansing

Pembersihan teks (cleansing): menghilangkan mention, hashtag, tautan URL, serta seluruh karakter selain huruf yang tidak diperlukan.

In [None]:
# Cleansing: strip mentions, hashtags, URLs and every non-letter character.
import re

# Substitutions applied in this order; compiled once instead of once per document.
CLEANSING_STEPS = [
    (re.compile("@[A-Za-z0-9_]+"), ""),  # mentions
    (re.compile("#[A-Za-z0-9_]+"), ""),  # hashtags
    (re.compile(r'http\S+'), ''),        # URL links
    (re.compile("[^a-zA-Z ]+"), " "),    # anything that is not a letter or a space
]

cleansing = []
# Iterate over the values directly: looking rows up with `series[i]` is a label
# lookup and fails as soon as the Series does not carry a default 0..n-1 index.
for abstract in casefolding_abstrak:
    for pattern, replacement in CLEANSING_STEPS:
        abstract = pattern.sub(replacement, abstract)
    cleansing.append(abstract)

# Show the cleaned abstracts as a single-column table.
cleansing_result = pd.DataFrame(cleansing, columns=['Cleansing Abstrak'])
cleansing_result

### Slang Words

Memperbaiki kata yang tidak baku (slang word) dan penulisan kata yang salah.

In [None]:
import nltk
# Fetch the NLTK resources requested by this notebook: the stop-word corpus
# (read through `stopwords.words(...)`) and the 'punkt' tokenizer data.
nltk.download('stopwords')
nltk.download('punkt')
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords

In [None]:
# Load the slang-word dictionary from GitHub.
import json
from urllib.request import urlopen

SLANG_URL = "https://raw.githubusercontent.com/louisowen6/NLP_bahasa_resources/master/combined_slang_words.txt"

# NOTE(review): the file is expected to hold one JSON object of the form
# {"slang": "standard word", ...} -- confirm against the upstream repository.
# Reading it as a space-separated CSV produced a DataFrame, and `word in DataFrame`
# only tests the column labels, so no token was ever replaced.
with urlopen(SLANG_URL) as response:
    slang_dict = json.load(response)

# Loaded once, as a set: the stop-word list used to be re-read for every token.
INDONESIAN_STOPWORDS = frozenset(stopwords.words('indonesian'))


def replace_slang_words(text):
    """Drop Indonesian stop words from *text* and map slang tokens to their replacement.

    The text is lower-cased and tokenized with NLTK. Tokens found in the NLTK
    Indonesian stop-word list are removed, and every remaining token that is a
    key of ``slang_dict`` is swapped for its mapped value.

    Returns the resulting tokens joined by single spaces.
    """
    tokens = nltk.word_tokenize(text.lower())
    return ' '.join(
        slang_dict.get(token, token)
        for token in tokens
        if token not in INDONESIAN_STOPWORDS
    )


# Apply the slang correction to every cleaned document, keeping the row order.
slang_words = [replace_slang_words(document) for document in cleansing]

data_slang = pd.DataFrame(slang_words, columns=["Slang Word Corection"])
data_slang

### Stemming

In [None]:
# Install the Sastrawi library; the stemmer and stop-word factories imported in the following cells come from it.
pip install Sastrawi

In [None]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Build the Sastrawi stemmer through its factory.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# Stem every slang-corrected document, keeping the row order.
steaming = [stemmer.stem(document) for document in slang_words]

# Show the stemmed documents as a single-column table.
data_steaming = pd.DataFrame({"Steaming": steaming})
data_steaming

### Tokenizing dan Stop Words

Proses selanjutnya adalah membuang stopwords. Mengapa stopwords perlu dibuang dari sebuah teks? Stopwords merupakan kata umum dalam sebuah teks yang sebetulnya tidak memiliki makna penting, seperti “yang”, “dan”, “di”, “dari”, dll. Di sini kita menggunakan library Sastrawi sebagai kamus stopwords.

In [None]:
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
stop_factory = StopWordRemoverFactory()

# Extra stop words added on top of the list returned by the Sastrawi factory.
more_stopword = ['dengan', 'ia','bahwa','oleh','aalysis','aam','kunci']

# Built once, as a set, under its own name: this list used to be rebuilt for every
# document and stored in `data`, which overwrote the dataset DataFrame loaded at
# the top of the notebook.
stopword_set = set(stop_factory.get_stop_words() + more_stopword)

words = []
for document in steaming:
    # Tokenize the stemmed document and keep only the tokens that are not stop words.
    removed = [token for token in word_tokenize(document) if token not in stopword_set]
    words.append(removed)
    print(removed)

In [None]:
# Re-assemble each token list into one space-separated string per document.
gabung = [' '.join(tokens) for tokens in words]

# Show the joined documents as a single-column table.
result = pd.DataFrame({'Join Kata': gabung})
result

## Ekstraksi Fitur

### TF-IDF

TFIDF (Term Frequency Inverse Document Frequency) merupakan
metode pembobotan dalam bentuk integrasi antar term frequency dengan inverse document
frequency. Metode TFIDF digunakan pada penelitian ini untuk memilih fitur sebagai hasil
ringkasan, dengan penerapannya pada seleksi fitur bobot kata.

In [None]:
# Term counts (CountVectorizer) and TF-IDF weights (TfidfVectorizer) for the
# preprocessed documents.
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
import pandas as pd

# Small example corpora; the vectorizers below are fitted on `gabung`, not on these.
train = ['The sky is blue blue.','The sun is bright.']
test = ['The sun in the sky is bright', 'We can see the shining sun, the bright sun.']

# Both vectorizers share the same settings: word-level analysis with the English
# stop-word list.
# NOTE(review): the earlier cells preprocess Indonesian text -- confirm that the
# English stop-word list is intended here.
vectorizer_options = {'analyzer': 'word', 'stop_words': 'english'}
countvectorizer = CountVectorizer(**vectorizer_options)
tfidfvectorizer = TfidfVectorizer(**vectorizer_options)

# Fit each vectorizer on the cleaned documents and build its document-term matrix.
count_wm = countvectorizer.fit_transform(gabung)
tfidf_wm = tfidfvectorizer.fit_transform(gabung)

# The learned vocabulary terms become the column labels of the dense tables.
count_tokens = countvectorizer.get_feature_names_out()
tfidf_tokens = tfidfvectorizer.get_feature_names_out()
df_countvect = pd.DataFrame(count_wm.toarray(), columns=count_tokens)
df_tfidfvect = pd.DataFrame(tfidf_wm.toarray(), columns=tfidf_tokens)

print("Count Vectorizer\n")
df_countvect


In [None]:
# Print a heading, then show the TF-IDF table as the cell output.
print("\nTF-IDF Vectorizer\n")
df_tfidfvect

## PCA

In [None]:
from sklearn.decomposition import PCA

# PCA cannot extract more components than min(n_samples, n_features); cap the
# requested 150 so the cell also runs on a corpus smaller than that.
n_components = min(150, *df_tfidfvect.shape)
pca_abstrak = PCA(n_components=n_components)

# NOTE(review): the projection is fitted on every document, before the train/test
# split in the next section, so the test rows influence the components.
principalComponents_abstrak = pca_abstrak.fit_transform(df_tfidfvect)

# Show the reduced feature matrix as a table.
principal_abstrak_Df = pd.DataFrame(principalComponents_abstrak)
principal_abstrak_Df

## Splitting Data

In [None]:
from sklearn.model_selection import train_test_split

# Split features (X) and labels (y) in one call so both are partitioned with the
# same shuffled indices; two separate calls only stayed aligned because they
# happened to share the same seed and length.
training, test, training_label, test_label = train_test_split(
    principalComponents_abstrak,
    casefolding_label,
    test_size=0.2,
    random_state=1,
)

## Pemodelan KNN

### Evaluasi Model

In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# k-nearest-neighbours classifier with k = 5, fitted on the training split.
modelKNN = KNeighborsClassifier(n_neighbors=5).fit(training, training_label)
modelKNN

In [None]:
# Predict a category for every row of the held-out test split with the KNN model.
test_pred = modelKNN.predict(test)
test_pred

In [None]:
# Accuracy of the KNN predictions against the true test labels.
accuracy_score(test_label, test_pred)

In [None]:
# Print scikit-learn's classification report for the KNN predictions.
print(classification_report(test_label, test_pred))

## Pemodelan Naive Bayes

### Evaluasi Model

In [None]:
# Gaussian Naive Bayes classifier, fitted on the same training split as the KNN model.
from sklearn.metrics import accuracy_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.naive_bayes import GaussianNB

gaussian = GaussianNB().fit(training, training_label)
gaussian

In [None]:
# Predict a category for every row of the held-out test split with the Naive Bayes model.
predict = gaussian.predict(test) 
predict

In [None]:
# Accuracy of the Naive Bayes predictions against the true test labels.
accuracy_score(test_label, predict)

In [None]:
# Print scikit-learn's classification report for the Naive Bayes predictions.
print(classification_report(test_label, predict))