1. Mount data ke gdrive

In [35]:
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


2. Install Library yang digunakan

In [36]:
pip install nltk



In [37]:
import pandas as pd # Pandas juga dapat membaca file dari berbagai format seperti .txt, .csv, .tsv, dan lainnya
import re # ekspresi reguler adalah urutan karakter khusus yang membantu Anda mencocokkan atau menemukan string atau kumpulan string lain
import string 
import nltk # libray python untuk bekerja dengan permodelan teks
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

3. Menampilkan dataset

In [38]:
df = pd.read_csv("gdrive/MyDrive/NLP_Dataset Sentiment Analysis/Womens Clothing E-Commerce Reviews.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,Clothing ID,Age,Title,Review Text,Rating,Recommended IND,Positive Feedback Count,Division Name,Department Name,Class Name
0,0,767,33,,Absolutely wonderful - silky and sexy and comf...,4,1,0,Initmates,Intimate,Intimates
1,1,1080,34,,Love this dress! it's sooo pretty. i happene...,5,1,4,General,Dresses,Dresses
2,2,1077,60,Some major design flaws,I had such high hopes for this dress and reall...,3,0,0,General,Dresses,Dresses
3,3,1049,50,My favorite buy!,"I love, love, love this jumpsuit. it's fun, fl...",5,1,0,General Petite,Bottoms,Pants
4,4,847,47,Flattering shirt,This shirt is very flattering to all due to th...,5,1,6,General,Tops,Blouses


In [39]:
df = df[['Review Text', 'Rating']]
df.head()

Unnamed: 0,Review Text,Rating
0,Absolutely wonderful - silky and sexy and comf...,4
1,Love this dress! it's sooo pretty. i happene...,5
2,I had such high hopes for this dress and reall...,3
3,"I love, love, love this jumpsuit. it's fun, fl...",5
4,This shirt is very flattering to all due to th...,5


4. Pelabelan

In [40]:
df = df[df.Rating != 3]

pd.set_option('mode.chained_assignment', None)
df["labels"] = df["Rating"].apply(lambda x: 1 if x < 3  else 0) # positive as 0 and negative as 1
df = df.drop("Rating",axis=1)

df.head()

Unnamed: 0,Review Text,labels
0,Absolutely wonderful - silky and sexy and comf...,0
1,Love this dress! it's sooo pretty. i happene...,0
3,"I love, love, love this jumpsuit. it's fun, fl...",0
4,This shirt is very flattering to all due to th...,0
5,"I love tracy reese dresses, but this one is no...",1


5. Preprocessing data

In [41]:
#import stopword - penggunaan stopword yaitu dengan menghapus kata-kata yang memiliki informasi rendah dari sebuah teks
from nltk.corpus import stopwords 
from nltk.tokenize import sent_tokenize, word_tokenize

In [42]:
def pre_process(Review_Text):
    # Case Folding: Lowercase
    # Merubah format teks menjadi format huruf kecil semua (lowercase).
    Review_Text = str(Review_Text).lower()

    # Case Folding: Removing Number
    # Menghapus karakter angka.
    Review_Text = re.sub(r"\d+", "", Review_Text)

    # Case Folding: Removing Punctuation
    # Menghapus karakter tanda baca.
    Review_Text = Review_Text.translate(str.maketrans("","",string.punctuation))

    #Case Folding: Removing Whitespace
    #Menghapus karakter kosong.
    Review_Text = Review_Text.strip()

    
    #Separating Sentences with Split () Method
    #Fungsi split() memisahkan string ke dalam list dengan spasi sebagai pemisah jika tidak ditentukan pemisahnya.
    pisah = Review_Text.split()

    #Tokenizing: Word Tokenizing Using NLTK Module
    #Menggunakan library NLTK untuk memisahkan kata dalam sebuah kalimat.
    tokens = nltk.tokenize.word_tokenize(Review_Text)

    #Filtering using NLTK
    listStopword =  set(stopwords.words('indonesian'))
 
    removed = []
    for t in tokens:
      if t not in listStopword:
        removed.append(t)

    return Review_Text

df['Review Text'] = df['Review Text'].apply(lambda x:pre_process(x))
df.head()

Unnamed: 0,Review Text,labels
0,absolutely wonderful silky and sexy and comfo...,0
1,love this dress its sooo pretty i happened t...,0
3,i love love love this jumpsuit its fun flirty ...,0
4,this shirt is very flattering to all due to th...,0
5,i love tracy reese dresses but this one is not...,1


In [43]:
#Vectorization
#Scikit-belajar ini CountVectorizer digunakan untuk mengkonversi koleksi dokumen teks ke vektor istilah / jumlah tanda
from sklearn.feature_extraction.text import CountVectorizer

# Untuk membuat Count Vectorizer, kita hanya perlu membuatnya.
# Ada parameter khusus yang dapat kita atur di sini saat membuat vectorizer, tetapi
# untuk contoh paling dasar, ini tidak diperlukan.
cv = CountVectorizer()
# Untuk benar-benar membuat vectorizer, kita hanya perlu memanggil fit pada teks
# data yang ingin kita perbaiki
cv.fit(df['Review Text'])
# Jika kita benar-benar ingin membuat vektor, kita dapat melakukannya dengan memasukkan teks 
# ke dalam vectorizer untuk mendapatkan hitungan mundur
X = cv.transform(df['Review Text'])

y = df['labels']

In [44]:
#Build Classifier
#sklearn adalah modul Python yang mengintegrasikan algoritme pembelajaran mesin klasik dalam dunia paket Python ilmiah yang erat 

from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.80)

In [45]:
#Find the best value of C in logistic regression
#Regresi Logistik adalah algoritma klasifikasi Pembelajaran Mesin yang digunakan untuk memprediksi probabilitas variabel dependen kategoris

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    lr = LogisticRegression(C=c)
    lr.fit(X_train, y_train)
    print('Accuracy for C=%s: %s'
         % (c, accuracy_score(y_test, lr.predict(X_test))))

Accuracy for C=0.01: 0.9090468105748242
Accuracy for C=0.05: 0.924812030075188


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy for C=0.25: 0.9272374484598593


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


Accuracy for C=0.5: 0.9294203250060635
Accuracy for C=1: 0.9296628668445307


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [46]:
#Find the best value of C in support vector
#Mesin vektor pendukung (SVM) adalah sekumpulan metode pembelajaran yang diawasi yang digunakan untuk klasifikasi, regresi dan deteksi pencilan

for c in [0.01, 0.05, 0.25, 0.5, 1]:
    
    sv = SVC(C=c)
    sv.fit(X_train, y_train)
    print('Accuracy for C=%s: %s'
         % (c, accuracy_score(y_test, sv.predict(X_test))))

Accuracy for C=0.01: 0.8872180451127819
Accuracy for C=0.05: 0.8872180451127819
Accuracy for C=0.25: 0.8898860053359204
Accuracy for C=0.5: 0.9097744360902256
Accuracy for C=1: 0.921173902498181


In [47]:
#Here I choose C=1 to build the final model for Logistic Regression.
final_model_lr = LogisticRegression(C=1)
final_model_lr.fit(X, y)
print('Final Model Accuracy: %s' %accuracy_score(y_test, final_model_lr.predict(X_test)))

Final Model Accuracy: 0.9776861508610235


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [48]:
#Here I choose C=1 to build the final model for Support Vector.
final_model_sv = SVC(C=1)
final_model_sv.fit(X, y)
print('Final Model Accuracy: %s' %accuracy_score(y_test, final_model_sv.predict(X_test)))

Final Model Accuracy: 0.9650739752607325
