In [1]:
import pandas as pd
import re
import pickle
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from statistics import mean, stdev
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn import model_selection

# Prepare Dataset

In [2]:
# import dataset
df = pd.read_csv('DATA/train_preprocess.tsv.txt', sep='\t', names=['Tweet','Label'])
df.head()

Unnamed: 0,Tweet,Label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [3]:
df.Label.value_counts()

Label
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

In [4]:
# Menghapus duplikat pada kolom "tweet"
df = df.drop_duplicates(subset='Tweet')

# Memeriksa data setelah menghapus duplikat
print("\nData setelah menghapus duplikat:")
print(df)


Data setelah menghapus duplikat:
                                                   Tweet     Label
0      warung ini dimiliki oleh pengusaha pabrik tahu...  positive
1      mohon ulama lurus dan k212 mmbri hujjah partai...   neutral
2      lokasi strategis di jalan sumatera bandung . t...  positive
3      betapa bahagia nya diri ini saat unboxing pake...  positive
4      duh . jadi mahasiswa jangan sombong dong . kas...  negative
...                                                  ...       ...
10993  f - demokrat dorong upaya kemandirian energi n...   neutral
10994                                        tidak bosan  positive
10996  enak rasa masakan nya apalagi kepiting yang me...  positive
10998  pagi pagi di tol pasteur sudah macet parah , b...  negative
10999  meskipun sering belanja ke yogya di riau junct...  positive

[10933 rows x 2 columns]


# Text Normalization

In [5]:
def cleansing(sent):
    # Mengubah kata menjadi huruf kecil semua dengan menggunakan fungsi lower()
    string = sent.lower()

    # Menghapus emoticon dan tanda baca menggunakan "RegEx" dengan script di bawah
    string = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', '', string) #menghapus https dan http
    string = re.sub('<.*?>', ' ', string) #mengganti karakter html dengan tanda petik
    string = re.sub('[^0-9a-zA-Z]+', ' ', string) #menghilangkan semua karakter yang bukan huruf atau angka dan menggantinya dengan spasi.
    string = re.sub('\n',' ',string) #mengganti line baru dengan spasi
    string = re.sub(r':', ' ', string) #menggantikan karakter : dengan spasi 
    string = re.sub('gue','saya', string) # Mengganti kata "gue" dengan kata "saya"
    string = re.sub(r'\b[a-zA-Z]\b', ' ', string) #menghapus single char
    string = ' '.join(string.split()) #memisahkan dan menggabungkan kata
    string = string.strip() #menghilangkan whitespace di awal dan di akhir teks
    string = re.sub(r'pic.twitter.com.[\w]+', '', string) #menghapus link picture
    string = re.sub(r'\buser\b',' ', string) #menghapus kata 'user'
    string = re.sub(r'\brt\b',' ', string) #menghapus awalan rt
    string = re.sub('RT',' ', string) #menghapus RT simbol
    string = re.sub(r'‚Ä¶', '', string) #menghapus simbol tidak perlu

    # Lematisasi menggunakan Sastrawi
    stemmer_factory = StemmerFactory()
    stemmer = stemmer_factory.create_stemmer()
    string = stemmer.stem(string)

    # Menghapus stop words menggunakan Sastrawi
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    string = stopword_remover.remove(string)
    
    return string

In [6]:
df['Tweet_Clean'] = df.Tweet.apply(cleansing)
df.head()

Unnamed: 0,Tweet,Label,Tweet_Clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung milik usaha pabrik puluh tahun kenal pu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,ulama lurus k212 mmbri hujjah partai diwlh sua...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis jalan sumatera bandung nyaman...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia unboxing paket barang bagus beli
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh mahasiswa sombong kasih kartu kuning ajar ...


# Feature Extraction

## TF-IDF

In [7]:
data_preprocessed = df.Tweet_Clean.tolist()

In [8]:
# Proses feature extraction
tfidf_vect = TfidfVectorizer()

X = tfidf_vect.fit_transform(data_preprocessed)
print("Feature extraction selesai")

Feature extraction selesai


In [9]:
pickle.dump(tfidf_vect, open("feature.p", "wb"))

In [10]:
tfidf_array = X.toarray()
df_array = pd.DataFrame(data=tfidf_array, columns=tfidf_vect.get_feature_names_out())
print(df_array)

        00  000  001   01  010  0111  011770465655617   02  021  022  ...  \
0      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
1      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
2      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
3      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
4      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
...    ...  ...  ...  ...  ...   ...              ...  ...  ...  ...  ...   
10928  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10929  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10930  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10931  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10932  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   

       zonpoliticon  zoo  zoom  zubir  zulfikri  zulkarnain  zup  zupa  zup

## Prepare Train and Test Dataset (Split Dataset)

In [11]:
classes = df.Label

# split dataset menjadi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.2)

In [12]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8746, 12862)
(2187, 12862)
(8746,)
(2187,)


# Model Training

## Neural Network

In [13]:
# model training menggunakan neural network
model = MLPClassifier()
model.fit(X_train, y_train)

print("Training selesai")

Training selesai


In [14]:
pickle.dump(model, open("model.p", "wb"))

# Model Evaluation

In [15]:
test = model.predict(X_test)

print("Testing selesai")
print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

    negative       0.66      0.73      0.70       657
     neutral       0.70      0.53      0.60       238
    positive       0.85      0.84      0.85      1292

    accuracy                           0.78      2187
   macro avg       0.74      0.70      0.72      2187
weighted avg       0.78      0.78      0.77      2187



## Cross Validation

In [16]:
X = df[["Tweet_Clean"]] #input feature
Y = df["Label"] #output feature

# split dataset menjadi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.2)

In [17]:
# cross validation dengan split = 5
kf = KFold(n_splits=5, random_state=42, shuffle=True)

accuracies = []

X_train = X_train.reset_index().Tweet_Clean
y_train = y_train.reset_index().Label

def cross_validation(k, X_train, model, name):
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # iterate sebanyak k_cross validation
    for iteration, data in enumerate(kf.split(X_train), start=1):
        # mengambil data dan target train
        data_train = X_train[data[0]]
        tfidf_vect = TfidfVectorizer()
        data_train = tfidf_vect.fit_transform(data_train)
        target_train = y_train[data[0]]

        # mengambil data dan target test
        data_test = y_train[data[1]]
        data_test = tfidf_vect.transform(data_test)
        target_test = y_train[data[1]]

        # model training menggunakan data train
        classifier = model
        classifier.fit(data_train, target_train)

        # prediksi data test
        preds = classifier.predict(data_test)

        # menghitung accuracy
        accuracy = accuracy_score(target_test, preds)
        precision = precision_score(target_test, preds, average='weighted')
        recall = recall_score(target_test, preds, average='weighted')
        f1 = f1_score(target_test, preds, average='weighted')

        #print("Training ke-", iteration)
        #print(classification_report(target_test, preds))
        #print("=================================================================")

        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)

    result = {'algorithm': name,
                'accuracy_per_iter': accuracies, 'accuracy_mean': mean(accuracies), 'accuracy_std': stdev(accuracies),
                'recall_mean': mean(recalls),'precision_mean': mean(precisions), 'f1_mean': mean(f1_scores)}
    return result

In [18]:
result = cross_validation(5, X_train, MLPClassifier(), "Neural Network")
result

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Neural Network',
 'accuracy_per_iter': [0.30914285714285716,
  0.30646083476272157,
  0.30131503716409375,
  0.31675242995997716,
  0.3230417381360778],
 'accuracy_mean': 0.31134257943314547,
 'accuracy_std': 0.008593000206829399,
 'recall_mean': 0.31134257943314547,
 'precision_mean': 0.09699327349012817,
 'f1_mean': 0.14789194156554436}

## Compare All Models with Cross Validation

In [19]:
X = df[["Tweet_Clean"]] #input feature
Y = df["Label"] #output feature

# split dataset menjadi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.2)

X_train = X_train.reset_index().Tweet_Clean
y_train = y_train.reset_index().Label

X_test = X_test.reset_index().Tweet_Clean
y_test = y_test.reset_index().Label

In [20]:
# loading model
models = []
models.append(('Logistic Regression', LogisticRegression()))
models.append(('K Nearest Neighbour', KNeighborsClassifier()))
models.append(('Naive Bayes', MultinomialNB()))
models.append(('Support Vector Machine', SVC()))
models.append(('Neural Network', MLPClassifier()))
models.append(('Decision Tree', DecisionTreeClassifier()))
models.append(('Random Forest', RandomForestClassifier()))
models.append(('Gradient Boosting', GradientBoostingClassifier()))
models.append(('AdaBoost', AdaBoostClassifier()))

In [21]:
# evaluating the model
results = []
for name, model in models:
      print(name)
      result = cross_validation(5, X_train, model, name)
      results.append(result)
      print(result)
results = pd.DataFrame(results)
results = results.sort_values("accuracy_mean",ascending=False)

Logistic Regression


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Logistic Regression', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.32018296169239563], 'accuracy_mean': 0.31340161725067384, 'accuracy_std': 0.013614192726363524, 'recall_mean': 0.31340161725067384, 'precision_mean': 0.09836885069021033, 'f1_mean': 0.14969818774699264}
K Nearest Neighbour


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'K Nearest Neighbour', 'accuracy_per_iter': [0.3028571428571429, 0.6054888507718696, 0.32361349342481416, 0.32532875929102345, 0.5797598627787307], 'accuracy_mean': 0.42740962182471615, 'accuracy_std': 0.1513524582593213, 'recall_mean': 0.42740962182471615, 'precision_mean': 0.20100503812525855, 'f1_mean': 0.26820005740242503}
Naive Bayes


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Naive Bayes', 'accuracy_per_iter': [0.5828571428571429, 0.6054888507718696, 0.577472841623785, 0.5677530017152659, 0.5797598627787307], 'accuracy_mean': 0.5826663399493588, 'accuracy_std': 0.013952876922688933, 'recall_mean': 0.5826663399493588, 'precision_mean': 0.33965580992951755, 'f1_mean': 0.4291009664992217}
Support Vector Machine


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Support Vector Machine', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.5797598627787307], 'accuracy_mean': 0.3653169974679409, 'accuracy_std': 0.12058822474353853, 'recall_mean': 0.3653169974679409, 'precision_mean': 0.14508972459643002, 'f1_mean': 0.20374368172900575}
Neural Network


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Neural Network', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.5797598627787307], 'accuracy_mean': 0.3653169974679409, 'accuracy_std': 0.12058822474353853, 'recall_mean': 0.3653169974679409, 'precision_mean': 0.14508972459643002, 'f1_mean': 0.20374368172900575}
Decision Tree


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Decision Tree', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.5797598627787307], 'accuracy_mean': 0.3653169974679409, 'accuracy_std': 0.12058822474353853, 'recall_mean': 0.3653169974679409, 'precision_mean': 0.14508972459643002, 'f1_mean': 0.20374368172900575}
Random Forest


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Random Forest', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.5797598627787307], 'accuracy_mean': 0.3653169974679409, 'accuracy_std': 0.12058822474353853, 'recall_mean': 0.3653169974679409, 'precision_mean': 0.14508972459643002, 'f1_mean': 0.20374368172900575}
Gradient Boosting


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'Gradient Boosting', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.32018296169239563], 'accuracy_mean': 0.31340161725067384, 'accuracy_std': 0.013614192726363524, 'recall_mean': 0.31340161725067384, 'precision_mean': 0.09836885069021033, 'f1_mean': 0.14969818774699264}
AdaBoost


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


{'algorithm': 'AdaBoost', 'accuracy_per_iter': [0.3028571428571429, 0.2950257289879931, 0.32361349342481416, 0.32532875929102345, 0.32018296169239563], 'accuracy_mean': 0.31340161725067384, 'accuracy_std': 0.013614192726363524, 'recall_mean': 0.31340161725067384, 'precision_mean': 0.09836885069021033, 'f1_mean': 0.14969818774699264}


  _warn_prf(average, modifier, msg_start, len(result))


In [22]:
results

Unnamed: 0,algorithm,accuracy_per_iter,accuracy_mean,accuracy_std,recall_mean,precision_mean,f1_mean
2,Naive Bayes,"[0.5828571428571429, 0.6054888507718696, 0.577...",0.582666,0.013953,0.582666,0.339656,0.429101
1,K Nearest Neighbour,"[0.3028571428571429, 0.6054888507718696, 0.323...",0.42741,0.151352,0.42741,0.201005,0.2682
3,Support Vector Machine,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.365317,0.120588,0.365317,0.14509,0.203744
4,Neural Network,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.365317,0.120588,0.365317,0.14509,0.203744
5,Decision Tree,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.365317,0.120588,0.365317,0.14509,0.203744
6,Random Forest,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.365317,0.120588,0.365317,0.14509,0.203744
0,Logistic Regression,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.313402,0.013614,0.313402,0.098369,0.149698
7,Gradient Boosting,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.313402,0.013614,0.313402,0.098369,0.149698
8,AdaBoost,"[0.3028571428571429, 0.2950257289879931, 0.323...",0.313402,0.013614,0.313402,0.098369,0.149698


## Select model with the best performance

In [24]:
tfidf_vect = TfidfVectorizer()
X_train = tfidf_vect.fit_transform(X_train)

model = MultinomialNB()
model.fit(X_train, y_train)

print("Training selesai")

pickle.dump(model, open("model.p", 'wb'))

Training selesai


In [25]:
X_test = tfidf_vect.transform(X_test)

test = model.predict(X_test)
print("Testing selesai")
print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

    negative       0.74      0.70      0.72       671
     neutral       0.98      0.20      0.33       229
    positive       0.82      0.95      0.88      1287

    accuracy                           0.80      2187
   macro avg       0.84      0.62      0.64      2187
weighted avg       0.81      0.80      0.77      2187



## Prediksi Data Baru

In [30]:
input_text = "kamu payah"

# normalize text
text = tfidf_vect.transform([cleansing(input_text)])

# predict data baru menggunakan model yang telah dibuat
result = model.predict(text)[0]

print("Sentiment:", result)

Sentiment: negative


In [31]:
input_text = "kamu hebat sekali"

# normalize text
text = tfidf_vect.transform([cleansing(input_text)])

# predict data baru menggunakan model yang telah dibuat
result = model.predict(text)[0]

print("Sentiment:", result)

Sentiment: positive
