In [19]:
import numpy as np 
import pandas as pd
import re
import sqlite3
import pickle # menyimpan model
from sklearn.model_selection import train_test_split # Untuk split data
from sklearn.neural_network import MLPClassifier # Untuk Algoritma ML yang akan di pakai
from sklearn.model_selection import GridSearchCV # untuk tuning hyperparameter
from nltk.corpus import stopwords # untuk stopwords
from sklearn.pipeline import Pipeline # untuk membangun pipeline ML
from sklearn.compose import ColumnTransformer # bagian dari pipe line untuk handling kolom 
from sklearn.metrics import f1_score, precision_score, recall_score, accuracy_score, classification_report# menghitung nilai f1
from sklearn.feature_extraction.text import CountVectorizer

In [20]:
conn = sqlite3.connect(r'..\database.db', check_same_thread=False)
query = 'SELECT * FROM new_data'
data = pd.read_sql_query (query, conn)
data.head()

Unnamed: 0,Tweet,Label
0,warung dimiliki pengusaha pabrik puluhan terke...,positive
1,mohon ulama lurus k mmbri hujjah partai diwlh ...,neutral
2,lokasi strategis jalan sumatra bandung nyaman ...,positive
3,betapa bahagia unboxing paket barang bagus men...,positive
4,aduh mahasiswa sombong kasih kartu kuning bela...,negative


In [21]:
# df_vektor = df.text_filter.tolist()
data_vektor = data.Tweet.tolist()

In [22]:
count_vect = CountVectorizer()
    
# melakukan fitting dan transformasi pada dokumen
count_vect.fit(data_vektor)

# melihat hasil representasi bag of words
X = count_vect.fit_transform(data_vektor)

In [23]:
pickle.dump(count_vect, open(r"..\NN_Files\feature_New.pickle", "wb"))

## Train - Test Data

In [24]:
y = data.Label

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state = 40)

`MLPClassifier` is a neural network algorithm that can be used for classification tasks. It has several parameters that can be tuned to improve its performance. Here is an explanation of the parameters in your `parameter_grid`:

- `hidden_layer_sizes`: This parameter specifies the number of neurons in each hidden layer of the neural network. It is a list of integers, where each integer represents the number of neurons in a hidden layer. For example, `[1, 10]` means that there are two hidden layers, one with 1 neuron and one with 10 neurons.

- `activation`: This parameter specifies the activation function used in the neural network. The activation function is used to introduce non-linearity into the model. The possible values for this parameter are `'relu'`, `'tanh'`, and `'logistic'`.

- `learning_rate_init`: This parameter specifies the initial learning rate used by the neural network. The learning rate determines how quickly the model learns from the data.

- `alpha`: This parameter specifies the L2 penalty (regularization term) parameter. It is used to prevent overfitting of the model.

- `early_stopping`: This parameter specifies whether to use early stopping to terminate training when validation score is not improving. If set to `True`, training will stop when validation score is not improving by at least `tol` for `n_iter_no_change` consecutive iterations.

In [26]:
model = Pipeline([ ('algoritma', MLPClassifier()) ])
parameter_grid = {
    'algoritma__hidden_layer_sizes': [{i} for i in [1, 10]],
    'algoritma__activation': ['relu','tanh','logistic'],
    'algoritma__learning_rate_init' : [0.01],
    'algoritma__alpha': [0.1,0.01,1],
    'algoritma__early_stopping': [True]
} 

model_NN = GridSearchCV(model, parameter_grid)

In [27]:
%%time
model_NN.fit(X_train,y_train)

CPU times: total: 7.2 s
Wall time: 2min 18s


In [28]:
model_NN.best_params_ 

{'algoritma__activation': 'tanh',
 'algoritma__alpha': 0.1,
 'algoritma__early_stopping': True,
 'algoritma__hidden_layer_sizes': {10},
 'algoritma__learning_rate_init': 0.01}

In [30]:
pickle.dump(model_NN, open(r"..\NN_Files\model_NN.pickle", 'wb'))

DATA TRAIN - DATASET

In [31]:
y_pred = model_NN.predict(X)

In [32]:
print(classification_report(y,y_pred))

              precision    recall  f1-score   support

    negative       0.89      0.92      0.90      3412
     neutral       0.94      0.87      0.90      1138
    positive       0.95      0.95      0.95      6383

    accuracy                           0.93     10933
   macro avg       0.93      0.91      0.92     10933
weighted avg       0.93      0.93      0.93     10933



DATA TRAIN - DATA TRAIN

In [33]:
y_pred_train = model_NN.predict(X_train)

In [34]:
print(classification_report(y_train, y_pred_train))

              precision    recall  f1-score   support

    negative       0.93      0.95      0.94      2725
     neutral       0.97      0.94      0.96       907
    positive       0.97      0.97      0.97      5114

    accuracy                           0.96      8746
   macro avg       0.96      0.95      0.95      8746
weighted avg       0.96      0.96      0.96      8746



DATA TRAIN - DATA TEST

In [17]:
y_pred_test = model_NN.predict(X_test)

In [18]:
print(classification_report(y_test, y_pred_test))

              precision    recall  f1-score   support

    negative       0.81      0.84      0.82       687
     neutral       0.84      0.73      0.78       231
    positive       0.91      0.91      0.91      1269

    accuracy                           0.87      2187
   macro avg       0.85      0.83      0.84      2187
weighted avg       0.87      0.87      0.87      2187



CROSS VALIDATION - WITHOUT PIPELINE PARAMETER

In [35]:
import numpy as np
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.model_selection import KFold

kf = KFold(n_splits=5, random_state=42,shuffle=True)
accuracies = []

for iteration, data in enumerate(kf.split(X), start=1):

  data_train = X[data[0]]
  target_train = y[data[0]]

  data_test = X[data[1]]
  target_test = y[data[1]]

  # Create model architecture
  model = MLPClassifier()
  model.fit(data_train, target_train)

  predictions = model.predict(data_test)
  y_pred = predictions

  # for the current fold only
  accuracy = accuracy_score(target_test,predictions)

  print("Training ke-", iteration)
  print(classification_report(target_test,predictions))
  print("======================================================================")

  accuracies.append(accuracy)

# this is the average accuracy over all folds
average_accuracy = np.mean(accuracies)

print()
print()
print()
print("Rata-rata Accuracy: ", average_accuracy)

Training ke- 1
              precision    recall  f1-score   support

    negative       0.70      0.72      0.71       690
     neutral       0.69      0.52      0.59       220
    positive       0.84      0.86      0.85      1277

    accuracy                           0.78      2187
   macro avg       0.74      0.70      0.72      2187
weighted avg       0.78      0.78      0.78      2187

Training ke- 2
              precision    recall  f1-score   support

    negative       0.71      0.71      0.71       667
     neutral       0.70      0.58      0.64       219
    positive       0.85      0.87      0.86      1301

    accuracy                           0.79      2187
   macro avg       0.75      0.72      0.74      2187
weighted avg       0.79      0.79      0.79      2187

Training ke- 3
              precision    recall  f1-score   support

    negative       0.72      0.70      0.71       695
     neutral       0.72      0.60      0.65       213
    positive       0.85      0

CROSS VALIDATION - WITH PIPELINE PARAMETER

This code will split your dataset into n_splits folds, train an MLPClassifier model on each fold, and test the model on the remaining folds. The GridSearchCV object will perform a search over the parameter grid to find the best hyperparameters for the model. The best hyperparameters will be used to create a new MLPClassifier object, which will be used to obtain the predictions using cross_val_predict. Finally, the confusion matrix, f1 score, precision score, recall score, and accuracy score will be computed and printed.

In [36]:
from sklearn.model_selection import KFold, GridSearchCV
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import cross_val_predict

kf = KFold(n_splits=5, random_state=42,shuffle=True)
accuracies = []

for iteration, data in enumerate(kf.split(X), start=1):

  data_train = X[data[0]]
  target_train = y[data[0]]

  data_test = X[data[1]]
  target_test = y[data[1]]

  # Create an MLPClassifier object
  clf = MLPClassifier()

  # Create a pipeline object
  pipeline = Pipeline(steps=[('algoritma', clf)])

  # Define the parameter grid
  parameter_grid = {
      'algoritma__hidden_layer_sizes': [{i} for i in [1, 10]],
      'algoritma__activation': ['relu','tanh','logistic'],
      'algoritma__learning_rate_init' : [0.01],
      'algoritma__alpha': [0.1,0.01,1],
      'algoritma__early_stopping': [True]
  }

  # Create a GridSearchCV object
  grid_search = GridSearchCV(pipeline, parameter_grid, cv=kf)

  # Fit the GridSearchCV object to the data
  grid_search.fit(data_train, target_train)

  predictions = grid_search.predict(data_test)

  # Use cross_val_predict to obtain the predictions
  y_prediction = cross_val_predict(grid_search.best_estimator_, data_train, target_train, cv=kf)

  # for the current fold only

  accuracy = accuracy_score(target_test,predictions)

  print("Training ke-", iteration)
  print(classification_report(target_test,predictions))
  print("======================================================================")

  accuracies.append(accuracy)

  # Compute the confusion matrix
  conf_mat = confusion_matrix(target_train, y_prediction)

  # Print the confusion matrix
  print("Confusion matrix:\n", conf_mat)

  print("Best parameters: ", grid_search.best_params_)

# this is the average accuracy over all folds

average_accuracy = np.mean(accuracies)

print()
print()
print()
print("Rata-rata Accuracy: ", average_accuracy)

# Print the best parameters
print("Best parameters: ", grid_search.best_params_)



Training ke- 1
              precision    recall  f1-score   support

    negative       0.81      0.79      0.80       690
     neutral       0.78      0.74      0.76       220
    positive       0.90      0.92      0.91      1277

    accuracy                           0.86      2187
   macro avg       0.83      0.82      0.82      2187
weighted avg       0.86      0.86      0.86      2187

Confusion matrix:
 [[2153  105  464]
 [ 274  534  110]
 [ 394   55 4657]]
Best parameters:  {'algoritma__activation': 'relu', 'algoritma__alpha': 0.1, 'algoritma__early_stopping': True, 'algoritma__hidden_layer_sizes': {10}, 'algoritma__learning_rate_init': 0.01}
Training ke- 2
              precision    recall  f1-score   support

    negative       0.75      0.83      0.79       667
     neutral       0.82      0.61      0.70       219
    positive       0.92      0.91      0.91      1301

    accuracy                           0.85      2187
   macro avg       0.83      0.78      0.80      2187

In [38]:
input_text = """Dia memakan kue dengan lahap dan beringas jelek"""
def cleansing(text):
    text = text.lower()
    text = re.sub(r'((www\.[^\s]+)|(https?://[^\s]+)|(http?://[^\s]+))', ' ', text)
    text = re.sub(r'pic.twitter.com.[\w]+', ' ', text)
    text = re.sub(r'[^a-z\s]', ' ', text)
    text = text.replace('user', '')
    text = re.sub(' +', ' ', text)
    text = text.replace('\n', ' ')
    text = re.sub('url',' ', text)
    return text

sentiment = ['negative', 'neutral', 'positive']

text = [cleansing(input_text)]
text = count_vect.transform(text)

count_vect = pickle.load(open(r"..\NN_Files\feature_New.pickle", "rb"))
model_NN = pickle.load(open(r"..\NN_Files\model_NN.pickle", "rb"))

prediction = model_NN.predict(text)
polarity = np.argmax(prediction[0])
hasil = sentiment[polarity]

print("Text: %s" % text[0])
print("Sentiment: %s" % sentiment[polarity])

Text:   (0, 5732)	1
  (0, 7036)	1
  (0, 7111)	1
  (0, 8096)	1
Sentiment: negative
