In [1]:
import pandas as pd
import re
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split, KFold
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report, accuracy_score, recall_score, precision_score, f1_score
from statistics import mean, stdev

# Prepare Dataset

In [2]:
# import dataset
df = pd.read_csv('DATA/train_preprocess.tsv.txt', sep='\t', names=['Tweet','Label'])
df.head()

Unnamed: 0,Tweet,Label
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral
2,lokasi strategis di jalan sumatera bandung . t...,positive
3,betapa bahagia nya diri ini saat unboxing pake...,positive
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative


In [3]:
df.Label.value_counts()

Label
positive    6416
negative    3436
neutral     1148
Name: count, dtype: int64

In [4]:
# Menghapus duplikat pada kolom "tweet"
df = df.drop_duplicates(subset='Tweet')

# Memeriksa data setelah menghapus duplikat
print("\nData setelah menghapus duplikat:")
print(df)


Data setelah menghapus duplikat:
                                                   Tweet     Label
0      warung ini dimiliki oleh pengusaha pabrik tahu...  positive
1      mohon ulama lurus dan k212 mmbri hujjah partai...   neutral
2      lokasi strategis di jalan sumatera bandung . t...  positive
3      betapa bahagia nya diri ini saat unboxing pake...  positive
4      duh . jadi mahasiswa jangan sombong dong . kas...  negative
...                                                  ...       ...
10993  f - demokrat dorong upaya kemandirian energi n...   neutral
10994                                        tidak bosan  positive
10996  enak rasa masakan nya apalagi kepiting yang me...  positive
10998  pagi pagi di tol pasteur sudah macet parah , b...  negative
10999  meskipun sering belanja ke yogya di riau junct...  positive

[10933 rows x 2 columns]


# Text Normalization

In [5]:
def cleansing(sent):
    # Mengubah kata menjadi huruf kecil semua dengan menggunakan fungsi lower()
    string = sent.lower()

    # Menghapus emoticon dan tanda baca menggunakan "RegEx" dengan script di bawah
    string = re.sub(r'(?:\@|http?\://|https?\://|www)\S+', '', string) #menghapus https dan http
    string = re.sub('<.*?>', ' ', string) #mengganti karakter html dengan tanda petik
    string = re.sub('[^0-9a-zA-Z]+', ' ', string) #menghilangkan semua karakter yang bukan huruf atau angka dan menggantinya dengan spasi.
    string = re.sub('\n',' ',string) #mengganti line baru dengan spasi
    string = re.sub(r':', ' ', string) #menggantikan karakter : dengan spasi 
    string = re.sub('gue','saya', string) # Mengganti kata "gue" dengan kata "saya"
    string = re.sub(r'\b[a-zA-Z]\b', ' ', string) #menghapus single char
    string = ' '.join(string.split()) #memisahkan dan menggabungkan kata
    string = string.strip() #menghilangkan whitespace di awal dan di akhir teks
    string = re.sub(r'pic.twitter.com.[\w]+', '', string) #menghapus link picture
    string = re.sub(r'\buser\b',' ', string) #menghapus kata 'user'
    string = re.sub(r'\brt\b',' ', string) #menghapus awalan rt
    string = re.sub('RT',' ', string) #menghapus RT simbol
    string = re.sub(r'‚Ä¶', '', string) #menghapus simbol tidak perlu

    # Lematisasi menggunakan Sastrawi
    stemmer_factory = StemmerFactory()
    stemmer = stemmer_factory.create_stemmer()
    string = stemmer.stem(string)

    # Menghapus stop words menggunakan Sastrawi
    stopword_factory = StopWordRemoverFactory()
    stopword_remover = stopword_factory.create_stop_word_remover()
    string = stopword_remover.remove(string)
    
    return string

In [6]:
df['Tweet_Clean'] = df.Tweet.apply(cleansing)
df

Unnamed: 0,Tweet,Label,Tweet_Clean
0,warung ini dimiliki oleh pengusaha pabrik tahu...,positive,warung ini dimiliki oleh pengusaha pabrik tahu...
1,mohon ulama lurus dan k212 mmbri hujjah partai...,neutral,mohon ulama lurus dan k212 mmbri hujjah partai...
2,lokasi strategis di jalan sumatera bandung . t...,positive,lokasi strategis di jalan sumatera bandung tem...
3,betapa bahagia nya diri ini saat unboxing pake...,positive,betapa bahagia nya diri ini saat unboxing pake...
4,duh . jadi mahasiswa jangan sombong dong . kas...,negative,duh jadi mahasiswa jangan sombong dong kasih k...
...,...,...,...
10993,f - demokrat dorong upaya kemandirian energi n...,neutral,demokrat dorong upaya kemandirian energi nasional
10994,tidak bosan,positive,tidak bosan
10996,enak rasa masakan nya apalagi kepiting yang me...,positive,enak rasa masakan nya apalagi kepiting yang me...
10998,"pagi pagi di tol pasteur sudah macet parah , b...",negative,pagi pagi di tol pasteur sudah macet parah bik...


# Feature Extraction

## TF-IDF

In [7]:
data_preprocessed = df.Tweet_Clean.tolist()

In [8]:
# Proses feature extraction
tfidf_vect = TfidfVectorizer()

X = tfidf_vect.fit_transform(data_preprocessed)
print("Feature extraction selesai")

Feature extraction selesai


In [9]:
tfidf_array = X.toarray()
df_array = pd.DataFrame(data=tfidf_array, columns=tfidf_vect.get_feature_names_out())
print(df_array)

        00  000  001   01  010  0111  011770465655617   02  021  022  ...  \
0      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
1      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
2      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
3      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
4      0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
...    ...  ...  ...  ...  ...   ...              ...  ...  ...  ...  ...   
10928  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10929  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10930  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10931  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   
10932  0.0  0.0  0.0  0.0  0.0   0.0              0.0  0.0  0.0  0.0  ...   

       zonpoliticon  zoo  zoom  zubir  zulfikri  zulkarnain  zup  zupa  zup

## Prepare Train and Test Dataset (Split Dataset)

In [10]:
classes = df.Label

# split dataset menjadi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.2)

In [11]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(8746, 17237)
(2187, 17237)
(8746,)
(2187,)


# Model Training

## Neural Network

In [12]:
# model training menggunakan neural network
model = MLPClassifier()
model.fit(X_train, y_train)

print("Training selesai")

Training selesai


In [13]:
pickle.dump(model, open("model.p", "wb"))

# Model Evaluation

In [14]:
test = model.predict(X_test)

print("Testing selesai")
print(classification_report(y_test, test))

Testing selesai
              precision    recall  f1-score   support

    negative       0.78      0.78      0.78       669
     neutral       0.85      0.67      0.75       232
    positive       0.88      0.91      0.89      1286

    accuracy                           0.84      2187
   macro avg       0.84      0.79      0.81      2187
weighted avg       0.84      0.84      0.84      2187



## Cross Validation

In [15]:
X = df[["Tweet_Clean"]] #input feature
Y = df["Label"] #output feature

# split dataset menjadi 80% untuk train dan 20% untuk test
X_train, X_test, y_train, y_test = train_test_split(X, classes, test_size=0.2)

In [20]:
# cross validation dengan split = 5
kf = KFold(n_splits=5, random_state=42, shuffle=True)

accuracies = []

def cross_validation(k, X_train, model, name):
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    # iterate sebanyak k_cross validation
    for iteration, data in enumerate(kf.split(X_train), start=1):
        # mengambil data dan target train
        data_train = X_train[data[0]]
        tfidf_vect = TfidfVectorizer()
        data_train = tfidf_vect.fit_transform(data_train)
        target_train = y_train[data[0]]

        # mengambil data dan target test
        data_test = y_train[data[1]]
        data_test = tfidf_vect.fit_transform(data_test)
        target_test = y_train[data[1]]

        # model training menggunakan data train
        classifier = model
        classifier.fit(data_train, target_train)

        # prediksi data test
        preds = classifier.predict(data_test)

        # menghitung accuracy
        accuracy = accuracy_score(target_test, preds)
        precision = precision_score(target_test, preds)
        recall = recall_score(target_test, preds)
        f1 = f1_score(target_test, preds)

        print("Training ke-", iteration)
        print(classification_report(target_test, preds))
        print("=================================================================")

    result = {'algorithm': name,
                'accuracy_per_iter': accuracies, 'accuracy_mean': mean(accuracies), 'accuracy_std': stdev(accuracies),
                'recall_mean': mean(recalls),'precision_mean': mean(precisions), 'f1_mean': mean(f1_scores)}
    return result

In [21]:
result = cross_validation(5, X_train, MLPClassifier(), "Neural Network")
result

ValueError: X has 3 features, but MLPClassifier is expecting 13786 features as input.