# *Persiapan Dataset*

In [2]:
import pandas as pd

# Read the three CSV splits (test, training, validation) into DataFrames.
# The generator is consumed by the tuple unpacking, so the files are read
# in the order listed.
dataTest, dataTraining, dataValidation = (
    pd.read_csv(csv_path)
    for csv_path in (
        'input/emotion-dataset/test.csv',
        'input/emotion-dataset/training.csv',
        'input/emotion-dataset/validation.csv',
    )
)





In [3]:

# For every split: first drop rows containing missing values, then drop
# rows that are exact duplicates of an earlier row.
dataTest = dataTest.dropna().drop_duplicates()
dataTraining = dataTraining.dropna().drop_duplicates()
dataValidation = dataValidation.dropna().drop_duplicates()


In [4]:
from sklearn.preprocessing import LabelEncoder

# Encoder that maps the values of the 'label' column to integer codes.
label_encoder = LabelEncoder()

# Learn the mapping from the training labels only ...
y_train = label_encoder.fit_transform(dataTraining['label'])

# ... and reuse that same mapping for the test and validation splits.
y_test, y_val = (
    label_encoder.transform(split['label'])
    for split in (dataTest, dataValidation)
)

# Show the first few encoded training labels as a quick check.
print(y_train[:5])

[0 0 3 2 3]


In [5]:
from sklearn.preprocessing import LabelEncoder

# Encode the emotion labels for all three splits from the raw 'label' columns.
#
# The encoder is fitted on dataTraining['label'] rather than on the existing
# y_train array: fitting on an array that is itself the output of an earlier
# encoding would leave label_encoder.classes_ holding integer codes instead of
# the values stored in the data, so label_encoder.inverse_transform could no
# longer recover the original labels. Reading from the DataFrames also makes
# this cell idempotent (re-running it always yields the same result).
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(dataTraining['label'])
y_test = label_encoder.transform(dataTest['label'])
# The validation targets are encoded here too so that every split is
# guaranteed to share this one fitted encoder.
y_val = label_encoder.transform(dataValidation['label'])


In [7]:
import spacy

# Load spaCy's "en_core_web_sm" English pipeline; the resulting `nlp` object
# is used by clean_text_spacy below to tokenize text.
nlp = spacy.load("en_core_web_sm")

def clean_text_spacy(text):
    """Strip stop words, punctuation and digit tokens from `text`.

    The text is tokenized by calling the module-level `nlp` object. Every
    token flagged as `is_stop`, `is_punct` or `is_digit` is discarded, and
    the text of the remaining tokens is joined with single spaces.

    Args:
        text: The raw string to clean.

    Returns:
        The space-joined text of the tokens that were kept (an empty string
        when no token survives).
    """
    kept_words = []
    for tok in nlp(text):
        # Skip anything that is a stop word, punctuation mark or digit.
        if tok.is_stop or tok.is_punct or tok.is_digit:
            continue
        kept_words.append(tok.text)

    return ' '.join(kept_words)

# Add a 'cleaned_text' column to each split by running clean_text_spacy over
# its 'text' column (training first, then test, then validation).
for frame in (dataTraining, dataTest, dataValidation):
    frame['cleaned_text'] = frame['text'].apply(clean_text_spacy)

# Print the first rows of each split, raw text next to cleaned text, to
# verify the result.
for frame in (dataTraining, dataTest, dataValidation):
    print(frame[['text', 'cleaned_text']].head())


                                                text  \
0                            i didnt feel humiliated   
1  i can go from feeling so hopeless to so damned...   
2   im grabbing a minute to post i feel greedy wrong   
3  i am ever feeling nostalgic about the fireplac...   
4                               i am feeling grouchy   

                                  cleaned_text  
0                           nt feel humiliated  
1  feeling hopeless damned hopeful cares awake  
2     m grabbing minute post feel greedy wrong  
3    feeling nostalgic fireplace know property  
4                              feeling grouchy  
                                                text  \
0  im feeling rather rotten so im not very ambiti...   
1          im updating my blog because i feel shitty   
2  i never make her separate from me because i do...   
3  i left with my bouquet of red and yellow tulip...   
4    i was feeling a little vain when i did this one   

                                

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF vectorizer with the vocabulary capped at 5000 features.
vectorizer = TfidfVectorizer(max_features=5000)

# Fit the vectorizer on the training text only and transform it ...
X_train_tfidf = vectorizer.fit_transform(dataTraining['cleaned_text'])

# ... then project the test and validation text with the fitted vectorizer.
X_test_tfidf, X_val_tfidf = (
    vectorizer.transform(split['cleaned_text'])
    for split in (dataTest, dataValidation)
)

# Print the shape of the training matrix (rows, number of features).
print(X_train_tfidf.shape)

(15999, 5000)


In [9]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, accuracy_score

# Build a logistic-regression classifier (iteration cap of 1000) and fit it
# on the TF-IDF training matrix; fit() returns the fitted estimator itself.
model = LogisticRegression(max_iter=1000).fit(X_train_tfidf, y_train)

# Predict labels for the test split.
y_pred = model.predict(X_test_tfidf)

# Overall accuracy on the test split, printed with two decimals.
accuracy = accuracy_score(y_test, y_pred)
print(f'Akurasi: {accuracy:.2f}')

# Per-class precision / recall / F1 report for the test split.
print(classification_report(y_test, y_pred))

Akurasi: 0.88
              precision    recall  f1-score   support

           0       0.91      0.92      0.92       581
           1       0.86      0.95      0.90       695
           2       0.80      0.66      0.72       159
           3       0.89      0.84      0.86       275
           4       0.87      0.83      0.85       224
           5       0.87      0.52      0.65        66

    accuracy                           0.88      2000
   macro avg       0.87      0.79      0.82      2000
weighted avg       0.88      0.88      0.87      2000



In [10]:
from sklearn.metrics import classification_report

# Predict labels for the validation split with the fitted model.
y_val_pred = model.predict(X_val_tfidf)

# Per-class precision / recall / F1 report for the validation split.
print(classification_report(y_val, y_val_pred))

              precision    recall  f1-score   support

           0       0.89      0.93      0.91       550
           1       0.87      0.94      0.91       704
           2       0.89      0.73      0.80       178
           3       0.89      0.87      0.88       275
           4       0.87      0.75      0.81       212
           5       0.86      0.68      0.76        81

    accuracy                           0.88      2000
   macro avg       0.88      0.82      0.84      2000
weighted avg       0.88      0.88      0.88      2000



In [11]:
import joblib

# Persist the fitted classifier and the TF-IDF vectorizer to disk with joblib
# (files are written relative to the current working directory).
joblib.dump(model, 'emotion_model.pkl')
joblib.dump(vectorizer, 'vectorizer.pkl')

# Persist the object currently bound to `label_encoder` as well.
# This is the last expression of the cell, so its return value (the list of
# written file names) is what the notebook displays as the cell output.
joblib.dump(label_encoder, 'label_encoder.pkl')

['label_encoder.pkl']