In [41]:
import numpy as np
import pandas as pd
import re
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))

In [42]:
data = pd.read_csv('tweet_emotions.csv')
data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,@tiffanylue i know i was listenin to bad habi...
1,1956967666,sadness,Layin n bed with a headache ughhhh...waitin o...
2,1956967696,sadness,Funeral ceremony...gloomy friday...
3,1956967789,enthusiasm,wants to hang out with friends SOON!
4,1956968416,neutral,@dannycastillo We want to trade with someone w...


<h1>Pra Pengolahan Data<h1>

In [43]:
def data_processing(content):
    # Case folding (lowercase kata)
    content = content.lower()

    # Filtering (filtering kata)
    content = re.sub(r"https\S+|www\S+http\S+",'', content, flags=re.MULTILINE)
    content = re.sub(r'\@w+|\#','', content)
    content = re.sub(r'[^\w\s]','', content)

    # Tokenizing (memisahkan kata)
    content_token = word_tokenize(content)
    filtered_content = [w for w in content_token if not w in stop_words]
    data = " ".join(filtered_content)
    # Steamming (membakukan kata)
    stemmer = PorterStemmer()
    content = [stemmer.stem(word) for word in data]
    return data

data.content = data['content'].apply(data_processing)

data.info()
data.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40000 entries, 0 to 39999
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweet_id   40000 non-null  int64 
 1   sentiment  40000 non-null  object
 2   content    40000 non-null  object
dtypes: int64(1), object(2)
memory usage: 937.6+ KB


Unnamed: 0,tweet_id,sentiment,content
0,1956967341,empty,tiffanylue know listenin bad habit earlier sta...
1,1956967666,sadness,layin n bed headache ughhhhwaitin call
2,1956967696,sadness,funeral ceremonygloomy friday
3,1956967789,enthusiasm,wants hang friends soon
4,1956968416,neutral,dannycastillo want trade someone houston ticke...


<h1>Ekstraksi Fitur</h1>
<b>Menggunakan CountVectorizer</b>

In [44]:
from sklearn.feature_extraction.text import CountVectorizer

from sklearn.model_selection import train_test_split

# Ambil data x dan y dari data yang sudah di prosesing
x = data['content']
y = data['sentiment']

x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=50)

# Inisiasi CountVectorizer
cv = CountVectorizer()
x_train = cv.fit_transform(x_train)
x_test = cv.transform(x_test)

<h1>Pembuatan Model</h1>
<b>Menggunakan Naive Bayes</b>

In [45]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, classification_report

# Inisiasi MultinomialNB
mnb = MultinomialNB()

# Fit model
mnb.fit(x_train, y_train)

# Prediksi dengan data training
y_pred_train = mnb.predict(x_train)

# Evaluasi akurasi data training
acc_train = accuracy_score(y_train, y_pred_train)

# Prediksi dengan data training
y_pred_test = mnb.predict(x_test)

<h1>Evaluasi Model</h1>
<b>Menghitung Akurasi Presisi dan Recall</b>

In [46]:
# Evaluasi akurasi data training
acc_test = accuracy_score(y_test, y_pred_test)

# Menghitung Precission
precission = precision_score(y_test, y_pred_test, average='weighted',zero_division=0)

print(classification_report(y_test, y_pred_test, zero_division=0))

# Print hasil evaluasi
print(f'Hasil akurasi data train: {acc_train}')
print(f'Hasil akurasi data test: {acc_test}')
print(f'Hasil presisi data test: {precission}')

              precision    recall  f1-score   support

       anger       0.00      0.00      0.00        17
     boredom       0.00      0.00      0.00        30
       empty       1.00      0.01      0.01       174
  enthusiasm       0.00      0.00      0.00       153
         fun       0.00      0.00      0.00       347
   happiness       0.33      0.32      0.32      1019
        hate       0.00      0.00      0.00       259
        love       0.50      0.30      0.37       732
     neutral       0.35      0.38      0.36      1774
      relief       0.00      0.00      0.00       306
     sadness       0.30      0.14      0.19       995
    surprise       0.00      0.00      0.00       468
       worry       0.29      0.70      0.41      1726

    accuracy                           0.32      8000
   macro avg       0.21      0.14      0.13      8000
weighted avg       0.29      0.32      0.27      8000

Hasil akurasi data train: 0.6056875
Hasil akurasi data test: 0.320625
Hasil pre