## Importing Necessary Libraries 

In [21]:
import pandas as pd
import numpy as np
import re
import joblib
import matplotlib.pyplot as plt
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from gensim.models import Word2Vec
from transformers import BertTokenizer
from wordcloud import WordCloud
from textblob import TextBlob
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical

nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\AMRINA\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\AMRINA\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

## Loading Dataset 

In [22]:
df_clean = pd.read_csv('Data/data_clean.csv')
df_clean

Unnamed: 0.1,Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,lang,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username,translated_text
0,0,1.800000e+18,Fri Jun 14 06:00:09 +0000 2024,20853,NARCISSISM JAEMIN 1st PHOTO EXHIBITION OPEN 29...,1.800000e+18,ko,355,475,13150,https://x.com/NCTsmtown/status/180149481786990...,4.811011e+09,NCTsmtown,NARCISSISM JAEMIN 1st PHOTO EXHIBITION OPEN 29...
1,1,1.800000e+18,Fri Jun 14 01:44:55 +0000 2024,46197,첫 개인 사진전 6월 29일오픈 사진 작가로 데뷔 재민의 독창적인 감성으로 팬들과 ...,1.800000e+18,ko,3011,1600,27395,https://x.com/NCTsmtown/status/180143058501101...,4.811011e+09,NCTsmtown,First solo photo exhibition opens on June 29 D...
2,2,1.800000e+18,Thu Jun 13 06:19:50 +0000 2024,1670,NARCISSISM PHOTOGRAPHER NA JAEMIN 1st PHOTO EX...,1.800000e+18,en,57,0,1023,https://x.com/jamong1323/status/18011373840522...,1.420000e+18,jamong1323,NARCISSISM PHOTOGRAPHER NA JAEMIN 1st PHOTO EX...
3,3,1.800000e+18,Mon Jun 17 15:25:10 +0000 2024,4855,Jaemin got casted while volunteering and till ...,1.800000e+18,en,109,8,2423,https://x.com/user13082000_/status/18027241739...,1.470000e+18,user13082000_,Jaemin got casted while volunteering and till ...
4,4,1.800000e+18,Thu Jun 13 10:38:04 +0000 2024,288,Mini Support Project for NARCISSISM JAEMIN 1st...,1.800000e+18,ko,32,1,255,https://x.com/najaeminnesia/status/18012023710...,1.340000e+18,najaeminnesia,Mini Support Project for NARCISSISM JAEMIN 1st...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1256,1261,1.820000e+18,Tue Jul 30 00:04:09 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/user13082000_/status/18180750710...,1.470000e+18,user13082000_,NARCISSISM BY NA JAEMIN
1257,1262,1.820000e+18,Tue Jul 30 00:04:20 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/Sailorneww/status/18180751175175...,4.739379e+08,Sailorneww,NARCISSISM BY NA JAEMIN
1258,1263,1.820000e+18,Tue Jul 30 00:00:10 +0000 2024,0,D 32 NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/jlynrj/status/1818074066542272684,1.430000e+18,jlynrj,D 32 NARCISSISM BY NA JAEMIN
1259,1264,1.820000e+18,Tue Jul 30 00:04:13 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/ffrongg_/status/1818075085485523352,1.260000e+18,ffrongg_,NARCISSISM BY NA JAEMIN


## Data Preprocessing 

In [23]:
# Defining dictionary containing all emojis with their meanings.
emojis = {':)': 'smile', ':-)': 'smile', ';d': 'wink', ':-E': 'vampire', ':(': 'sad', 
          ':-(': 'sad', ':-<': 'sad', ':P': 'raspberry', ':O': 'surprised',
          ':-@': 'shocked', ':@': 'shocked',':-$': 'confused', ':\\': 'annoyed', 
          ':#': 'mute', ':X': 'mute', ':^)': 'smile', ':-&': 'confused', '$_$': 'greedy',
          '@@': 'eyeroll', ':-!': 'confused', ':-D': 'smile', ':-0': 'yell', 'O.o': 'confused',
          '<(-_-)>': 'robot', 'd[-_-]b': 'dj', ":'-)": 'sadsmile', ';)': 'wink', 
          ';-)': 'wink', 'O:-)': 'angel','O*-)': 'angel','(:-D': 'gossip', '=^.^=': 'cat'}

In [24]:
def remove_emojis(text):
    if isinstance(text, str):  # Check if it's a string
        for emoji in emojis:
            text = text.replace(emoji, '')
            return text

In [25]:
df_clean['translated_text'] = df_clean['translated_text'].apply(remove_emojis)

In [26]:
def clean_text(df, column_name):
    df[column_name] = df[column_name].fillna('').astype(str).str.replace(r'(.)\1', r'\1', regex=True)
    df[column_name] = df[column_name].str.replace(r'[^a-zA-Z0-9]', ' ', regex=True)  # Apply non-alphanumeric replacement
    return df

In [27]:
df_clean = clean_text(df_clean.copy(), 'translated_text')

In [28]:
df_clean

Unnamed: 0.1,Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,lang,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username,translated_text
0,0,1.800000e+18,Fri Jun 14 06:00:09 +0000 2024,20853,NARCISSISM JAEMIN 1st PHOTO EXHIBITION OPEN 29...,1.800000e+18,ko,355,475,13150,https://x.com/NCTsmtown/status/180149481786990...,4.811011e+09,NCTsmtown,NARCISISM JAEMIN 1st PHOTO EXHIBITION OPEN 29 ...
1,1,1.800000e+18,Fri Jun 14 01:44:55 +0000 2024,46197,첫 개인 사진전 6월 29일오픈 사진 작가로 데뷔 재민의 독창적인 감성으로 팬들과 ...,1.800000e+18,ko,3011,1600,27395,https://x.com/NCTsmtown/status/180143058501101...,4.811011e+09,NCTsmtown,First solo photo exhibition opens on June 29 D...
2,2,1.800000e+18,Thu Jun 13 06:19:50 +0000 2024,1670,NARCISSISM PHOTOGRAPHER NA JAEMIN 1st PHOTO EX...,1.800000e+18,en,57,0,1023,https://x.com/jamong1323/status/18011373840522...,1.420000e+18,jamong1323,NARCISISM PHOTOGRAPHER NA JAEMIN 1st PHOTO EXH...
3,3,1.800000e+18,Mon Jun 17 15:25:10 +0000 2024,4855,Jaemin got casted while volunteering and till ...,1.800000e+18,en,109,8,2423,https://x.com/user13082000_/status/18027241739...,1.470000e+18,user13082000_,Jaemin got casted while voluntering and til th...
4,4,1.800000e+18,Thu Jun 13 10:38:04 +0000 2024,288,Mini Support Project for NARCISSISM JAEMIN 1st...,1.800000e+18,ko,32,1,255,https://x.com/najaeminnesia/status/18012023710...,1.340000e+18,najaeminnesia,Mini Suport Project for NARCISISM JAEMIN 1st P...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1256,1261,1.820000e+18,Tue Jul 30 00:04:09 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/user13082000_/status/18180750710...,1.470000e+18,user13082000_,NARCISISM BY NA JAEMIN
1257,1262,1.820000e+18,Tue Jul 30 00:04:20 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/Sailorneww/status/18180751175175...,4.739379e+08,Sailorneww,NARCISISM BY NA JAEMIN
1258,1263,1.820000e+18,Tue Jul 30 00:00:10 +0000 2024,0,D 32 NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/jlynrj/status/1818074066542272684,1.430000e+18,jlynrj,D 32 NARCISISM BY NA JAEMIN
1259,1264,1.820000e+18,Tue Jul 30 00:04:13 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/ffrongg_/status/1818075085485523352,1.260000e+18,ffrongg_,NARCISISM BY NA JAEMIN


In [29]:
def preprocess_text(text):
    # Lowercase
    text = text.lower()
    
    # Tokenization
    words = text.split()
    
    # Stopwords removal
    stop_words = set(stopwords.words('english'))  # Ganti 'english' dengan bahasa yang sesuai
    words = [word for word in words if word not in stop_words]
    
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    words = [lemmatizer.lemmatize(word) for word in words]
    
    # Join words back to text
    text = ' '.join(words)
    
    return text

In [30]:
nltk.download('omw-1.4')
df_clean['text'] = df_clean['translated_text'].apply(preprocess_text)

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\AMRINA\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [31]:
# Replacing short words
df_clean['text'] = df_clean['text'].apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))

In [32]:
# Kata-kata yang ingin dihapus
kata_dihapus = ['po', 'pre order', 'md', 'wtb', 'want to buy', 'iso', 'open', 'pre', 'shiping', 'ver', 'mention', 'tag', 'day', 'd23', 'freebies', 'giveaway']

# Fungsi lambda untuk menghapus kata
def hapus_kata(text):
    for kata in kata_dihapus:
        text = text.replace(kata, '')
    return text

# Hapus kata-kata menggunakan lambda
df_clean['text'] = df_clean['text'].apply(hapus_kata)

In [33]:
df_clean

Unnamed: 0.1,Unnamed: 0,conversation_id_str,created_at,favorite_count,full_text,id_str,lang,quote_count,reply_count,retweet_count,tweet_url,user_id_str,username,translated_text,text
0,0,1.800000e+18,Fri Jun 14 06:00:09 +0000 2024,20853,NARCISSISM JAEMIN 1st PHOTO EXHIBITION OPEN 29...,1.800000e+18,ko,355,475,13150,https://x.com/NCTsmtown/status/180149481786990...,4.811011e+09,NCTsmtown,NARCISISM JAEMIN 1st PHOTO EXHIBITION OPEN 29 ...,narcisism jaemin 1st photo exhibition june 20...
1,1,1.800000e+18,Fri Jun 14 01:44:55 +0000 2024,46197,첫 개인 사진전 6월 29일오픈 사진 작가로 데뷔 재민의 독창적인 감성으로 팬들과 ...,1.800000e+18,ko,3011,1600,27395,https://x.com/NCTsmtown/status/180143058501101...,4.811011e+09,NCTsmtown,First solo photo exhibition opens on June 29 D...,first solo photo exhibition june debut photog...
2,2,1.800000e+18,Thu Jun 13 06:19:50 +0000 2024,1670,NARCISSISM PHOTOGRAPHER NA JAEMIN 1st PHOTO EX...,1.800000e+18,en,57,0,1023,https://x.com/jamong1323/status/18011373840522...,1.420000e+18,jamong1323,NARCISISM PHOTOGRAPHER NA JAEMIN 1st PHOTO EXH...,narcisism photographer jaemin 1st photo exhibi...
3,3,1.800000e+18,Mon Jun 17 15:25:10 +0000 2024,4855,Jaemin got casted while volunteering and till ...,1.800000e+18,en,109,8,2423,https://x.com/user13082000_/status/18027241739...,1.470000e+18,user13082000_,Jaemin got casted while voluntering and til th...,jaemin got casted voluntering til stil contin...
4,4,1.800000e+18,Thu Jun 13 10:38:04 +0000 2024,288,Mini Support Project for NARCISSISM JAEMIN 1st...,1.800000e+18,ko,32,1,255,https://x.com/najaeminnesia/status/18012023710...,1.340000e+18,najaeminnesia,Mini Suport Project for NARCISISM JAEMIN 1st P...,mini surt project narcisism jaemin 1st photo e...
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1256,1261,1.820000e+18,Tue Jul 30 00:04:09 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/user13082000_/status/18180750710...,1.470000e+18,user13082000_,NARCISISM BY NA JAEMIN,narcisism jaemin
1257,1262,1.820000e+18,Tue Jul 30 00:04:20 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/Sailorneww/status/18180751175175...,4.739379e+08,Sailorneww,NARCISISM BY NA JAEMIN,narcisism jaemin
1258,1263,1.820000e+18,Tue Jul 30 00:00:10 +0000 2024,0,D 32 NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/jlynrj/status/1818074066542272684,1.430000e+18,jlynrj,D 32 NARCISISM BY NA JAEMIN,narcisism jaemin
1259,1264,1.820000e+18,Tue Jul 30 00:04:13 +0000 2024,0,NARCISSISM BY NA JAEMIN,1.820000e+18,tl,0,0,0,https://x.com/ffrongg_/status/1818075085485523352,1.260000e+18,ffrongg_,NARCISISM BY NA JAEMIN,narcisism jaemin


In [34]:
df_clean.drop(df_clean.columns[[1,2,3,4,5,6,7,8,9,10,11,12,13]],axis=1, inplace=True)

In [35]:
df_clean

Unnamed: 0.1,Unnamed: 0,text
0,0,narcisism jaemin 1st photo exhibition june 20...
1,1,first solo photo exhibition june debut photog...
2,2,narcisism photographer jaemin 1st photo exhibi...
3,3,jaemin got casted voluntering til stil contin...
4,4,mini surt project narcisism jaemin 1st photo e...
...,...,...
1256,1261,narcisism jaemin
1257,1262,narcisism jaemin
1258,1263,narcisism jaemin
1259,1264,narcisism jaemin


In [36]:
df_clean.to_csv('Data/data_label.csv')