## Label Encoding

In [None]:
# Integer codes assigned to the sentiment strings in df['sentimen'].
sentiment_mapping = {'negatif': 0, 'netral': 1, 'positif': 2}

df['sentimen_labels'] = df['sentimen'].map(sentiment_mapping)

# Series.map leaves NaN for any value that is not a key of the mapping; stop here
# rather than let NaN labels flow into the train/test split and the classifiers.
assert df['sentimen_labels'].notna().all(), (
    "Unmapped sentiment values: "
    f"{df.loc[df['sentimen_labels'].isna(), 'sentimen'].unique().tolist()}"
)
df

Unnamed: 0,sentimen,tweet,hashtags,mentions,tweet_length,sentimen_labels
0,negatif,indonesia dihargai bangsa asing berita hoax bu...,[],"[@prabowo, @rockygerung]",195,0
1,netral,batuan langka tasbih jokowi hadiah habib luthf...,[],[],93,1
2,netral,era jokowi ekonomi indonesia,"[#01IndonesiaMaju, #JokowiLagi, #JokowiMenangT...",[],126,1
3,positif,sumatera selatan asian games berdampak pd ekon...,[],[],128,2
4,negatif,negara ngutang bngun infrastruktur dipake masy...,[],[],244,0
...,...,...,...,...,...,...
1810,netral,negarawan sejati sll bangga mengedepankan harg...,[#2019GantiPresiden],[],99,1
1811,netral,1 hrs ceramah damai indonesiaku 2 perekonomian...,[#PutihkanGBK],[],80,1
1812,netral,mari bangun bangsa dgn mendukung perekonomian ...,[],[],148,1
1813,netral,bantu majukan perekonomian bangsa jokowi yuk,[],[],121,1


## Feature Extraction / Text Vectorizer

<b>Using Bag-of-Words (BoW) 1-grams [CountVectorizer]</b>

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

# Word-level bag-of-words; ngram_range=(1, 1) restricts features to single tokens.
bow_vectorizer = CountVectorizer(analyzer='word', ngram_range=(1, 1))
# The tweet column is cast to a NumPy unicode array before vectorizing.
# NOTE(review): astype('U') renders missing values as the literal string 'nan',
# which would then be counted as a token — confirm df['tweet'] has no NaNs.
BoW = bow_vectorizer.fit_transform(df['tweet'].values.astype('U'))

<b>Inspect a sample of the vocabulary learned by CountVectorizer (tokens 200–299)</b>

In [None]:
# Show vocabulary entries 200-299 of the fitted CountVectorizer.
bow_vectorizer.get_feature_names_out()[200:300]

array(['akidah', 'akma', 'akn', 'aksa', 'akses', 'aktif', 'aktifiktas',
       'akting', 'akui', 'akuin', 'akuisisi', 'akum', 'akumulasi', 'akun',
       'akunnya', 'akuntabilitas', 'akupun', 'akutuh', 'al', 'ala2',
       'aladin', 'alam', 'alamat', 'alami', 'alamin', 'alangkah', 'alap',
       'alasan', 'alasannya', 'alat', 'alesan', 'alhamdullilah',
       'alhmdllh', 'aliansi', 'alias', 'alih', 'alihkan', 'aliran',
       'alirkan', 'all', 'allah', 'allahu', 'allohuakbar', 'alm',
       'alokasikan', 'alquran', 'alternatif', 'alurnya', 'ama', 'amal',
       'aman', 'amanah', 'amanat', 'amang', 'amatiran', 'ambil',
       'ambilin', 'ambisi', 'ambisinya', 'ambruk', 'amburadul', 'amda',
       'amerika', 'amien', 'amiiiiin', 'amiin', 'amin', 'amit', 'amnesti',
       'amnesty', 'ampe', 'ampun', 'an', 'ana', 'anak', 'anak2',
       'anaknya', 'analis', 'analisanya', 'analisis', 'and', 'andalan',
       'andi', 'andil', 'andre', 'android', 'androidnya', 'ane', 'aneh',
       'anehhh', 

<b>Check BoW array shape</b>

In [None]:
# The sparse matrix exposes .shape directly, so there is no need to materialise
# a dense (documents x vocabulary) copy just to read its dimensions.
BoW.shape

(1815, 6462)

<b>Using TF-IDF feature extraction</b>

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# TF-IDF features with the vectorizer's default settings, fitted on every tweet in df.
tfidf_vectorizer = TfidfVectorizer()
# NOTE(review): astype('U') renders missing values as the literal string 'nan',
# which would become a vocabulary term — confirm df['tweet'] has no NaNs.
tfidf = tfidf_vectorizer.fit_transform(df['tweet'].values.astype('U'))
# Printing the sparse result lists "(row, column)  weight" entries for the stored values.
print(tfidf)

  (0, 4704)	0.4603507904643786
  (0, 913)	0.4603507904643786
  (0, 2118)	0.3378002068391623
  (0, 648)	0.35988253038094153
  (0, 369)	0.35988253038094153
  (0, 464)	0.25892921099071964
  (0, 1300)	0.32767231968274224
  (0, 2213)	0.16911819227906244
  (1, 3835)	0.36109428597337984
  (1, 5397)	0.34235064839105983
  (1, 3210)	0.36109428597337984
  (1, 1988)	0.36109428597337984
  (1, 1998)	0.34235064839105983
  (1, 2445)	0.09812135717764796
  (1, 5855)	0.34235064839105983
  (1, 3087)	0.34235064839105983
  (1, 504)	0.36109428597337984
  (2, 1662)	0.2941124459565659
  (2, 1707)	0.7584230340282168
  (2, 2445)	0.3458800606605086
  (2, 2213)	0.4676102588976887
  (3, 2063)	0.2327983651893471
  (3, 3248)	0.207088684600593
  (3, 6134)	0.2965140259093501
  (3, 26)	0.31848182583972473
  :	:
  (1811, 4764)	0.2646896981048109
  (1812, 3331)	0.4209368072097417
  (1812, 3600)	0.3963950604438956
  (1812, 467)	0.37106813471162764
  (1812, 407)	0.37106813471162764
  (1812, 2767)	0.25761429154141513
  (1812

<b>Check TF-IDF array shape</b>

In [None]:
# The sparse matrix exposes .shape directly, so there is no need to materialise
# a dense (documents x vocabulary) copy just to read its dimensions.
tfidf.shape

(1815, 6462)

## Splitting Data

<b>Define X and Y</b>

In [None]:
# Dense feature matrix built from the TF-IDF sparse matrix (one row per tweet).
X = tfidf.toarray()
# Target vector: the integer codes stored in df['sentimen_labels'].
y = np.array(df['sentimen_labels'])

<b>Split data into 80% training and 20% test sets</b>

In [None]:
# Hold out 20% of the rows for testing; the fixed seed keeps the partition repeatable.
# NOTE(review): X comes from a TF-IDF vectorizer that was fitted on all rows before
# this split, so its vocabulary and IDF weights already include the test tweets.
# Consider fitting the vectorizer on the training partition only.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.20,
    random_state=42,
)

In [None]:
# (rows, features) of the training partition produced by the split above.
X_train.shape

(1452, 6462)

In [None]:
# (rows, features) of the held-out test partition.
X_test.shape

(363, 6462)

## Modeling

<b>SVM Linear kernel</b>

In [None]:
from sklearn.svm import SVC

# Support-vector classifier with a linear kernel; every other hyperparameter is left at its default.
model_svm = SVC(kernel='linear')

In [None]:
# Train the SVM on the training partition only.
model_svm.fit(X_train, y_train)

In [None]:
# Predict on the same rows the SVM was fitted on.
y_pred = model_svm.predict(X_train)

# Training accuracy: an optimistic figure, since the model has already seen these rows.
train_accuracy = accuracy_score(y_train, y_pred)
train_accuracy

0.9359504132231405

<b>Random Forest</b>

In [None]:
from sklearn.ensemble import RandomForestClassifier

# Random forest with default hyperparameters. The seed fixes the bootstrap samples
# and feature sub-sampling so the reported scores are reproducible across runs;
# 42 matches the seed already used for the train/test split.
model_rf = RandomForestClassifier(random_state=42)

In [None]:
# Train the random forest on the training partition only.
model_rf.fit(X_train, y_train)

In [None]:
# Predict on the same rows the random forest was fitted on.
y_pred = model_rf.predict(X_train)

# Training accuracy: an optimistic figure, since the model has already seen these rows.
train_accuracy = accuracy_score(y_train, y_pred)
train_accuracy

0.9986225895316805

## Model Evaluation

<b>SVM Linear kernel</b>

In [None]:
# Score the SVM on the held-out test partition.
y_pred = model_svm.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
# average=None makes each scorer return one value per class instead of a single aggregate.
for caption, scorer in (
    ("Precision Score (per-class):", precision_score),
    ("Recall Score (per-class):", recall_score),
    ("F1 Score (per-class):", f1_score),
):
    print(caption, scorer(y_test, y_pred, average=None))

Accuracy Score: 0.6088154269972452
Precision Score (per-class): [0.55479452 0.63063063 0.66037736]
Recall Score (per-class): [0.70434783 0.59322034 0.53846154]
F1 Score (per-class): [0.62068966 0.61135371 0.59322034]


<b>Random Forest</b>

In [None]:
# Score the random forest on the held-out test partition.
y_pred = model_rf.predict(X_test)

print("Accuracy Score:", accuracy_score(y_test, y_pred))
# average=None makes each scorer return one value per class instead of a single aggregate.
for caption, scorer in (
    ("Precision Score (per-class):", precision_score),
    ("Recall Score (per-class):", recall_score),
    ("F1 Score (per-class):", f1_score),
):
    print(caption, scorer(y_test, y_pred, average=None))

Accuracy Score: 0.5840220385674931
Precision Score (per-class): [0.57251908 0.53571429 0.67391304]
Recall Score (per-class): [0.65217391 0.63559322 0.47692308]
F1 Score (per-class): [0.6097561  0.58139535 0.55855856]
