In [None]:
# import libraries
import os
import pandas as pd

# Machine Learning Approach
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import metrics

# Evaluation Model
from sklearn.metrics import classification_report

In [None]:
import glob
from google.colab import drive

drive.mount('/content/drive')

path = r'/content/drive/MyDrive/FYP/Latest_Data/'       # folder holding the labelled tweet CSV files
# os.path.join keeps the pattern OS independent. The matches are sorted because glob
# returns them in arbitrary order, and the row order decides which duplicate text is
# kept and which rows the seeded sampling picks in later cells.
all_files = sorted(glob.glob(os.path.join(path, "*.csv")))
if not all_files:
    raise FileNotFoundError(f"No CSV files found in {path!r}")

# Read each CSV lazily and stack them into a single frame with a fresh 0..n-1 index.
df_from_each_file = (pd.read_csv(f) for f in all_files)
tweet_text = pd.concat(df_from_each_file, ignore_index=True)


Mounted at /content/drive


In [None]:
# Preview the combined frame built from all CSV files.
tweet_text

Unnamed: 0.1,Unnamed: 0,text,label,Column1
0,0.0,imho tidak ltg racun racun vaksin dimas,neutral,
1,1.0,ganteng banget tolong maju gua takut vaksin,positive,
2,2.0,khawatir dg dokter risiko vaksin mempromosika,neutral,
3,3.0,anak didiagnosa sakit divaksin sinovac mahu ta...,positive,
4,4.0,setahun televisi rusia menerbitkan berita baha...,neutral,
...,...,...,...,...
46548,8757.0,vaksin prokes ketat ampuh cegah berdiri jokowi,positive,
46549,8758.0,vaksin kaya,positive,
46550,8759.0,vaksin hpv kaget beropini liat,negative,
46551,8760.0,vaksin prokes ketat ampuh cegah berdiri jokowi,positive,


In [None]:
# Keep every column after the first one, then discard 'Column1'.
tweet_text = tweet_text.iloc[:, 1:].drop(columns='Column1')

In [None]:
# Remove repeated tweets: for each distinct 'text' the first row is kept and
# every later row with the same text is dropped.
tweet_text = tweet_text.drop_duplicates(subset="text")

In [None]:
# Renumber the rows after the duplicate removal. reset_index() without drop=True
# keeps the old index as an extra 'index' column.
tweet_text = tweet_text.reset_index()

In [None]:
# Discard the 'index' column; like `del`, drop() raises KeyError if it is missing.
tweet_text = tweet_text.drop(columns='index')

In [None]:
# Inspect the frame after the column clean-up and duplicate removal.
tweet_text

Unnamed: 0,text,label
0,imho tidak ltg racun racun vaksin dimas,neutral
1,ganteng banget tolong maju gua takut vaksin,positive
2,khawatir dg dokter risiko vaksin mempromosika,neutral
3,anak didiagnosa sakit divaksin sinovac mahu ta...,positive
4,setahun televisi rusia menerbitkan berita baha...,neutral
...,...,...
26565,baro gank petojo fikir berhubungan dianggap va...,neutral
26566,vaksin tidak membatalkan puasa ramadhan,neutral
26567,didu sibuk ngejar vaksin,neutral
26568,percepatan vaksinasi bpbd cikarang ajak masyar...,neutral


In [None]:
# Number of rows per sentiment label, most frequent first.
tweet_text['label'].value_counts()

neutral     15729
positive     7304
negative     3537
Name: label, dtype: int64

In [None]:
import plotly.graph_objects as go

# Percentage of Sentiment Label: share of each label among all rows, rounded to 2 decimals
label_totals = tweet_text['label'].value_counts()
counts_label = round(label_totals * 100 / label_totals.sum(), 2)

# Same values as counts_label (already rounded); kept under its original name
# in case another cell reads it.
float_counts_label = counts_label.round(2)

# Labels in the order value_counts() returns them (most frequent first)
popular_labels_label = counts_label.index

# Plot: one colour per bar. Bars beyond the palette fall back to 'gainsboro', so a
# frame with fewer than three labels does not raise IndexError.
palette_label = ['#412525', '#714433', '#c46352']
colors_label = [
    palette_label[i] if i < len(palette_label) else 'gainsboro'
    for i in range(len(popular_labels_label))
]

fig = go.Figure(data=[go.Bar(
    y=counts_label,
    x=popular_labels_label,
    marker_color=colors_label,
    orientation='v',
    textposition='auto',
    text=counts_label
    )])

# update_layout returns the figure, so leaving it as the last expression renders the chart
fig.update_layout(title_text='Percentage of Sentiment Label (in %)',
                  yaxis_title="Percentage (%)",
                  xaxis_title="Sentiment Label")

In [None]:
# Size of the smallest sentiment class; used as the per-class sample size below.
label_counts = tweet_text['label'].value_counts()
minor = label_counts.min()
print(minor)

3537


In [None]:
# Undersample every class to `minor` rows with the same fixed seed,
# so all three sentiments end up equally represented.
neu_tweet_text, pos_tweet_text, neg_tweet_text = (
    tweet_text.loc[tweet_text['label'] == sentiment].sample(n=minor, random_state=62)
    for sentiment in ('neutral', 'positive', 'negative')
)

In [None]:
# Stack the three samples (negative, positive, neutral) and give the result
# a fresh 0..n-1 index in one step, without a temporary 'index' column.
frames = [neg_tweet_text, pos_tweet_text, neu_tweet_text]
tweet_text = pd.concat(frames, ignore_index=True)

In [None]:
# Inspect the balanced frame (negative rows first, then positive, then neutral).
tweet_text

Unnamed: 0,text,label
0,gila kaya antrian vaksin,negative
1,buset jarum vaksin pakai paku bumi tangan kya ...,negative
2,lisaamartatara jokowi presiden anies jongos jo...,negative
3,lisaamartatara diras disuntik vaksin palsu,negative
4,sertifikat vaksin astaga malas bangat dinkes p...,negative
...,...,...
10606,vaksin moderna kabarin jn jerman,neutral
10607,fkm ui keamanan vaksin jokowi majukan bangsa,neutral
10608,kameng engga ko minimal vaksin,neutral
10609,kayak ya hubungin vaksin,neutral


In [None]:
# Rows per sentiment label after undersampling; every class should have the same count.
tweet_text['label'].value_counts()

negative    3537
positive    3537
neutral     3537
Name: label, dtype: int64

In [None]:
# Percentage of Sentiment Label: share of each label among all rows, rounded to 2 decimals
label_totals = tweet_text['label'].value_counts()
counts_label = round(label_totals * 100 / label_totals.sum(), 2)

# Same values as counts_label (already rounded); kept under its original name
# in case another cell reads it.
float_counts_label = counts_label.round(2)

# Labels in the order value_counts() returns them
popular_labels_label = counts_label.index

# Plot: one colour per bar. Bars beyond the palette fall back to 'gainsboro', so a
# frame with fewer than three labels does not raise IndexError.
palette_label = ['#412525', '#714433', '#c46352']
colors_label = [
    palette_label[i] if i < len(palette_label) else 'gainsboro'
    for i in range(len(popular_labels_label))
]

fig = go.Figure(data=[go.Bar(
    y=counts_label,
    x=popular_labels_label,
    marker_color=colors_label,
    orientation='v',
    textposition='auto',
    text=counts_label
    )])

# update_layout returns the figure, so leaving it as the last expression renders the chart
fig.update_layout(title_text='Percentage of Sentiment Label (in %)',
                  yaxis_title="Percentage (%)",
                  xaxis_title="Sentiment Label")

In [None]:
import pickle

#feature extraction
#field = TEXT column
def extract_features(df, field, training_data, testing_data, vectorizer_path="tfidf1.pkl"):
    """Build TF-IDF features for the train and test splits.

    The vectorizer is fitted on the training split only; the test split is
    transformed with that fitted vectorizer and never influences the fit.

    Args:
        df: Not used by this function; kept so existing positional calls still work.
        field: Name of the column that holds the text.
        training_data: DataFrame with the training rows.
        testing_data: DataFrame with the test rows.
        vectorizer_path: File the fitted vectorizer is pickled to
            (default "tfidf1.pkl").

    Returns:
        Tuple ``(train_feature_set, test_feature_set, tfidf_vectorizer)``: the
        transformed training and test feature matrices and the fitted vectorizer.
    """
    # TF-IDF BASED FEATURE REPRESENTATION
    # max_df=0.95 ignores terms that occur in more than 95% of the training documents.
    tfidf_vectorizer = TfidfVectorizer(use_idf=True, max_df=0.95)
    # Fit and transform in one pass rather than fitting, throwing the result away
    # and transforming the same training texts a second time.
    train_feature_set = tfidf_vectorizer.fit_transform(training_data[field].values)
    # Dump the fitted vectorizer; the context manager closes the file handle.
    with open(vectorizer_path, "wb") as vectorizer_file:
        pickle.dump(tfidf_vectorizer, vectorizer_file)
    test_feature_set = tfidf_vectorizer.transform(testing_data[field].values)
    return train_feature_set, test_feature_set, tfidf_vectorizer

In [None]:
#create features
#field        - name of the column that contains the tweet text
#feature_rep  - label for the feature representation; not read by the code in this cell
field = 'text'
feature_rep = 'tf'
# Train/test split with a fixed seed for consistent results. test_size=0.25 spells
# out the 25% hold-out that train_test_split applies when no size is given.
training_data, testing_data = train_test_split(
    tweet_text, test_size=0.25, random_state=2000
)
# Features: TF-IDF matrices for both splits plus the fitted vectorizer
X_train, X_test, feature_transformer = extract_features(
    tweet_text, field, training_data, testing_data
)
# Labels, in the same row order as the corresponding split
Y_train = training_data['label'].values
Y_test = testing_data['label'].values

Use a classifier for the classification task

In [None]:
#build the classifier model - SVM
from sklearn import svm

# SVC is created with library defaults: no kernel, C, degree or gamma is passed here.
# NOTE(review): current scikit-learn defaults to an RBF kernel - confirm for the installed version.
SVM = svm.SVC().fit(X_train, Y_train)

predictions_SVM = SVM.predict(X_test)

# Fraction of correctly classified test rows, and the same figure as a percentage
accuracy_svm = metrics.accuracy_score(Y_test, predictions_SVM)
accuracy_svm_per = accuracy_svm * 100

print("SVM Accuracy with TFIDF:", accuracy_svm)
print("Percentage of SVM Accuracy with TFIDF:", accuracy_svm_per, "%")
print("\n")
# Per-class precision / recall / F1 on the test split
print(classification_report(Y_test, predictions_SVM))

SVM Accuracy with TFIDF: 0.7723332076894082
Percentage of SVM Accuracy with TFIDF: 77.23332076894081 %


              precision    recall  f1-score   support

    negative       0.81      0.75      0.78       862
     neutral       0.69      0.87      0.77       902
    positive       0.86      0.69      0.77       889

    accuracy                           0.77      2653
   macro avg       0.79      0.77      0.77      2653
weighted avg       0.79      0.77      0.77      2653



In [None]:
#build the classifier model - LR
from sklearn.linear_model import LogisticRegression

# Logistic regression with library defaults (no random_state is set)
LR = LogisticRegression().fit(X_train, Y_train)

predictions_LR = LR.predict(X_test)

# Fraction of correctly classified test rows, and the same figure as a percentage
accuracy_lr = metrics.accuracy_score(Y_test, predictions_LR)
accuracy_lr_per = accuracy_lr * 100

print("LR Accuracy with TFIDF:", accuracy_lr)
print("Percentage of LR Accuracy with TFIDF:", accuracy_lr_per, "%")
print("\n")
# Per-class precision / recall / F1 on the test split
print(classification_report(Y_test, predictions_LR))

LR Accuracy with TFIDF: 0.7689408217112702
Percentage of LR Accuracy with TFIDF: 76.89408217112702 %


              precision    recall  f1-score   support

    negative       0.80      0.76      0.78       862
     neutral       0.70      0.83      0.76       902
    positive       0.83      0.71      0.77       889

    accuracy                           0.77      2653
   macro avg       0.78      0.77      0.77      2653
weighted avg       0.78      0.77      0.77      2653



In [None]:
#build the classifier model - NB
from sklearn.naive_bayes import GaussianNB

# GaussianNB is fed dense arrays here, so both TF-IDF matrices are densified first.
# NOTE(review): densifying the whole vocabulary is memory hungry; MultinomialNB may
# suit TF-IDF features better - confirm before changing the reported results.
X_train_nb, X_test_nb = X_train.toarray(), X_test.toarray()

NB = GaussianNB().fit(X_train_nb, Y_train)

predictions_NB = NB.predict(X_test_nb)

# Fraction of correctly classified test rows, and the same figure as a percentage
accuracy_nb = metrics.accuracy_score(Y_test, predictions_NB)
accuracy_nb_per = accuracy_nb * 100

print("NB Accuracy with TFIDF:", accuracy_nb)
print("Percentage of NB Accuracy with TFIDF:", accuracy_nb_per, "%")
print("\n")
# Per-class precision / recall / F1 on the test split
print(classification_report(Y_test, predictions_NB))

NB Accuracy with TFIDF: 0.4519411986430456
Percentage of NB Accuracy with TFIDF: 45.19411986430456 %


              precision    recall  f1-score   support

    negative       0.55      0.38      0.45       862
     neutral       0.49      0.25      0.33       902
    positive       0.40      0.73      0.52       889

    accuracy                           0.45      2653
   macro avg       0.48      0.45      0.43      2653
weighted avg       0.48      0.45      0.43      2653



In [None]:
#build the classifier model - KNN
from sklearn.neighbors import KNeighborsClassifier

# k-nearest neighbours with library defaults, trained directly on the TF-IDF matrices
KNN = KNeighborsClassifier().fit(X_train, Y_train)

predictions_KNN = KNN.predict(X_test)

# Fraction of correctly classified test rows, and the same figure as a percentage
accuracy_knn = metrics.accuracy_score(Y_test, predictions_KNN)
accuracy_knn_per = accuracy_knn * 100

print("KNN Accuracy with TFIDF:", accuracy_knn)
print("Percentage of KNN Accuracy with TFIDF:", accuracy_knn_per, "%")
print("\n")
# Per-class precision / recall / F1 on the test split
print(classification_report(Y_test, predictions_KNN))

KNN Accuracy with TFIDF: 0.5917828872973991
Percentage of KNN Accuracy with TFIDF: 59.17828872973991 %


              precision    recall  f1-score   support

    negative       0.59      0.66      0.63       862
     neutral       0.53      0.58      0.55       902
    positive       0.68      0.54      0.60       889

    accuracy                           0.59      2653
   macro avg       0.60      0.59      0.59      2653
weighted avg       0.60      0.59      0.59      2653



In [None]:
# One [classifier name, accuracy in %] row per trained model
data = [
    ['LR', accuracy_lr_per],
    ['SVM', accuracy_svm_per],
    ['NB', accuracy_nb_per],
    ['KNN', accuracy_knn_per],
]

# Comparison table, best accuracy first
df = (
    pd.DataFrame(data, columns=['Classifier', 'Accuracy (%)'])
    .sort_values(by='Accuracy (%)', ascending=False)
)

# Bar heights: accuracy rounded to 2 decimals
counts_label = df['Accuracy (%)'].round(2)

float_counts_label = counts_label

# Bar labels: classifier names in the sorted order
popular_labels_label = df['Classifier']

# Plot: one shade of green per classifier, darkest for the best model
colors_label = ['#1b4332', '#40916c', '#74c69d', '#95d5b2']

accuracy_bars = go.Bar(
    y=counts_label,
    x=popular_labels_label,
    marker_color=colors_label,
    orientation='v',
    textposition='auto',
    text=counts_label,
)
fig = go.Figure(data=[accuracy_bars])

# update_layout returns the figure, so leaving it as the last expression renders the chart
fig.update_layout(title_text='Percentage of Classifier Model (in %)',
                  yaxis_title="Percentage of Accuracy (%)",
                  xaxis_title="Classifier Model")

Save SVM model

In [None]:
# save the model to disk
filename = 'svm_finalized_model.sav'
# The context manager closes the file handle once the pickle has been written.
with open(filename, 'wb') as model_file:
    pickle.dump(SVM, model_file)