<a href="https://colab.research.google.com/github/DuongVinh2609/NLP/blob/main/NLP.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Make Google Drive visible inside the Colab runtime's filesystem.
from google.colab import drive

DRIVE_MOUNT_POINT = '/content/drive'
drive.mount(DRIVE_MOUNT_POINT)

Mounted at /content/drive


In [None]:
import pandas as pd
from sklearn.base import clone
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, TfidfTransformer
from sklearn.linear_model import Perceptron, PassiveAggressiveClassifier
from sklearn.metrics import precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB
from sklearn.neural_network import MLPClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier


# read data
# Every split directory (train / dev / test) holds three files that are read
# line by line: sents.txt (text), sentiments.txt and topics.txt (labels).
DATA_DIR = 'drive/MyDrive/UIT-VSFC/UIT-VSFC'


def _read_sentences(path):
    """Return a one-column DataFrame (column 0) holding one raw line of `path` per row.

    The file is read as plain text rather than through `read_csv(sep='.')`,
    so periods and quote characters inside a sentence are kept verbatim and
    blank lines are preserved instead of being silently skipped.
    """
    # NOTE(review): assumes the files are UTF-8 (optionally with a BOM) — confirm.
    with open(path, encoding='utf-8-sig') as handle:
        return pd.DataFrame([line.rstrip('\n') for line in handle])


def _read_labels(path):
    """Return the labels stored one per line in `path` as a flat 1-D array."""
    return pd.read_csv(path, header=None, index_col=None).values.flatten()


X_train = _read_sentences(f'{DATA_DIR}/train/sents.txt')
y1_train = _read_labels(f'{DATA_DIR}/train/sentiments.txt')
y2_train = _read_labels(f'{DATA_DIR}/train/topics.txt')

X_dev = _read_sentences(f'{DATA_DIR}/dev/sents.txt')
y1_dev = _read_labels(f'{DATA_DIR}/dev/sentiments.txt')
y2_dev = _read_labels(f'{DATA_DIR}/dev/topics.txt')

X_test = _read_sentences(f'{DATA_DIR}/test/sents.txt')
y1_test = _read_labels(f'{DATA_DIR}/test/sentiments.txt')
y2_test = _read_labels(f'{DATA_DIR}/test/topics.txt')

# Fail fast if a split's sentences and labels are not aligned row for row.
assert len(X_train) == len(y1_train) == len(y2_train), "train split is misaligned"
assert len(X_dev) == len(y1_dev) == len(y2_dev), "dev split is misaligned"
assert len(X_test) == len(y1_test) == len(y2_test), "test split is misaligned"

In [None]:
# pre processing data using tf-idf
# The first step must emit raw n-gram counts: TfidfTransformer expects a count
# matrix, and feeding it already-normalised tf-idf weights would apply the
# weighting twice (and take the sublinear log of values below 1).
pipe = Pipeline([('count', CountVectorizer(ngram_range=(1, 4))),     # using 1-gram, 2-gram, 3-gram, and 4-gram
                 ('tf_idf', TfidfTransformer(sublinear_tf=True))])   # normalized tf-idf with sublinear (1 + log) term frequency

# The vocabulary and idf weights are learned on the training split only and
# then reused for test/dev. The results are kept as sparse matrices: the
# 1-4-gram vocabulary is very large and scikit-learn linear models accept
# sparse input directly, so densifying only costs memory and training time.
X_train_encoded = pipe.fit_transform(X_train[0])
X_test_encoded = pipe.transform(X_test[0])
X_dev_encoded = pipe.transform(X_dev[0])

In [None]:
# CREATE MACHINE LEARNING MODEL

# (Modified) Perceptron
# Unfitted estimator configuration used for the classification tasks below.
model = PassiveAggressiveClassifier(C=1,  # maximum step size
                                    max_iter=1000,
                                    loss="hinge",
                                    class_weight="balanced", # balanced the weight inversely proportional to class frequencies
                                    average=True,            # use averaged weights instead of the last iterate
                                    early_stopping=False,
                                    warm_start=False,
                                    verbose=True,
                                    n_jobs=-1,               # faster computation
                                    random_state=42,         # training shuffles the data; fix the seed so runs are reproducible
                                    )

In [None]:
# sentiment analysis model using (Modified) Perceptron
# Train on an independent copy of the shared configuration: fitting `model`
# itself would let any later fit on the same object overwrite this classifier.
sentiment_model = clone(model)
sentiment_model.fit(X_train_encoded, y1_train)
y1_pred = sentiment_model.predict(X_test_encoded)

# sentiments performance
# Per-class scores are requested in the order of the classes seen during
# training, so the row indices used below stay valid even if a class is
# missing from the test labels or from the predictions.
sentiment_labels = sentiment_model.classes_

sentiment_precision_score = precision_score(y1_test, y1_pred, labels=sentiment_labels, average=None)*100
sentiment_weighted_average_precision = precision_score(y1_test, y1_pred, average="weighted")*100

sentiment_recall_score = recall_score(y1_test, y1_pred, labels=sentiment_labels, average=None)*100
sentiment_weighted_average_recall = recall_score(y1_test, y1_pred, average="weighted")*100

sentiment_f1_score = f1_score(y1_test, y1_pred, labels=sentiment_labels, average=None)*100
sentiment_weighted_average_f1_score = f1_score(y1_test, y1_pred, average="weighted")*100

print("Sentiments Performance:")
print("           precision          recall           f1 score")
# (row caption, index into the per-class score arrays)
# NOTE(review): assumes the sorted class ids map to negative, neutral, positive — confirm against the dataset.
for sentiment_caption, sentiment_idx in (("positive:", 2), ("negative:", 0), ("neutral: ", 1)):
    # float() keeps the printed numbers plain regardless of the NumPy scalar repr.
    print(sentiment_caption, [float(sentiment_precision_score[sentiment_idx]),
                              float(sentiment_recall_score[sentiment_idx]),
                              float(sentiment_f1_score[sentiment_idx])])
# weighted average
print("average: ", [float(sentiment_weighted_average_precision),
                    float(sentiment_weighted_average_recall),
                    float(sentiment_weighted_average_f1_score)])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1
-- Epoch 1
Norm: 133.86, NNZs: 68349, Bias: 5.600312, T: 11426, Avg. loss: 0.725935
Total training time: 12.19 seconds.
-- Epoch 2
Norm: 50.63, NNZs: 113113, Bias: -1.212269, T: 11426, Avg. loss: 0.390719
Total training time: 13.16 seconds.
-- Epoch 2
Norm: 171.23, NNZs: 86797, Bias: 0.236166, T: 22852, Avg. loss: 0.382403
Total training time: 22.29 seconds.
-- Epoch 3
Norm: 65.06, NNZs: 129525, Bias: -0.584494, T: 22852, Avg. loss: 0.210270
Total training time: 23.68 seconds.
-- Epoch 3
Norm: 193.08, NNZs: 93398, Bias: -0.640071, T: 34278, Avg. loss: 0.238052
Total training time: 32.04 seconds.
-- Epoch 4
Norm: 74.96, NNZs: 136980, Bias: -0.689522, T: 34278, Avg. loss: 0.141185
Total training time: 34.08 seconds.
-- Epoch 4
Norm: 208.65, NNZs: 98143, Bias: -0.204271, T: 45704, Avg. loss: 0.164157
Total training time: 41.22 seconds.
-- Epoch 5
Norm: 82.47, NNZs: 140824, Bias: -0.823403, T: 45704, Avg. loss: 0.104021
Total training time: 44.85 seconds.
-- Epoch 5
Norm: 220.89

[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  7.1min finished


Sentiments Performance:
           precision          recall           f1 score
positive: [89.98716302952504, 88.17610062893083, 89.07242693773824]
negative: [85.88628762541805, 91.12845990063875, 88.42975206611571]
neutral:  [37.16814159292036, 25.149700598802394, 30.0]
average:  [85.37601014756049, 86.16550852811118, 85.67046098931169]


In [None]:
# topics analysis model using (Modified) Perceptron
# Train on an independent copy of the shared configuration: fitting `model`
# itself would overwrite whatever was previously fitted on that same object.
topic_model = clone(model)
topic_model.fit(X_train_encoded, y2_train)
y2_pred = topic_model.predict(X_test_encoded)


# Topics performance
# Per-class scores are requested in the order of the classes seen during
# training, so the row indices used below stay valid even if a class is
# missing from the test labels or from the predictions.
topic_labels = topic_model.classes_

topics_precision_score = precision_score(y2_test, y2_pred, labels=topic_labels, average=None)*100
topics_weighted_average_precision = precision_score(y2_test, y2_pred, average="weighted")*100

topics_recall_score = recall_score(y2_test, y2_pred, labels=topic_labels, average=None)*100
topics_weighted_average_recall = recall_score(y2_test, y2_pred, average="weighted")*100

topics_f1_score = f1_score(y2_test, y2_pred, labels=topic_labels, average=None)*100
topics_weighted_average_f1_score = f1_score(y2_test, y2_pred, average="weighted")*100

print("Topics Performance")
print("             precision          recall           f1 score")
# (row caption, index into the per-class score arrays)
# NOTE(review): assumes the sorted class ids map to lecturer, curriculum, facility, others — confirm against the dataset.
for topic_caption, topic_idx in (("lecturer:  ", 0), ("curriculum:", 1), ("facility:  ", 2), ("others:    ", 3)):
    # float() keeps the printed numbers plain regardless of the NumPy scalar repr.
    print(topic_caption, [float(topics_precision_score[topic_idx]),
                          float(topics_recall_score[topic_idx]),
                          float(topics_f1_score[topic_idx])])
# weighted average
print("average:   ", [float(topics_weighted_average_precision),
                      float(topics_weighted_average_recall),
                      float(topics_weighted_average_f1_score)])

[Parallel(n_jobs=-1)]: Using backend ThreadingBackend with 2 concurrent workers.


-- Epoch 1
-- Epoch 1
Norm: 57.92, NNZs: 103395, Bias: -2.303037, T: 11426, Avg. loss: 0.404330
Total training time: 10.97 seconds.
-- Epoch 2
Norm: 37.06, NNZs: 131231, Bias: -1.134262, T: 11426, Avg. loss: 0.475677
Total training time: 11.51 seconds.
-- Epoch 2
Norm: 77.65, NNZs: 123744, Bias: -1.678129, T: 22852, Avg. loss: 0.237173
Total training time: 21.36 seconds.
-- Epoch 3
Norm: 48.98, NNZs: 145573, Bias: -0.409772, T: 22852, Avg. loss: 0.310476
Total training time: 22.45 seconds.
-- Epoch 3
Norm: 91.72, NNZs: 132250, Bias: -2.036893, T: 34278, Avg. loss: 0.158511
Total training time: 32.52 seconds.
-- Epoch 4
Norm: 59.33, NNZs: 152746, Bias: -0.786634, T: 34278, Avg. loss: 0.243575
Total training time: 34.70 seconds.
-- Epoch 4
Norm: 101.67, NNZs: 136247, Bias: -1.655470, T: 45704, Avg. loss: 0.110372
Total training time: 41.72 seconds.
-- Epoch 5
Norm: 67.30, NNZs: 156272, Bias: -0.571008, T: 45704, Avg. loss: 0.192804
Total training time: 44.01 seconds.
-- Epoch 5
Norm: 109

[Parallel(n_jobs=-1)]: Done   4 out of   4 | elapsed: 10.0min finished


Topics Performance
             precision          recall           f1 score
lecturer:   [91.33893062306673, 90.26200873362446, 90.79727652097519]
curriculum: [65.72769953051643, 73.42657342657343, 69.36416184971098]
facility:   [90.97744360902256, 83.44827586206897, 87.05035971223022]
others:     [40.458015267175576, 33.33333333333333, 36.55172413793103]
average:    [84.1399080887768, 84.04927353126975, 84.02908721013021]
