In [None]:
# We use several libraries here, e.g. os for handling input.
# seaborn, wordcloud and matplotlib are only used for data visualization, so
# you can skip them if you don't understand them.

import os
import pandas as pd
import numpy as np
import gc
import matplotlib.pyplot as plt
import operator
import seaborn as sns
from wordcloud import WordCloud,STOPWORDS

# re is used for cleaning the dataset

import re

import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from tensorflow import keras

# Callbacks are important here: sometimes the best accuracy is reached early and then
# goes down, so they are needed to stop the training at that point.
from sklearn.feature_extraction.text import CountVectorizer


from keras.callbacks import EarlyStopping,ModelCheckpoint
from keras.utils import to_categorical
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Embedding,Conv1D,LSTM,GRU,BatchNormalization,Flatten,Dense

In [None]:
# Mount Google Drive at /content/drive (Colab only); the data files read by the
# cells below live under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Load the review dataset from the mounted Drive folder and preview its first 10 rows.
df= pd.read_csv('/content/drive/MyDrive/DS102_Đồ án cuối kì/cr7-cr7-1-cr7-cr7-1-cr7-cr7-1-cr7-cr7-1.csv')
df.head(10)

In [None]:
# Print the DataFrame summary (columns, dtypes, non-null counts).
df.info()

In [None]:
# Count missing values per column.
df.isnull().sum()

In [None]:
# Bar chart of how many rows carry each value of the 'sentiment' column.
sns.countplot(x=df['sentiment'])
plt.grid()

In [None]:
# Keep a handle on the review text column and integer-encode the sentiment labels.
sentences=df['vi_review']
le=LabelEncoder()
# NOTE: this overwrites df['sentiment'] in place. LabelEncoder numbers the classes in
# the sorted order of the original label values; inspect le.classes_ for the mapping.
df['sentiment']= le.fit_transform(df['sentiment'])

In [None]:
# Display the encoded sentiment column.
df['sentiment']

In [None]:
# Word cloud of the reviews labelled positive.
# Stopword set shipped with the wordcloud package; reused by the word-cloud cells below.
stopwords = set(STOPWORDS)

# Concatenate all reviews of one class into a single string; every entry is converted
# with str() before joining.
# NOTE(review): assumes encoded label 2 means positive and 0 means negative — confirm
# against le.classes_.
pos=' '.join(map(str,sentences[df['sentiment']==2]))
neg=' '.join(map(str,sentences[df['sentiment']==0]))

wordcloud1 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopwords,
                min_font_size = 10).generate(pos)

plt.figure(figsize=(8,8))
plt.imshow(wordcloud1)
plt.title('Positive Sentiment')
plt.axis('off')

In [None]:
# Word cloud of the reviews labelled negative, built from the `neg` string
# assembled in the previous cell.
plt.figure(figsize=(8,8))
wordcloud2 = WordCloud(width = 800, height = 800,
                background_color ='black',
                stopwords = stopwords,
                min_font_size = 10).generate(neg)

plt.imshow(wordcloud2)
plt.title('Negative Sentiment')
plt.axis('off')

plt.show()

In [None]:
# Word cloud of the reviews labelled neutral.
# NOTE(review): assumes encoded label 1 means neutral — confirm against le.classes_.
neutral = ' '.join(map(str, sentences[df['sentiment'] == 1]))
# Reuses the name wordcloud2, replacing the negative-sentiment cloud object.
wordcloud2 = WordCloud(width=800, height=800,
                       background_color='black',
                       stopwords=stopwords,
                       min_font_size=10).generate(neutral)

plt.figure(figsize=(8, 8))
plt.imshow(wordcloud2)
plt.title('Neutral Sentiment')
plt.axis('off')

plt.show()

In [None]:
# One-hot encode the integer sentiment labels (one column per distinct label) and
# hold out 10% of the rows as the test set, with a fixed seed for reproducibility.
num_classes = df['sentiment'].nunique()
labels = to_categorical(df['sentiment'], num_classes=num_classes)
X_train,X_test,Y_train,Y_test = train_test_split(df['vi_review'],labels,test_size=0.1,random_state=10)


In [None]:
# Load the pickled embedding table from Drive; embedding_coverage later indexes it by word.
# NOTE(security): allow_pickle=True unpickles the file, which can execute arbitrary
# code — only load this file if it comes from a trusted source.
glove_embeddings= np.load('/content/drive/MyDrive/DS102_Đồ án cuối kì/glove.840B.300d.pkl',
                          allow_pickle=True)

In [None]:
def vocab_build(review):
    """Count how often each whitespace-separated token occurs in a corpus.

    Parameters
    ----------
    review : pandas.Series
        One text per entry; every entry must support ``.split()``.

    Returns
    -------
    dict
        Maps each token to its number of occurrences, keyed in order of
        first appearance.
    """
    # Tokenize every text on whitespace and expose the result as a numpy array.
    tokenized = review.apply(lambda text: text.split()).values

    counts = {}
    for tokens in tokenized:
        for token in tokens:
            counts[token] = counts.get(token, 0) + 1
    return counts


In [None]:
def embedding_coverage(review,embeddings):
    """Measure how much of a corpus is covered by an embedding lookup table.

    Parameters
    ----------
    review : pandas.Series
        Texts to analyse; passed unchanged to ``vocab_build``.
    embeddings : mapping
        Indexed by word; must raise ``KeyError`` for words it does not contain.

    Returns
    -------
    tuple
        ``(sorted_word_count, sorted_oov, vocab_coverage, text_coverage)`` where the
        first two items are lists of ``(word, count)`` pairs sorted by descending
        count (covered words and out-of-vocabulary words respectively), and the last
        two are percentages: share of distinct words covered and share of all word
        occurrences covered. Both percentages are 0.0 for an empty corpus.
    """
    vocab=vocab_build(review)

    word_count={}
    oov={}

    for word, count in vocab.items():
        try:
            embeddings[word]
        except KeyError:
            # Only a missing key means "out of vocabulary"; any other error is a
            # real problem with the embedding object and must surface.
            oov[word]=count
        else:
            word_count[word]=count

    covered_num=sum(word_count.values())
    oov_num=sum(oov.values())
    total_num=covered_num+oov_num

    # Guard the ratios so an empty corpus reports 0% instead of dividing by zero.
    vocab_coverage=len(word_count)/len(vocab)*100 if vocab else 0.0
    text_coverage=covered_num/total_num*100 if total_num else 0.0

    # Ascending stable sort followed by a reversal keeps the original tie order.
    sorted_oov=sorted(oov.items(), key=operator.itemgetter(1))[::-1]
    sorted_word_count=sorted(word_count.items(), key=operator.itemgetter(1))[::-1]

    return sorted_word_count,sorted_oov,vocab_coverage,text_coverage

In [None]:
# Measure embedding coverage on the raw (uncleaned) train and test texts and report it.
train_covered,train_oov,train_vocab_coverage,train_text_coverage=embedding_coverage(X_train,glove_embeddings)
test_covered,test_oov, test_vocab_coverage, test_text_coverage = embedding_coverage(X_test,glove_embeddings)

print(f"Glove embeddings cover {round(train_vocab_coverage,2)}% of vocabulary and {round(train_text_coverage,2)}% text in training set")
print(f"Glove embeddings cover {round(test_vocab_coverage,2)}% of vocabulary and {round(test_text_coverage,2)}% text in testing set")

In [None]:
# Show the 20 most frequent training words that have no embedding.
train_oov[:20]

In [None]:
# Install NLTK and download its stopword corpus.
# NOTE(review): nothing in the visible cells uses the downloaded stopwords — confirm
# this cell is still needed.
!pip install nltk
import nltk
nltk.download('stopwords')


In [None]:
import re
from nltk.corpus import stopwords



# Literal substitutions applied in order: stray special characters first, then
# words that were glued together in the raw text.
_LITERAL_FIXES = (
    ('\x91The', 'The'),
    ('\x97', ''),
    ('\x84The', 'The'),
    ('\uf0b7', ''),
    ('¡¨', ''),
    ('\x95', ''),
    ('\x8ei\x9eek', ''),
    ('\xad', ''),
    ('\x84bubble', 'bubble'),
    ('trivialBoring', 'trivial Boring'),
    ('Justforkix', 'Just for kix'),
    ('Nightbeast', 'Night beast'),
    ('DEATHTRAP', 'Death Trap'),
    ('CitizenX', 'Citizen X'),
    ('10Rated', '10 Rated'),
    ('_The', '_ The'),
    ('1Sound', '1 Sound'),
    ('blahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblahblah', 'blah blah'),
    ('ResidentHazard', 'Resident Hazard'),
    ('iameracing', 'i am racing'),
    ('BLACKSNAKE', 'Black Snake'),
    ('DEATHSTALKER', 'Death Stalker'),
    ('_is_', 'is'),
    ('10Fans', '10 Fans'),
    ('Yellowcoat', 'Yellow coat'),
    ('Spiderbabe', 'Spider babe'),
    ('Frightworld', 'Fright world'),
)

# Characters padded with spaces so they become standalone tokens. Each character is
# listed once; '.' is handled separately so an ellipsis survives as one token.
_PADDED_PUNCTUATION = '@#!~?+&*[]-%_:/£();$=><|{}^,' + '''"“´”'`'''


def clean_sentences(line):
    """Normalize one review so its tokens split cleanly on whitespace.

    Steps: strip HTML tags, apply the fixed list of literal clean-ups, pad
    punctuation characters with spaces, and turn any run of two or more dots
    into a single ``...`` token (a lone dot becomes a ``.`` token).

    Parameters
    ----------
    line : str
        Raw review text.

    Returns
    -------
    str
        The cleaned text; callers are expected to tokenize it with ``split()``.
    """
    line = re.sub('<.*?>', '', line)  # removing html tags

    # None of the patterns contains regex metacharacters, so plain replace is exact.
    for old, new in _LITERAL_FIXES:
        line = line.replace(old, new)

    for p in _PADDED_PUNCTUATION:
        line = line.replace(p, f' {p} ')

    # Collapse '..', '...', '....' into one ellipsis token BEFORE touching single
    # dots; padding every dot first would make the ellipsis impossible to detect.
    line = re.sub(r'\.{2,}', ' ... ', line)
    # Pad only dots that are not part of an ellipsis.
    line = re.sub(r'(?<!\.)\.(?!\.)', ' . ', line)

    return line

In [None]:
# Clean both splits, then re-measure and report the embedding coverage.
X_train = X_train.apply(clean_sentences)
X_test = X_test.apply(clean_sentences)

(train_covered,
 train_oov,
 train_vocab_coverage,
 train_text_coverage) = embedding_coverage(X_train, glove_embeddings)
print(f"Glove embeddings cover {round(train_vocab_coverage,2)}% of vocabulary and {round(train_text_coverage,2)}% text in training set")

(test_covered,
 test_oov,
 test_vocab_coverage,
 test_text_coverage) = embedding_coverage(X_test, glove_embeddings)
print(f"Glove embeddings cover {round(test_vocab_coverage,2)}% of vocabulary and {round(test_text_coverage,2)}% text in testing set")

In [None]:
# Collect the 15 most frequent covered training words, skipping punctuation tokens.
# train_covered is already sorted by descending count.
punctuations = '@#!~?+&*[]-%._-:/£();$=><|{},^' + '''"“´”'`'''
top_n = 15
train_word=[]
train_count=[]

for word,count in train_covered:
    # Stop once top_n words are collected; the previous counter started at 1 and
    # stopped after only 14 words.
    if len(train_word) == top_n:
        break
    if word not in punctuations:
        train_word.append(word)
        train_count.append(count)

In [None]:
# Collect the 15 most frequent covered test words, skipping punctuation tokens.
# test_covered is already sorted by descending count; `punctuations` comes from
# the previous cell.
top_n = 15
test_word=[]
test_count=[]

for word,count in test_covered:
    # Stop once top_n words are collected; the previous counter started at 1 and
    # stopped after only 14 words.
    if len(test_word) == top_n:
        break
    if word not in punctuations:
        test_word.append(word)
        test_count.append(count)


In [None]:
# Horizontal bar chart of the most frequent training words collected above.
plt.figure(figsize=(12,8))
sns.barplot(x=train_count,y=train_word).set_title('Count of 15 most used word in training set')
plt.grid()

In [None]:
# Horizontal bar chart of the most frequent test words collected above.
plt.figure(figsize=(12,8))
sns.barplot(x=test_count,y=test_word).set_title('Count of 15 most used word in testing set')
plt.grid()

In [None]:
# Drop the embedding table and the OOV lists and force a garbage-collection pass;
# none of them is referenced by the cells that follow in this notebook.
del glove_embeddings,train_oov,test_oov
gc.collect()

In [None]:
# Model hyperparameters:
#   num_words  - passed to Tokenizer(num_words=...) and used as the Embedding input size
#   embeddings - output dimension of the Embedding layer
num_words=80000
embeddings=256

In [None]:
# Fit the tokenizer on the training texts only; words it has not seen are mapped
# to the '<OOV>' token.
tokenizer=Tokenizer(num_words=num_words,oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)
word_index=tokenizer.word_index
# Number of distinct entries in the tokenizer's word index.
total_vocab=len(word_index)

In [None]:
# Report the size of the tokenizer's word index.
print("Vocabulary of the dataset is : ",total_vocab)

In [None]:
# Encode both splits as integer sequences with the fitted tokenizer.
sequences_train = tokenizer.texts_to_sequences(X_train)
sequences_test = tokenizer.texts_to_sequences(X_test)

# Pad every sequence to the length of the longest one found in either split.
max_len = max(
    max(map(len, sequences_train)),
    max(map(len, sequences_test)),
)

train_padded, test_padded = (
    pad_sequences(seqs, maxlen=max_len)
    for seqs in (sequences_train, sequences_test)
)

In [None]:
# Carve 5% of the padded training data off as a validation set.
# NOTE: this rebinds X_train / Y_train (X_train changes from text to padded arrays),
# so re-running this cell alone splits the already-reduced Y_train again.
X_train,X_val,Y_train,Y_val=train_test_split(train_padded,Y_train,
                                             test_size=0.05,random_state=10)

In [None]:
# Embedding -> 1-D convolution -> bidirectional LSTM -> LSTM -> dropout -> softmax
# classifier over the padded token-id sequences.
model = keras.Sequential([
    Embedding(num_words, embeddings, input_length=max_len),
    Conv1D(256, 10, activation='relu'),
    keras.layers.Bidirectional(LSTM(128, return_sequences=True)),
    LSTM(64),
    keras.layers.Dropout(0.4),
    # One output unit per sentiment class, so the output width always matches the
    # one-hot labels built with num_classes (previously hard-coded to 3).
    Dense(num_classes, activation='softmax'),
])

In [None]:
# Print the layer-by-layer summary of the model.
model.summary()

In [None]:
# Categorical cross-entropy matches the one-hot labels and the softmax output layer.
model.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy']
             )

In [None]:
# Stop training once validation accuracy has not improved for 3 epochs and roll the
# model back to its best weights, so the model used afterwards is the best one seen
# rather than the last one trained.
es= EarlyStopping(monitor='val_accuracy',
                  patience=3,
                  restore_best_weights=True
                 )

# Save the best model (by validation accuracy) to a dedicated file inside the project
# folder. The previous target was the Drive root directory itself, which is not a
# valid model file path (Keras 3 requires a `.keras` or `.h5` suffix).
checkpoints=ModelCheckpoint(filepath='/content/drive/MyDrive/DS102_Đồ án cuối kì/best_model.keras',
                            monitor="val_accuracy",
                            verbose=0,
                            save_best_only=True
                           )

callbacks=[es,checkpoints]

In [None]:
# Train for at most 5 epochs, validating on the held-out split after each epoch.
history=model.fit(X_train,Y_train,validation_data=(X_val,Y_val),epochs=5,callbacks=callbacks)

In [None]:
def plot_graph(history,string):
    """Plot the training and validation curves of one metric against epochs.

    Parameters
    ----------
    history : object
        Must expose a ``history`` mapping that contains the keys ``string`` and
        ``'val_' + string``.
    string : str
        Metric name; also used for the legend labels, y-axis label and title.
    """
    curves = (
        (string, 'training '),
        ('val_' + string, 'validation '),
    )
    for key, prefix in curves:
        plt.plot(history.history[key], label=prefix + string)

    plt.legend()
    plt.xlabel('epochs')
    plt.ylabel(string)
    plt.title(string + ' vs epochs')
    plt.show()

In [None]:
# Training vs validation loss per epoch.
plot_graph(history,'loss')

In [None]:
# Training vs validation accuracy per epoch.
plot_graph(history,'accuracy')

In [None]:
import numpy as np

# Interactive demo: classify sentences typed by the user until they enter 'exit'.
# NOTE: input() blocks, so this cell stalls a non-interactive "Run All".
while True:
    # Read a sentence from the user
    user_input = input("Nhập câu của bạn (hoặc nhập 'exit' để thoát): ")

    # Leave the loop when the user types 'exit' (case-insensitive)
    if user_input.lower() == 'exit':
        break

    # Preprocess the sentence with the same cleaning, tokenizer and padding length
    # that were applied to the training data
    cleaned_input = clean_sentences(user_input)
    tokenized_input = tokenizer.texts_to_sequences([cleaned_input])
    padded_input = pad_sequences(tokenized_input, maxlen=max_len)

    # Predict
    predictions = model.predict(padded_input)
    # NOTE(review): assumes encoded classes 0/1/2 are negative/neutral/positive —
    # confirm against le.classes_.
    sentiment_labels = ['Negative', 'Neutral', 'Positive']
    predicted_sentiment_index = np.argmax(predictions, axis=1)[0]
    predicted_sentiment = sentiment_labels[predicted_sentiment_index]

    # Print the result
    print(f"Máy dự đoán: {predicted_sentiment}")


#KNN


In [None]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score

# KNeighborsClassifier treats a 2-D one-hot target as a multilabel problem: it can
# predict all-zero rows and accuracy_score then measures exact row matches. Train
# and score on integer class ids instead (same convention as the SVM cells).
y_train_idx = np.argmax(Y_train, axis=1)
y_val_idx = np.argmax(Y_val, axis=1)

# Validation accuracy for every candidate number of neighbours k = 1..14
k_values = range(1, 15)
score_list = []

for k in k_values:
    # Fit a KNN model with k neighbours on the training split
    knn_model = KNeighborsClassifier(n_neighbors=k)
    knn_model.fit(X_train, y_train_idx)

    # Predict class ids for the validation split
    y_pred = knn_model.predict(X_val)

    # Record the accuracy for this k
    accuracy = accuracy_score(y_val_idx, y_pred)
    score_list.append(accuracy)

# Plot accuracy as a function of k
plt.plot(k_values, score_list)
plt.xlabel("Number of Neighbors (k)")
plt.ylabel("Accuracy")
plt.title("KNN Accuracy for Different Values of k")
plt.show()

In [None]:
# Final KNN model with k = 3, evaluated on the validation split.
# Fit on integer class ids: with one-hot targets KNN runs in multilabel mode and may
# predict an all-zero row, which np.argmax would silently turn into class 0.
Y_train_single_dim = np.argmax(Y_train, axis=1)
Y_val_single_dim = np.argmax(Y_val, axis=1)

knn = KNeighborsClassifier(n_neighbors=3)
knn.fit(X_train, Y_train_single_dim)

y_pred = knn.predict(X_val)
# Predictions are already class ids; the name is kept for the cells below.
y_pred_single_dim = y_pred

# Evaluate the model using accuracy_score
accuracy = accuracy_score(Y_val_single_dim, y_pred_single_dim)
print("Độ chính xác với Số láng giềng = 3: {:.2f}%".format(accuracy * 100))

In [None]:
from sklearn.metrics import confusion_matrix, accuracy_score

In [None]:
# Calculate and display the confusion matrix of the KNN predictions on the
# validation split (heatmap of true class vs predicted class counts)
conf_mat = confusion_matrix(Y_val_single_dim, y_pred_single_dim)
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("Dự đoán")
plt.ylabel("Thực tế")
plt.show()


In [None]:
from sklearn.metrics import precision_score, recall_score, f1_score, classification_report

# Compute support-weighted precision, recall and F1-score for the KNN predictions
precision = precision_score(Y_val_single_dim, y_pred_single_dim, average='weighted')
recall = recall_score(Y_val_single_dim, y_pred_single_dim, average='weighted')
f1 = f1_score(Y_val_single_dim, y_pred_single_dim, average='weighted')

# Print the metrics
print(f'Precision: {precision:.2f}')
print(f'Recall: {recall:.2f}')
print(f'F1-score: {f1:.2f}')

# Print the classification report (per-class precision, recall, F1-score and support)
print('Classification Report:')
print(classification_report(Y_val_single_dim, y_pred_single_dim))

#SVM

In [None]:
# Train an SVM on the padded sequences and report its validation accuracy.
# method_names / method_scores collect metric names and values for later use.
method_names = []
method_scores = []
from sklearn.metrics import confusion_matrix
from sklearn.svm import SVC
import matplotlib.pyplot as plt
import seaborn as sns

svm = SVC(random_state=42)
svm.fit(X_train, np.argmax(Y_train, axis=1))  # convert one-hot labels back to integer class ids
svm_score = svm.score(X_val, np.argmax(Y_val, axis=1))
print("SVM Classification Score is: {}".format(svm_score))

In [None]:
# Confusion matrix of the SVM predictions on the validation split
y_pred = svm.predict(X_val)
conf_mat = confusion_matrix(np.argmax(Y_val, axis=1), y_pred)
# Visualize the confusion matrix as an annotated heatmap
f, ax = plt.subplots(figsize=(5, 5))
sns.heatmap(conf_mat, annot=True, linewidths=0.5, linecolor="red", fmt=".0f", ax=ax)
plt.xlabel("Predicted Values")
plt.ylabel("True Values")
plt.show()

In [None]:
from sklearn.metrics import confusion_matrix, classification_report

# Build the report twice: as text for display and as a dict for metric extraction.
# Splitting the text report on whitespace is fragile and previously mislabelled the
# columns (the "weighted avg" row is ordered precision, recall, f1-score, support).
y_val_idx = np.argmax(Y_val, axis=1)
classification_rep = classification_report(y_val_idx, y_pred)
report_dict = classification_report(y_val_idx, y_pred, output_dict=True)

print("Classification Report for SVM:\n", classification_rep)

# Support-weighted averages over all classes.
weighted_avg = report_dict["weighted avg"]
# Named svm_f1 so it does not shadow sklearn's f1_score function used by earlier cells.
svm_f1 = weighted_avg["f1-score"]
recall = weighted_avg["recall"]
precision = weighted_avg["precision"]

# Append to the lists
method_names.extend(["F1 Score", "Recall", "Precision"])
method_scores.extend([svm_f1, recall, precision])