# Sentiment Analysis

## Load model


### Download and model configuration

In [None]:
# Fetch the pretrained artifacts from Google Drive (addressed by file ID) into
# the current working directory. Later cells read them by these exact names:
#   model.h5                 -> passed to load_model
#   my_word2vec_model.model  -> passed to Word2Vec.load
#   encoder.pkl              -> unpickled as the label encoder
#   tokenizer.pkl            -> unpickled as the tokenizer
# NOTE(review): newer gdown releases may report `--id` as deprecated - confirm
# against the installed gdown version.
!gdown --id 1wmF255Z6bLssLdLWDz8bpiJgVpfbfLu1 --output model.h5
!gdown --id 1LXnz3ZA_OBee3NX5pIHhXQ9Cfmmjdh0A --output my_word2vec_model.model
!gdown --id 16GFdmIELoIXtnenB0tIlamX0x-putZuu --output encoder.pkl
!gdown --id 11HRLfRX3CEuF2Iuxke0GkAPpZgXHUWzC --output tokenizer.pkl

Downloading...
From: https://drive.google.com/uc?id=1wmF255Z6bLssLdLWDz8bpiJgVpfbfLu1
To: /content/model.h5
100% 350M/350M [00:01<00:00, 220MB/s]
Downloading...
From: https://drive.google.com/uc?id=1LXnz3ZA_OBee3NX5pIHhXQ9Cfmmjdh0A
To: /content/my_word2vec_model.model
100% 73.8M/73.8M [00:00<00:00, 271MB/s]
Downloading...
From: https://drive.google.com/uc?id=16GFdmIELoIXtnenB0tIlamX0x-putZuu
To: /content/encoder.pkl
100% 714/714 [00:00<00:00, 4.78MB/s]
Downloading...
From: https://drive.google.com/uc?id=11HRLfRX3CEuF2Iuxke0GkAPpZgXHUWzC
To: /content/tokenizer.pkl
100% 22.6M/22.6M [00:00<00:00, 173MB/s]


### Import packages


In [None]:
# DataFrame
import pandas as pd

# Matplot
import matplotlib.pyplot as plt
%matplotlib inline

# Scikit-learn
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.manifold import TSNE
from sklearn.feature_extraction.text import TfidfVectorizer

# Keras
from keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
# from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Activation, Dense, Dropout, Embedding, Flatten, Conv1D, MaxPooling1D, LSTM
from keras import utils
from keras.callbacks import ReduceLROnPlateau, EarlyStopping
from keras.models import load_model

# nltk
import nltk
from nltk.corpus import stopwords
from  nltk.stem import SnowballStemmer

# Word2vec
import gensim

# Utility
import re
import numpy as np
import os
from collections import Counter
import logging
import time
import pickle
import itertools

# Set log
# Configure the root logger: INFO level, each record prefixed with a timestamp
# and its level name.
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
# Make sure the NLTK "stopwords" package is available locally (downloads it
# when missing). As the last expression of the cell, its return value is what
# the notebook displays.
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

### Settings

In [None]:
# Run-mode switches.
# Change TRAIN to True to retrain the model; leave it False to load the
# pretrained model instead.
TRAIN = False
EVALUATE = False

# DATASET
DATASET_COLUMNS = ["target", "ids", "date", "flag", "user", "text"]
DATASET_ENCODING = "ISO-8859-1"
TRAIN_SIZE = 0.8

# TEXT CLEANING
# Raw string: `\S` must reach the regex engine untouched. In a normal string
# literal it is an invalid escape sequence, which current Python versions warn
# about. The pattern text itself is unchanged.
TEXT_CLEANING_RE = r"@\S+|https?:\S+|http?:\S|[^A-Za-z0-9]+"

# WORD2VEC
W2V_SIZE = 300
W2V_WINDOW = 7
W2V_EPOCH = 32
W2V_MIN_COUNT = 10

# KERAS
SEQUENCE_LENGTH = 300
EPOCHS = 20
BATCH_SIZE = 1024

# SENTIMENT
# These label strings are emitted into the results, so their exact spelling is
# part of the output and is deliberately left untouched.
POSITIVE = "POSITIVE"
EXE_NEGATIVE = "EXETREAMLY NEGATIVE"
NEGATIVE = "NEGATIVE"
NEUTRAL = "NEUTRAL"
# (lower cut-off, upper cut-off) applied to the model score.
SENTIMENT_THRESHOLDS = (0.15, 0.5)

# EXPORT
KERAS_MODEL = "model.h5"
# NOTE(review): the model-loading cell reads "my_word2vec_model.model", not this
# name - confirm which file name is intended.
WORD2VEC_MODEL = "model.w2v"
TOKENIZER_MODEL = "tokenizer.pkl"
ENCODER_MODEL = "encoder.pkl"


### model


In [None]:
# Word2Vec word embedding
# The literal file name is kept on purpose: WORD2VEC_MODEL in the settings cell
# is "model.w2v", which is not the name the download cell writes.
w2v_model = gensim.models.word2vec.Word2Vec.load("my_word2vec_model.model")

# Restore the pickled tokenizer.
# SECURITY: pickle.load can execute arbitrary code contained in the file; only
# load artifacts that come from a source you trust.
with open(TOKENIZER_MODEL, 'rb') as f:
    tokenizer = pickle.load(f)
# +1 because the embedding matrix below is indexed directly by word index and
# therefore needs one row more than the number of words.
vocab_size = len(tokenizer.word_index) + 1
print("Total words", vocab_size)

# Restore the pickled label encoder (same pickle caveat as above).
with open(ENCODER_MODEL, 'rb') as f:
    encoder = pickle.load(f)

# Embedding matrix: row i holds the Word2Vec vector of the word whose tokenizer
# index is i; words missing from the Word2Vec vocabulary keep an all-zero row.
embedding_matrix = np.zeros((vocab_size, W2V_SIZE))
for word, i in tokenizer.word_index.items():
    if word in w2v_model.wv:
        embedding_matrix[i] = w2v_model.wv[word]

print(embedding_matrix.shape)
# NOTE(review): no later cell shown here references embedding_layer; the network
# used for prediction is the one restored by load_model below.
embedding_layer = Embedding(vocab_size, W2V_SIZE, weights=[embedding_matrix], input_length=SEQUENCE_LENGTH, trainable=False)

# Pretrained Keras model used by predict().
model = load_model(KERAS_MODEL)

Total words 290419




(290419, 300)


### Predict

In [None]:
def decode_sentiment(score):
    """Map a model score onto one of the sentiment label constants.

    Scores at or below the first entry of SENTIMENT_THRESHOLDS yield
    EXE_NEGATIVE, scores at or above the second entry yield NEUTRAL, and
    anything else yields NEGATIVE.
    """
    lower_cut = SENTIMENT_THRESHOLDS[0]
    if score <= lower_cut:
        return EXE_NEGATIVE

    upper_cut = SENTIMENT_THRESHOLDS[1]
    if score >= upper_cut:
        return NEUTRAL

    # Neither cut-off reached: the score lies strictly between them.
    return NEGATIVE
  

def predict(text):
    """Score one piece of text with the loaded model and label the result.

    Args:
        text: The text to classify.

    Returns:
        dict: ``{"label": <value returned by decode_sentiment>,
        "score": <model score as a Python float>}``.
    """
    # Tokenize the text and pad it to the fixed sequence length.
    x_test = pad_sequences(tokenizer.texts_to_sequences([text]), maxlen=SEQUENCE_LENGTH)
    # Predict. The model returns an array for the one-item batch; pull the
    # scalar out explicitly, because calling float() on a 1-element array is
    # deprecated in recent NumPy releases.
    # Assumes the model emits a single score per input - TODO confirm.
    score = float(np.ravel(model.predict([x_test]))[0])
    # Decode sentiment
    label = decode_sentiment(score)

    return {"label": label, "score": score}

#### Translation helper (Chinese → English) used before prediction


In [44]:
import requests
import hashlib
import json
import os
import random

url = 'https://fanyi-api.baidu.com/api/trans/vip/translate'

# Credentials come from the environment so they are never stored in the
# notebook. Set BAIDU_APP_ID and BAIDU_APP_SECRET before calling translate(),
# e.g. os.environ['BAIDU_APP_SECRET'] = getpass.getpass().
# NOTE(review): the key pair that used to be hard-coded here has been exposed
# and should be revoked and replaced.
app_id = os.environ.get('BAIDU_APP_ID', '')
app_secret = os.environ.get('BAIDU_APP_SECRET', '')

# Upper bound (seconds) for one translation request, so that a stalled
# connection cannot hang the notebook forever.
REQUEST_TIMEOUT_SECONDS = 10


def translate(text):
    """Translate Chinese text to English through the translation endpoint.

    Args:
        text: Text to translate. ``None`` and blank strings are returned as
            ``''`` without contacting the service; other values are converted
            with ``str()``.

    Returns:
        str: The translated text. When the service answers with several
        result segments, they are joined with newlines.

    Raises:
        RuntimeError: If credentials are missing or the service response
            contains no translation.
        requests.RequestException: On network failures or HTTP error codes.
    """
    if text is None or not str(text).strip():
        return ''
    text = str(text)

    # Look the credentials up at call time as well, so that setting the
    # environment variables after this cell has run still works.
    appid = app_id or os.environ.get('BAIDU_APP_ID', '')
    secret = app_secret or os.environ.get('BAIDU_APP_SECRET', '')
    if not appid or not secret:
        raise RuntimeError(
            'Baidu translate credentials are missing: set the BAIDU_APP_ID '
            'and BAIDU_APP_SECRET environment variables.'
        )

    # Compute the signature: MD5 over app id + query + salt + secret.
    # A fresh salt is drawn for every request.
    salt = str(random.randint(32768, 65536))
    sign = hashlib.md5((appid + text + salt + secret).encode()).hexdigest()

    # Send the POST request
    data = {
        'q': text,
        'from': 'zh',
        'to': 'en',
        'appid': appid,
        'salt': salt,
        'sign': sign
    }
    response = requests.post(url, data=data, timeout=REQUEST_TIMEOUT_SECONDS)
    response.raise_for_status()

    # Parse the response
    result = json.loads(response.text)
    segments = result.get('trans_result')
    if not segments:
        # Surface the service's own error fields instead of a bare KeyError.
        raise RuntimeError(
            'Baidu translate failed: error_code={} error_msg={}'.format(
                result.get('error_code'), result.get('error_msg')
            )
        )

    # Keep every returned segment rather than only the first, so multi-line
    # input is not truncated.
    return '\n'.join(segment['dst'] for segment in segments)




## 请上传excel表格

In [None]:
from google.colab import files

# Choose the file to upload.
uploaded = files.upload()

# Rename the uploaded file to the fixed name the scoring cell reads.
import os

if not uploaded:
    # Nothing was uploaded; fail with a clear message instead of the
    # IndexError that indexing an empty key list would raise.
    raise RuntimeError("No file was uploaded; re-run this cell and choose an Excel workbook.")

# Only the first uploaded file is used.
uploaded_name = next(iter(uploaded))
os.rename(uploaded_name, "complaint.xlsx")

Saving comment.xlsx to comment.xlsx


In [None]:
import xlrd
import openpyxl
import time

# Pause after each scored row; one translation request is made per row.
REQUEST_INTERVAL_SECONDS = 1

# Open the uploaded workbook.
old_wb = openpyxl.load_workbook('complaint.xlsx')
old_sheet = old_wb.active

# Create the output workbook.
new_wb = openpyxl.Workbook()
new_sheet = new_wb.active

# Header row of the output sheet.
for column_index, title in enumerate(('canteen', 'stall', 'comment', 'label', 'score_value'), start=1):
    new_sheet.cell(row=1, column=column_index, value=title)

# Copy the data across row by row. Row 1 of the input is treated as its header
# and skipped, so data starts at row 2 and keeps its row number in the output.
for row_index in range(2, old_sheet.max_row + 1):
    # Read the first two columns and the complaint text.
    col1_value = old_sheet.cell(row_index, 1).value
    col2_value = old_sheet.cell(row_index, 2).value
    complaint = old_sheet.cell(row_index, 3).value

    new_sheet.cell(row=row_index, column=1, value=col1_value)
    new_sheet.cell(row=row_index, column=2, value=col2_value)
    new_sheet.cell(row=row_index, column=3, value=complaint)

    # Rows without complaint text (e.g. trailing blank rows counted by
    # max_row) have nothing to translate or score; leave label/score empty.
    if complaint is None or not str(complaint).strip():
        continue

    # Translate, then predict the score.
    print(complaint)
    eng_complaint = translate(str(complaint))
    prediction = predict(eng_complaint)
    print(prediction)

    label = prediction['label']
    score_value = prediction['score']

    new_sheet.cell(row=row_index, column=4, value=label)
    new_sheet.cell(row=row_index, column=5, value=score_value)
    time.sleep(REQUEST_INTERVAL_SECONDS)

# Save the new workbook and offer it for download.
new_wb.save('complaint_score.xlsx')
files.download('complaint_score.xlsx')
