In [None]:
import numpy as np
import pandas as pd

https://www.kaggle.com/datasets/ankitkumar2635/sentiment-and-emotions-of-tweets

Датасет содержит анализ упоминаний компании Dell в твитах.
Задача: создать приложение, которое будет анализировать сентимент высказывания, т.е. определять, является ли отзыв негативным, нейтральным или позитивным.
Для этого обучим нейросеть, написанную на Keras, на имеющемся датасете и подключим её к Flask.

**Готовим датасет**

In [None]:
from google.colab import drive
drive.mount('/content/drive/')

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


In [None]:
%cd /content/drive/My Drive/texts_for_net/

/content/drive/My Drive/texts_for_net


In [None]:
dataset = pd.read_csv('sentiment-emotion-labelled_Dell_tweets.csv')

In [None]:
dataset.head()

Unnamed: 0.1,Unnamed: 0,Datetime,Tweet Id,Text,Username,sentiment,sentiment_score,emotion,emotion_score
0,0,2022-09-30 23:29:15+00:00,1575991191170342912,@Logitech @apple @Google @Microsoft @Dell @Len...,ManjuSreedaran,neutral,0.853283,anticipation,0.587121
1,1,2022-09-30 21:46:35+00:00,1575965354425131008,@MK_habit_addict @official_stier @MortalKombat...,MiKeMcDnet,neutral,0.51947,joy,0.886913
2,2,2022-09-30 21:18:02+00:00,1575958171423752203,"As @CRN celebrates its 40th anniversary, Bob F...",jfollett,positive,0.763791,joy,0.960347
3,3,2022-09-30 20:05:24+00:00,1575939891485032450,@dell your customer service is horrible especi...,daveccarr,negative,0.954023,anger,0.983203
4,4,2022-09-30 20:03:17+00:00,1575939359160750080,@zacokalo @Dell @DellCares @Dell give the man ...,heycamella,neutral,0.52917,anger,0.776124


Выберем основную информацию

In [None]:
# Keep only the tweet text and its sentiment label. `.copy()` makes this an
# independent frame, so later column assignments on it do not raise
# SettingWithCopyWarning.
dataset = dataset[['Text', 'sentiment']].copy()

In [None]:
dataset.head(5)

Unnamed: 0,Text,sentiment
0,@Logitech @apple @Google @Microsoft @Dell @Len...,neutral
1,@MK_habit_addict @official_stier @MortalKombat...,neutral
2,"As @CRN celebrates its 40th anniversary, Bob F...",positive
3,@dell your customer service is horrible especi...,negative
4,@zacokalo @Dell @DellCares @Dell give the man ...,neutral


In [None]:
dataset['sentiment'].value_counts() # check the class balance (the original author judged the dataset to be reasonably balanced)

negative    10556
positive     7366
neutral      7048
Name: sentiment, dtype: int64

In [None]:
# Encode the string labels as integer class ids (negative=0, neutral=1,
# positive=2). `assign` builds a new frame instead of writing into what may be
# a slice of another frame, so no SettingWithCopyWarning is emitted.
dataset = dataset.assign(
    sentiment=dataset['sentiment'].map({'negative' : 0, 'neutral': 1, 'positive': 2})
)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataset['sentiment'] = dataset['sentiment'].map({'negative' : 0, 'neutral': 1, 'positive': 2})


**Разделим на train и test**

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# 70/30 split with a fixed seed. Stratifying on the label keeps the class
# proportions equal in both parts (the three classes are not equally frequent).
X_train, X_test, y_train, y_test = train_test_split(dataset['Text'], dataset['sentiment'],
                                                    train_size=0.7,
                                                    random_state=1,
                                                    stratify=dataset['sentiment'])

Осуществим **предобработку текста**

In [None]:
from string import punctuation
import re

In [None]:
pip install pymorphy2



In [None]:
from pymorphy2 import MorphAnalyzer

In [None]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [None]:
from nltk.corpus import stopwords

In [None]:
# Shared resources for text preprocessing.
# A set gives a constant-time membership test per word instead of a scan over
# the whole stop-word list.
sw = set(stopwords.words('english'))
exclude = set(punctuation)
# NOTE(review): pymorphy2 is, as far as I know, a Russian/Ukrainian
# morphological analyser - confirm it does anything useful on English tweets.
morpher = MorphAnalyzer()

In [None]:
def preprocess_text(txt):
    """Normalise one raw text into a space-separated string of tokens.

    Steps, in order: coerce the input to ``str``, delete every character found
    in ``exclude``, lowercase, replace each remaining non-letter with a space,
    drop words present in ``sw``, and take ``normal_form`` of the first
    ``morpher.parse`` result for every word that is left.

    NOTE(review): punctuation is deleted (not replaced by a space) before the
    regex runs, so pieces joined by punctuation are glued into one token.
    """
    without_punctuation = "".join(ch for ch in str(txt) if ch not in exclude)
    letters_only = re.sub("[^a-zA-Z]", " ", without_punctuation.lower())

    normalized_words = []
    for token in letters_only.split():
        if token in sw:
            continue
        normalized_words.append(morpher.parse(token)[0].normal_form)
    return " ".join(normalized_words)

In [None]:
X_train = X_train.apply(preprocess_text)
X_test = X_test.apply(preprocess_text)

Создаем **словарь токенов на основе обучающего датасета**

In [None]:
# Concatenate all preprocessed training texts into one lowercase corpus string.
train_corpus = " ".join(X_train).lower()

In [None]:
from nltk.tokenize import word_tokenize
nltk.download("punkt")

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [None]:
tokens = word_tokenize(train_corpus) # tokenize the words of the training corpus

In [None]:
tokens_filtered = [word for word in tokens if word.isalnum()]

In [None]:
max_words = 5000 # maximum number of entries in the token vocabulary
max_len = 50 # maximum length of one text, in tokens

In [None]:
from nltk.probability import FreqDist
dist = FreqDist(tokens_filtered)
tokens_filtered_top = [pair[0] for pair in dist.most_common(max_words-1)]

In [None]:
tokens_filtered_top[:10] #самые популярные слова

['dell',
 'laptop',
 'dellcares',
 'service',
 'michaeldell',
 'new',
 'httpstco',
 'one',
 'get',
 'amp']

In [None]:
# token -> integer id, numbered from 1 so that 0 remains free for padding
vocabulary = {token: index for index, token in enumerate(tokens_filtered_top, 1)}

Функция **преобразования высказываний в последовательность токенов, читаемую нейросетью**

In [None]:
def text_to_sequence(text, maxlen):
    """Convert a text into a left-padded list of vocabulary ids.

    The text is lowercased and tokenized; tokens that are not alphanumeric or
    are missing from ``vocabulary`` are skipped. If fewer than ``maxlen`` ids
    remain, zeros are prepended; if more remain, only the last ``maxlen`` ids
    are kept.
    """
    token_ids = [
        vocabulary[token]
        for token in word_tokenize(text.lower())
        if token.isalnum() and token in vocabulary
    ]
    # A negative pad width multiplies to an empty list, so long inputs get no padding.
    pad_width = maxlen - len(token_ids)
    return [0] * pad_width + token_ids[-maxlen:]

Превращаем датасет в последовательности токенов

In [None]:
X_train = np.asarray([text_to_sequence(text, max_len) for text in X_train], dtype=np.int32)
X_test = np.asarray([text_to_sequence(text, max_len) for text in X_test], dtype=np.int32)

Посмотрим, как выглядит высказывание (использовали 0 в качестве паддинга)

In [None]:
X_train[1]

array([   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    1, 3875,   69,   37], dtype=int32)

Создадим **нейросеть**

In [None]:
import keras
from keras.models import Sequential, Model
from keras.layers import Dense, Dropout, Activation, Input, Embedding, Conv1D, GlobalMaxPool1D, MaxPooling1D, Flatten, SimpleRNN, LSTM, GRU, Masking
from keras.callbacks import TensorBoard
from keras.callbacks import EarlyStopping
from keras.utils import pad_sequences
from keras.losses import categorical_crossentropy

In [None]:
# One-hot encode the integer labels for the categorical cross-entropy loss.
num_classes = 3 # three sentiment classes are recognised
y_train = keras.utils.to_categorical(y_train, num_classes)
y_test = keras.utils.to_categorical(y_test, num_classes)

In [None]:
epochs = 20
batch_size = 512
print_batch_n = 100

In [None]:
random_state = 1
# Actually apply the seed (Python, NumPy and the Keras backend) so that weight
# initialisation is repeatable; previously `random_state` was assigned but
# never used.
keras.utils.set_random_seed(random_state)

# Embedding -> 1D convolution -> global max pooling -> small dense classifier
# ending in a softmax over the sentiment classes.
model_1 = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
    Conv1D(15, 3),
    Activation("relu"),
    GlobalMaxPool1D(),
    Dense(9),
    Activation("relu"),
    Dense(num_classes),
    Activation('softmax'),
])

In [None]:
model_1.compile(loss='categorical_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [None]:
tensorboard=TensorBoard(log_dir='./logs', write_graph=True, write_images=True)
# Training stops once val_loss fails to improve. restore_best_weights=True
# rolls the model back to the best epoch instead of keeping the weights of the
# last (worse) epoch, which is what gets evaluated and saved afterwards.
early_stopping=EarlyStopping(monitor='val_loss', restore_best_weights=True)


# The last 10% of the training data is held out for validation.
history_1 = model_1.fit(X_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    verbose=1,
                    validation_split=0.1,
                    callbacks=[tensorboard, early_stopping])

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20


In [None]:
score_1 = model_1.evaluate(X_test, y_test, batch_size=batch_size, verbose=1)
print('\n')
print('Test score:', score_1[0])
print('Test accuracy:', score_1[1])



Test score: 0.5972892642021179
Test accuracy: 0.758243203163147


In [None]:
model_1.save('model_1') # save the model



Создаем приложение Flask

In [None]:
!pip install flask-ngrok



In [None]:
!wget https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
!tar -xvf /content/ngrok-stable-linux-amd64.tgz
!./ngrok authtoken 2CtRWGgw0ouFALKzNUFLCNlIctX_31GHdnbChyX9LKiW6beyy
!./ngrok http 80

--2023-07-16 23:11:53--  https://bin.equinox.io/c/4VmDzA7iaHb/ngrok-stable-linux-amd64.tgz
Resolving bin.equinox.io (bin.equinox.io)... 52.202.168.65, 54.161.241.46, 18.205.222.128, ...
Connecting to bin.equinox.io (bin.equinox.io)|52.202.168.65|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 13856790 (13M) [application/octet-stream]
Saving to: ‘ngrok-stable-linux-amd64.tgz.2’


2023-07-16 23:11:55 (16.5 MB/s) - ‘ngrok-stable-linux-amd64.tgz.2’ saved [13856790/13856790]

tar: /content/ngrok-stable-linux-amd64.tgz: Cannot open: No such file or directory
tar: Error is not recoverable: exiting now
/bin/bash: ./ngrok: No such file or directory
/bin/bash: ./ngrok: No such file or directory


In [None]:
from flask_ngrok import run_with_ngrok
from flask import Flask, request, jsonify

Функции обработки текста запроса и определения его сентимента

In [None]:
def preproc_text(txt):
    """Preprocess request text exactly as the training data was preprocessed.

    Delegates to ``preprocess_text`` so that training-time and serving-time
    preprocessing cannot drift apart. As a side effect the input is now coerced
    with ``str()``, which the earlier copy of this function did not do.

    Returns the space-separated string of normalised tokens.
    """
    return preprocess_text(txt)

In [None]:
def define_sentiment(text):
    """Predict the sentiment label of a single text.

    The text is preprocessed, converted to a padded id sequence of length
    ``max_len`` and passed to ``model_1`` as a batch of one.

    Returns one of the strings 'negative', 'neutral' or 'positive'. Previously
    a trailing comma turned the first two results into one-element tuples, so
    the return type depended on the predicted class.
    """
    # Index order matches the label encoding used for training:
    # negative=0, neutral=1, positive=2.
    labels = ('negative', 'neutral', 'positive')

    data_text = preproc_text(text)
    batch = np.asarray([text_to_sequence(data_text, max_len)], dtype=np.int32)
    probabilities = model_1.predict(batch)
    # The batch holds exactly one sample; take its class id as a plain int.
    class_id = int(np.argmax(probabilities, axis=1)[0])
    return labels[class_id]

In [None]:
# Request handlers and Flask start-up
app = Flask(__name__)
run_with_ngrok(app) #start ngrok when app runs


@app.route("/", methods = ["GET"])
def general():
  """Landing route; returns a fixed greeting string."""
  return "Welcome to prediction process"

@app.route("/predict", methods = ["POST"])
def predict():
  """Classify the sentiment of the `comment_text` field of a JSON request body.

  Responds with the JSON-encoded result of `define_sentiment`, or with HTTP 400
  and an `error` message when the body is not a JSON object, the field is
  missing, or its value is not a string.
  """
  # silent=True yields None instead of raising on a missing or invalid JSON body.
  request_json = request.get_json(silent=True)
  if not isinstance(request_json, dict) or "comment_text" not in request_json:
    return jsonify({"error": "JSON body with a 'comment_text' field is required"}), 400

  # A falsy value (null, "") is treated as empty text, as it was before.
  comment_text = request_json["comment_text"] or ""
  if not isinstance(comment_text, str):
    return jsonify({"error": "'comment_text' must be a string"}), 400

  predictions = define_sentiment(comment_text)
  return jsonify(predictions)

if __name__ == '__main__':
  app.run()


 * Serving Flask app '__main__'
 * Debug mode: off


 * Running on http://127.0.0.1:5000
INFO:werkzeug:[33mPress CTRL+C to quit[0m


 * Running on http://53d1-34-125-78-238.ngrok.io
 * Traffic stats available on http://127.0.0.1:4040


INFO:werkzeug:127.0.0.1 - - [17/Jul/2023 00:03:53] "POST /predict HTTP/1.1" 200 -


Код запроса клиента

In [None]:
#import requests
#import urllib.request
#import json

In [None]:
#пример данных
#data = (
#   "D'aww! He matches this background colour I'm seemingly stuck with. Thanks.  (talk) 21:51, January 11, 2016 (UTC)"
#)

In [None]:
#формируем запрос
#def send_json(x):

#  comment_text = x

#  body = {
#     'comment_text' : comment_text
#  }

#  myurl = 'http://53d1-34-125-78-238.ngrok.io' + '/predict'
#  headers = {'content-type': 'application/json; charset=utf-8'}
#  response = requests.post(myurl, json=body, headers=headers)
#  return response.json()

In [None]:
#обращение к серверу из одного набора
#response = send_json(data)
#print('предсказание', response)