In [29]:
import pickle
import itertools
import random
from collections import Counter
import joblib
import keras
import pandas as pd
import numpy as np
import re
from keras import Sequential, optimizers
from keras.layers import Embedding, Conv1D, MaxPooling1D, Flatten, Dense, Bidirectional, LSTM
from keras.utils.np_utils import to_categorical
from nltk.corpus import stopwords
from parsivar import Normalizer
import emoji
import emojies
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.preprocessing import LabelEncoder

In [30]:
class CleanText:
    """Step-by-step cleaner for one text column of a DataFrame.

    Every cleaning method rewrites ``self.cln_list`` (a list of strings, one
    per row) and returns that same list, so the caller picks which steps to
    run and in which order.

    Side effect: creating an instance truncates ``data_frame[column_name]`` to
    ``MAX_CHARS`` characters *in place* (see ``__new__``), so the caller's
    frame is modified as well.
    """

    # Maximum number of characters kept from each text.
    MAX_CHARS = 400

    def __new__(cls, data_frame, column_name, *args, **kwargs):
        """Truncate the column in place, then allocate the instance.

        Runs before ``__init__``, so ``__init__`` already sees truncated text.
        Every value in the column must support slicing (i.e. be a string).
        """
        limit = cls.MAX_CHARS
        data_frame[column_name] = data_frame[column_name].apply(lambda text: text[:limit])
        # object.__new__ takes no extra arguments, so none are forwarded.
        return super().__new__(cls)

    def __init__(self, data_frame, column_name):
        """Copy the (already truncated) column into ``self.cln_list``."""
        self.cln_list = data_frame[column_name].tolist()

    def clean_punctual(self):
        """Replace URLs, @mentions, numbers and ``#``/``_``/``:`` with a space.

        Returns:
            The updated ``self.cln_list``.
        """
        patterns = (
            r'https?:\S*',       # URLs
            r'@[A-Za-z0-9]\S+',  # @mentions
            r'[0-9]\S+',         # tokens that start with an ASCII digit
            # Fixed: this previously read '/d+', which matched a literal "/d"
            # and left standalone digits in the text. '\d' also covers
            # non-ASCII decimal digits in str patterns.
            r'#|_|:|\d+',
        )
        cleaned = self.cln_list
        for pattern in patterns:
            compiled = re.compile(pattern)
            cleaned = [compiled.sub(' ', text) for text in cleaned]
        self.cln_list = cleaned
        return self.cln_list

    def normalize_text(self):
        """Normalise every text and squeeze repeated characters.

        Each text is passed through parsivar's ``Normalizer`` (created with
        ``pinglish_conversion_needed=True``); afterwards every run of
        identical consecutive characters is collapsed to a single character.
        Note that this also shortens legitimate doubles (e.g. "100" -> "10").

        Returns:
            The updated ``self.cln_list``.
        """
        normalizer = Normalizer(pinglish_conversion_needed=True)
        normalized = [normalizer.normalize(text) for text in self.cln_list]
        self.cln_list = [
            ''.join(char for char, _ in itertools.groupby(text))
            for text in normalized
        ]
        return self.cln_list

    def remove_stop_words(self):
        """Drop whitespace-separated tokens found in the stop-word list.

        The list is read from the NLTK stopwords corpus under the id
        ``'RD_persian_01'`` (presumably a custom file installed locally;
        verify it exists in the NLTK data directory).

        Returns:
            The updated ``self.cln_list``.
        """
        stop_words = set(stopwords.words('RD_persian_01'))
        self.cln_list = [
            ' '.join(word for word in text.split() if word not in stop_words)
            for text in self.cln_list
        ]
        return self.cln_list

    def extract_emojis(self):
        """Surround every emoji character with spaces so it becomes a token.

        Returns:
            The updated ``self.cln_list``.
        """
        # NOTE(review): ``emoji.UNICODE_EMOJI`` appears to have been removed
        # in emoji 2.x; confirm the installed version before upgrading.
        emoji_table = emoji.UNICODE_EMOJI['en']
        self.cln_list = [
            ''.join((' ' + char + ' ') if char in emoji_table else char for char in text)
            for text in self.cln_list
        ]
        return self.cln_list

    def convert_emojies(self):
        """Pass every text through ``emojies.replace``.

        ``emojies`` is a module imported by this notebook; presumably it maps
        emoji to words. Check that module for the exact mapping.

        Returns:
            The updated ``self.cln_list``.
        """
        self.cln_list = [emojies.replace(text) for text in self.cln_list]
        return self.cln_list

    def frequency_words(self, top_n=4000, verbose=True):
        """Keep only the ``top_n`` most frequent tokens across all texts.

        Args:
            top_n: Size of the vocabulary to keep (default 4000, the value
                that used to be hard-coded).
            verbose: When true (default, matching the earlier behaviour),
                print the vocabulary size and the vocabulary itself.

        Returns:
            The updated ``self.cln_list``.
        """
        counts = Counter(" ".join(self.cln_list).split())
        # most_common orders by descending count; ties keep first-seen order.
        most_common_word = {word for word, _ in counts.most_common(top_n)}
        if verbose:
            print(len(most_common_word))
            print(most_common_word)
        self.cln_list = [
            ' '.join(word for word in text.split() if word in most_common_word)
            for text in self.cln_list
        ]
        return self.cln_list

In [31]:
class EncodeText:
    """Encode texts as fixed-length integer sequences with a fitted tokenizer."""

    def __init__(self, train_text):
        """Store the tokenizer used by ``encode_text``.

        Args:
            train_text: Despite the name, the notebook passes the fitted
                tokenizer here. Any object exposing ``texts_to_sequences`` is
                used directly. For any other value the module-level
                ``tokenizer`` is used, which is what this constructor always
                did before.
        """
        if hasattr(train_text, 'texts_to_sequences'):
            # Use the object that was actually passed in instead of relying
            # on a global that only exists if an earlier cell has been run.
            self.tokenizer = train_text
        else:
            # Legacy fallback: requires a global ``tokenizer`` to be defined.
            self.tokenizer = tokenizer

    def encode_text(self, input_list, max_length):
        """Convert texts to integer sequences padded to ``max_length``.

        Args:
            input_list: Iterable of texts to encode.
            max_length: Length passed to ``pad_sequences`` as ``maxlen``.

        Returns:
            The result of ``pad_sequences`` with ``padding='post'``.
        """
        # integer encode
        encoded = self.tokenizer.texts_to_sequences(input_list)
        # pad encoded sequences
        padded = pad_sequences(encoded, maxlen=max_length, padding='post')
        return padded


In [32]:
# Directory holding the trained tokenizer and model. Both artefacts are read
# from here; previously the model path had no drive letter, so it resolved
# against whatever drive the kernel was started on.
# TODO: make this configurable instead of a machine-specific absolute path.
MODEL_DIR = r'F:/sourcecode/sentiment_analysis_01/model/2_cat_softmax_posneg/cnn+bilstm'

# Input captions to classify (expects a 'caption' column further down).
data_df = pd.read_excel('posnegtest.xlsx', index_col=False)

# Load tokenizer
# WARNING: pickle.load can execute arbitrary code; only load trusted files.
with open(MODEL_DIR + '/CNN_BiLSTM_6_tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

# Load model
filename = MODEL_DIR + '/CNN_BiLSTM_6_model.h5'
model = keras.models.load_model(filename)

In [33]:
# Build the cleaner for the 'caption' column. Note that constructing CleanText
# truncates data_df['caption'] to 400 characters in place.
tmp_call_cleantext = CleanText(data_df, 'caption')
# The steps below each rewrite the cleaner's internal list, so their order
# matters. clean_punctual, remove_stop_words and frequency_words are not run here.
# 1) put spaces around emoji characters so they become separate tokens
tmp_get_ex_emoji = tmp_call_cleantext.extract_emojis()
# 2) pass each text through emojies.replace
tmp_get_emoji_list = tmp_call_cleantext.convert_emojies()
# 3) parsivar normalisation + collapsing of repeated characters; this final
#    list is what gets encoded for the model
tmp_get_norm_list = tmp_call_cleantext.normalize_text()

In [34]:
# Sequence length used for padding; presumably this has to match the length
# the model was trained with (confirm against the training notebook).
max_len = 100

call_encodetext = EncodeText(tokenizer)
encode_text = call_encodetext.encode_text(tmp_get_norm_list, max_len)

# Predict all rows in one batched call instead of calling model.predict once
# per row on a nested Python list.
if len(encode_text):
    predictions = model.predict(encode_text)
    # Keep one (1, n_classes) array per row, the same shape the per-row call
    # produced, so the text written to Excel keeps its "[[p0 p1]]" layout.
    # Assumes the model has a single output array.
    list_a = [row.reshape(1, -1) for row in predictions]
else:
    # Nothing to classify: skip the model call entirely.
    list_a = []

result_df = pd.DataFrame()
result_df['text'] = data_df['caption'].copy()
result_df['my_model'] = list_a
result_df.to_excel('result_df.xlsx', index=False)

In [35]:
# Read the exported predictions back; each 'my_model' cell is handled as the
# text form of a nested array (e.g. "[[p0 p1]]").
abs_df = pd.read_excel('result_df.xlsx')
list_b = abs_df['my_model']

# Drop the two leading and two trailing characters (the double brackets),
# split on single spaces and discard the empty pieces left by repeated spaces.
list_c = [
    [piece for piece in cell[2:-2].split(" ") if piece]
    for cell in list_b
]

# Remove newline characters from each piece.
list_d = [[re.sub('\n', '', piece) for piece in pieces] for pieces in list_c]

# Convert to floats and take the position of the first maximum per row.
list_e = [[float(piece) for piece in pieces] for pieces in list_d]
list_f = [scores.index(max(scores)) for scores in list_e]

# Attach the predicted class index and overwrite the exported file.
result_df['split_my_model'] = list_f
result_df.to_excel('result_df.xlsx', index=False)

