In [None]:
import numpy as np
import pandas as pd

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding,LSTM
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import load_model

from sklearn.model_selection import train_test_split

import re
import nltk
import string
from string import punctuation
from nltk.corpus import stopwords
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import WordNetLemmatizer

# from warnings import filterwarnings
# filterwarnings('ignore')

In [None]:
# Fetch the NLTK resources needed by the tokenization and stop-word steps.
nltk.download("punkt")
# Newer NLTK releases (3.8.2 / 3.9 onward) load the sentence-tokenizer models
# from "punkt_tab" instead of the pickled "punkt" package; without it
# word_tokenize() raises LookupError.
nltk.download("punkt_tab")
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# English stop words from the NLTK corpus, kept as a list and displayed below.
stop_word_list = stopwords.words('english')
# Word/punctuation tokenizer instance; no active code in the visible cells uses it.
WPT = nltk.WordPunctTokenizer()
stop_word_list

In [None]:
# WPT

In [None]:
# Load the review data from the Excel workbook (relative path, resolved against
# the current working directory). Later cells read its "comments" and "rating"
# columns.
df = pd.read_excel("comments.xlsx")
df

In [None]:
# def cleaning(data):
#     #1. Tokenize
#     text_tokens = word_tokenize(data.replace("'", "").lower())
#     #2. Remove Puncs and numbers
#     tokens_without_punc = [w for w in text_tokens if w.isalpha()]
#     #3. Removing Stopwords
#     tokens_without_sw = [t for t in tokens_without_punc if t not in stop_words]
#     #4. lemma
#     text_cleaned = [WordNetLemmatizer().lemmatize(t) for t in tokens_without_sw]
#     #joining
#     return " ".join(text_cleaned)

In [None]:
# Characters removed from every comment. A raw string is used because "\." in a
# regular string literal is an invalid escape sequence (a warning on current
# Python versions). The pattern itself is unchanged.
# NOTE(review): the trailing "Ÿ˜Š" looks like mojibake left over from an
# emoji -- confirm these are the intended characters.
PUNCTUATION_PATTERN = re.compile(r'[,\.!?:()"Ÿ˜Š]')


def clean_comment(raw_comment):
    """Return the comment without the listed punctuation, lower-cased and trimmed.

    The value is passed through ``str()`` first, so a missing cell (NaN) turns
    into the literal text ``"nan"``; a later cell filters on that literal.
    """
    return PUNCTUATION_PATTERN.sub('', str(raw_comment)).lower().strip()


# One pass over the column instead of three separate apply() calls.
df["comments"] = df["comments"].apply(clean_comment)
# Remove stop words from every comment.
# Membership tests against a frozenset are O(1); testing each word against the
# list re-scanned the whole stop-word list for every single token.
_STOP_WORD_SET = frozenset(stop_word_list)


def token(comment):
    """Return ``comment`` with English stop words removed.

    The text is split with NLTK's ``word_tokenize`` and the remaining tokens
    are joined back together with single spaces.
    """
    words = nltk.tokenize.word_tokenize(comment)
    return " ".join(word for word in words if word not in _STOP_WORD_SET)


df["comments"] = df["comments"].apply(token)

In [None]:
# Preview the first rows of the cleaned comment text.
df["comments"].head()

In [None]:
# Keep only the two modelling columns and drop rows whose comment was missing
# (str() turned NaN into the literal "nan" during cleaning).
# .copy() makes df an independent frame, so the later assignment to
# df['rating'] does not raise SettingWithCopyWarning.
df = df.loc[df["comments"] != "nan", ['comments', 'rating']].copy()
df.head()

In [None]:
# (rows, columns) remaining after the missing-comment filter.
df.shape

In [None]:
# Distribution of the rating values before they are mapped to binary labels.
df.rating.value_counts()

In [None]:
# Inspect the rating column itself.
df.rating

In [None]:
# Binary labels stored as strings: ratings 5 and 4 -> '1', ratings 1 and 2 -> '0'.
# Rating 3 maps to NaN so those rows can be dropped afterwards.
rating_to_label = {
    **dict.fromkeys((5, 4), '1'),
    **dict.fromkeys((1, 2), '0'),
    3: np.nan,
}
df['rating'] = df['rating'].map(rating_to_label)

In [None]:
# Label counts after the mapping, including the NaN bucket.
df['rating'].value_counts(dropna=False)

In [None]:
# Drop every row that contains NaN, which removes the rows whose rating mapped
# to NaN. Re-binding instead of inplace=True avoids mutating a frame that was
# derived by slicing (SettingWithCopyWarning).
df = df.dropna()

In [None]:
# Confirm that no NaN labels remain after the drop.
df['rating'].value_counts(dropna=False)

In [None]:
# Show every cleaned comment as a plain Python list.
df["comments"].values.tolist()

In [None]:
# Display the frame that feeds the train/test split.
df

In [None]:
# Plain Python lists of the texts and their labels, aligned by position.
comments = df["comments"].tolist()
sentiments = df['rating'].tolist()

# Hold out 15% of the samples; the split is stratified on the labels and uses
# a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    comments,
    sentiments,
    test_size=0.15,
    random_state=53,
    stratify=sentiments,
)

In [None]:
from collections import Counter

In [None]:
# Cleaned comments that make up the corpus.
df["comments"]

In [None]:
# Concatenate all comments into one space-separated string for word counting.
all_words = " ".join(df["comments"])

In [None]:
# Peek at the first 100 characters of the joined corpus.
all_words[:100]

In [None]:
# Count how often each token occurs across the whole corpus.
counter = Counter(word_tokenize(all_words))  # token -> number of occurrences
counter

In [None]:
# Tokens ordered from most to least frequent.
counter.most_common()

In [None]:
# Cap the vocabulary used when converting texts to sequences (num_words=10000,
# selected by word frequency).
tokenizer = Tokenizer(num_words = 10000)
# Build the vocabulary from the training comments only; fitting on the full
# corpus would leak test-set vocabulary and word frequencies into the features.
tokenizer.fit_on_texts(X_train)
tokenizer.word_index

In [None]:
# Replace every comment by its list of integer word indices from the fitted
# tokenizer.
X_train_tokens, X_test_tokens = (
    tokenizer.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [None]:
# Token count of every comment: training comments first, then test comments.
num_tokens = np.array(
    [len(sequence) for sequence in (*X_train_tokens, *X_test_tokens)]
)
num_tokens

In [None]:
# Sequence length used for padding/truncation: mean token count plus two
# standard deviations, truncated to an integer.
max_tokens = int(num_tokens.mean() + 2 * num_tokens.std())
max_tokens

In [None]:
# Token counts again, for comparison with max_tokens.
num_tokens

In [None]:
# Share of comments that fit into max_tokens without truncation. The comparison
# is inclusive because pad_sequences(maxlen=max_tokens) only cuts sequences that
# are longer than max_tokens.
np.mean(num_tokens <= max_tokens)

In [None]:
# The network needs fixed-length input, so every sequence is padded or
# truncated to max_tokens entries.
X_train_pad, X_test_pad = [
    pad_sequences(token_ids, maxlen=max_tokens)
    for token_ids in (X_train_tokens, X_test_tokens)
]

In [None]:
# Example: the padded sequence of the training comment at position 15.
X_train_pad[15]

In [None]:
# A function needs to be written in order for tokenized words to become strings again.

# idx = tokenizer.word_index
# inverse_map = dict(zip(idx.values(), idx.keys()))
# # convert a tokenized sentence back into a string
# def tokens_to_string(tokens):
#     words = [inverse_map[token] for token in tokens if token !=0]
#     text = ' '.join(words)
#     return text

In [None]:
# Length of the dense vector learned for every word index.
embedding_size = 50

# Stacked-LSTM binary classifier, assembled from a list of layers.
model = Sequential([
    # Embedding matrix of shape (10000, embedding_size): one row per word index.
    Embedding(input_dim=10000,
              output_dim=embedding_size,
              input_length=max_tokens,
              name='embedding_layer'),
    # return_sequences=True hands the output of every time step to the next LSTM.
    LSTM(units=16, return_sequences=True),
    LSTM(units=8, return_sequences=True),
    # Last recurrent layer: return_sequences is left at its default, so only
    # the final output (4 values) is forwarded.
    LSTM(units=4),
    # Single sigmoid unit producing the score for the binary label.
    Dense(1, activation='sigmoid'),
])

# binary_crossentropy matches the two-class setup (categorical_crossentropy
# would be used for more classes); the 'adam' alias selects the optimizer with
# its default settings.
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

In [None]:
# Print the layer-by-layer summary of the compiled model.
model.summary()

In [None]:
# Validation can be performed according to the situation

In [None]:
# Convert the string labels ('0' / '1') into an integer column vector of shape
# (n_samples, 1). The builtin int is used because the np.int alias was removed
# in NumPy 1.24. astype(int) also accepts an already-numeric array, so
# re-running this cell is safe.
y_train = np.asarray(y_train).astype(int).reshape(-1, 1)
y_train

In [None]:
# Shape of the padded training matrix.
X_train_pad.shape

In [None]:
# Train for 50 epochs (passes over the training set) in mini-batches of 16.
history = model.fit(
    x=X_train_pad,
    y=np.array(y_train),
    batch_size=16,
    epochs=50,
)

In [None]:
# Per-epoch values of the loss and the accuracy metric recorded during training.
history.history

In [None]:
# Tabulate the per-epoch training metrics and plot them; the trailing semicolon
# suppresses the plot object's repr in the cell output.
model_loss = pd.DataFrame(model.history.history)
model_loss.plot();

In [None]:
# Done