<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/JobsMessageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libs

In [2]:
%%capture
!pip install nltk
!pip install emoji --upgrade
!pip install catboost

In [3]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import re
import emoji
import string

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, reuters
from nltk.stem.snowball import SnowballStemmer
from pymystem3 import Mystem


In [4]:
nltk.download('stopwords')
nltk.download("reuters")

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package reuters to /root/nltk_data...


True

## Freeze seeds

In [5]:
np.random.seed(42)

## Get the dataset

In [6]:
text_data = pd.read_excel('msg_type.xlsx')

## Tokenization

In [7]:
tokenizer = TweetTokenizer()

## Stemming

In [8]:
stemmer = SnowballStemmer("russian")
stemmer_eng = SnowballStemmer("english")

## Lemmaization

In [9]:
mystem = Mystem()

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


In [10]:
stemmer_eng.stem('python')#.analyze('python')

'python'

## Vectorize using TFIDF

In [11]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))

## Split dataset to parameters and encode target labels

In [12]:
df = text_data.copy()
#df['category'] = df['category'].replace({'ads': 'message', 'project': 'vacancy'})

In [13]:
df['category']

0          ads
1          ads
2          ads
3          ads
4          ads
        ...   
475    vacancy
476    vacancy
477    vacancy
478    vacancy
479    vacancy
Name: category, Length: 480, dtype: object

###  Label encode categories

In [14]:
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df['category'])

### Split into params and target values

In [15]:
X = df['text']
y = encoded_target

## Perform transformation on df

In [16]:
def remove_emoji(text: str) -> str:
    return emoji.replace_emoji(text, " ")


def remove_links(text: str) -> str:
    return re.sub(r"http\S+", " ", text, flags=re.MULTILINE)


def remove_usernames_and_emails(text: str) -> str:
    """Удалеяет юзернеймы и email"""
    return re.sub(r"\S*@\S*", " ", text, flags=re.MULTILINE)


def remove_punctuation(text: str) -> str:
    """Удаляем символы пунктуации"""
    return "".join([ch if ch not in string.punctuation else " " for ch in text])


def remove_numbers(text: str) -> str:
    """Удаляем числа"""
    return "".join([i if not i.isdigit() else " " for i in text])


def remove_multiple_spaces(text: str) -> str:
    """Удаляем двойные (и более) пробелы"""
    return re.sub(r"\s+", " ", text, flags=re.I)

In [17]:
def get_prep_text(text: str) -> str:
  return remove_multiple_spaces(
      remove_numbers(
          remove_punctuation(
              remove_usernames_and_emails(
                  remove_links(
                      remove_emoji(text)
                      )
                  )
              )
          )
      )

In [18]:
#our new dataset with stemmed lemmatized and later vectorized texts
stemmed_lemma_txts = []

for text in X:
  tok = tokenizer.tokenize(get_prep_text(text).lower())
  stem_tok = [stemmer.stem(token) for token in tok]
  # lem_tok = [lem for lem in mystem.lemmatize(" ".join(stem_tok)) if not lem.isspace()]
  # stemmed_lemma_txts.append(' '.join(lem_tok))
  stemmed_lemma_txts.append(' '.join(stem_tok))

df['text_lemm'] = stemmed_lemma_txts

### TFIDF Vectorize

In [19]:
tfidfd = tfidf_vectorizer.fit_transform(stemmed_lemma_txts)

## Split dataset

pass tfidf'd and transfromed data instead of texts as X

In [20]:
X_train, X_test, y_train, y_test = train_test_split(tfidfd, y, stratify=y, test_size=0.25, random_state=42)

# Model

## Create and train baseline model

In [21]:
model = LogisticRegression(C=0.004)
model.fit(X_train, y_train)

## Predict

In [22]:
y_pred = model.predict(X_test)

## Evaluate

### Accuracy

In [23]:
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.85


### Report

In [24]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.73      0.79        30
           1       0.81      0.97      0.88        30
           2       0.88      0.73      0.80        30
           3       0.88      0.97      0.92        30

    accuracy                           0.85       120
   macro avg       0.85      0.85      0.85       120
weighted avg       0.85      0.85      0.85       120



## Sum it up

### Catboost

In [25]:
cat_model = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.04, loss_function='MultiClass', verbose=False)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))


<catboost.core.CatBoostClassifier at 0x7fc4b537fbe0>

In [26]:
y_pred = cat_model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print(f"Accuracy: {accuracy:.2f}")
class_names = ['ads','message', 'project', 'vacancy']
report = classification_report(y_test, y_pred, target_names=class_names)
print("Classification Report:")
print(report)

Accuracy: 0.82
Classification Report:
              precision    recall  f1-score   support

         ads       0.91      0.67      0.77        30
     message       0.74      0.97      0.84        30
     project       0.81      0.73      0.77        30
     vacancy       0.84      0.90      0.87        30

    accuracy                           0.82       120
   macro avg       0.83      0.82      0.81       120
weighted avg       0.83      0.82      0.81       120



In [27]:
def test_model(model):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  acc = accuracy_score(y_test, y_pred)
  print('Accuracy:', acc)
  print("Classification Report:")
  print(classification_report(y_test, y_pred))


In [28]:
log_model = LogisticRegression(C=0.004)
nb_model = MultinomialNB()
svm_model = SVC(kernel='linear', random_state=42, gamma="auto", probability=True)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier()
knn_model = KNeighborsClassifier(n_neighbors=5)

ensemble = VotingClassifier(estimators=[
                              ('rf', rf_model),
                              ('svm', svm_model),
                              ('xgb', xgb_model)],
                            voting='soft')        # soft for probability-based voting

rfensemble = VotingClassifier(estimators=[
                              ('rf', rf_model),
                              ('svm', svm_model),
                              ('rf1', RandomForestClassifier(n_estimators=100, random_state=42))],
                            voting='hard')

models = [log_model, nb_model, svm_model, rf_model, xgb_model, knn_model, ensemble,rfensemble]
model_names = ['logreg', 'bayes', 'SVM', 'RandomForest', 'XGB', 'KNN', 'Ensemble', 'RF_ensemble']

In [29]:
for i in range(len(models)):
  print(model_names[i])
  print()
  test_model(models[i])
  print('\n\n')

logreg

Accuracy: 0.85
Classification Report:
              precision    recall  f1-score   support

           0       0.85      0.73      0.79        30
           1       0.81      0.97      0.88        30
           2       0.88      0.73      0.80        30
           3       0.88      0.97      0.92        30

    accuracy                           0.85       120
   macro avg       0.85      0.85      0.85       120
weighted avg       0.85      0.85      0.85       120




bayes

Accuracy: 0.775
Classification Report:
              precision    recall  f1-score   support

           0       0.71      0.90      0.79        30
           1       0.89      0.53      0.67        30
           2       0.70      0.70      0.70        30
           3       0.85      0.97      0.91        30

    accuracy                           0.78       120
   macro avg       0.79      0.78      0.77       120
weighted avg       0.79      0.78      0.77       120




SVM

Accuracy: 0.858333333333333

In [30]:
label_encoder.inverse_transform([0, 1])

array(['ads', 'message'], dtype=object)

In [31]:
df['text'][0]

'Ищу экспертов, кто хочет продавать свои услуги быстро и легко 🔥\n\nПривет, я Олеся - графический и веб-дизайнер экспертов. Я создаю лаконичные, с правильной структурой и собственным вайбом сайты для экспертов на Taplink. \n\nВся нужная и важная информация собрана в одном месте, чтобы ваш клиент проходил минимальный путь к покупке 💸\n\nЧтобы узнать подробнее пиши в лс "СОТРУДНИЧЕСТВО"🤍'

## One Element

In [32]:
df.iloc[50]

text         Ищешь копирайтера и PR специалиста? \n\nПривет...
category                                                   ads
text_lemm    ищеш копирайтер и pr специалист привет мен зов...
Name: 50, dtype: object

In [33]:
mywords = []
text = df['text'][50]#'Продам гараж. Мопед не мой. Я просто разместил объяву.'
tk = tokenizer.tokenize(get_prep_text(text).lower())
stk = [stemmer.stem(token) for token in tk]
lmtk = [lem for lem in mystem.lemmatize(" ".join(stk)) if not lem.isspace()]
mywords.append(' '.join(lmtk))
wrd = tfidf_vectorizer.transform(mywords)
rf_model.predict(wrd)

array([0])

#### Add Stacking of models

In [34]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='linear', random_state=42, gamma="auto", probability=True)),
     ('logreg', LogisticRegression(C=0.0045)),
    ('xgb', xgb.XGBClassifier())]
final_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)

stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_model)
test_model(stacking_model)

Accuracy: 0.875
Classification Report:
              precision    recall  f1-score   support

           0       0.89      0.83      0.86        30
           1       0.85      0.93      0.89        30
           2       0.86      0.80      0.83        30
           3       0.90      0.93      0.92        30

    accuracy                           0.88       120
   macro avg       0.88      0.88      0.87       120
weighted avg       0.88      0.88      0.87       120



## Implement as pipeline

In [35]:
class iPipe:
  def __init__(self, estimator):
    self.estimator = estimator
    self.model = None

  def prepipe(self, text):
    tk = tokenizer.tokenize(get_prep_text(text).lower())
    stk = [stemmer.stem(token) for token in tk]
    return ' '.join(stk)


  def train(self, X_train, y_train):
    self.model = Pipeline([
        ('tfidf', TfidfVectorizer(preprocessor=self.prepipe)),
        ('clf', self.estimator)])

    self.model.fit(X_train, y_train)
    return self.model


  def predict(self, x):
    return self.model.predict(x)


  def evaluate(self, X_test, y_test):
    y_pred = self.predict(X_test)
    print('predicted')
    accuracy = accuracy_score(y_test, y_pred)
    print("Accuracy:", accuracy)
    print("Classification Report:")
    print(classification_report(y_test, y_pred))


  def fit_predict(self, df):
    X = df['text']
    y = df['category']
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)
    self.train(X_train, y_train)
    print('trained')
    self.evaluate(X_test, y_test)

In [36]:
pipe = iPipe(RandomForestClassifier(n_estimators=100, random_state=42, max_depth=6))
pipe.fit_predict(text_data)

trained
predicted
Accuracy: 0.84375
Classification Report:
              precision    recall  f1-score   support

         ads       0.94      0.71      0.81        24
     message       0.75      1.00      0.86        24
     project       0.89      0.71      0.79        24
     vacancy       0.85      0.96      0.90        24

    accuracy                           0.84        96
   macro avg       0.86      0.84      0.84        96
weighted avg       0.86      0.84      0.84        96



In [37]:
wrd = ['Для мегастартапа новый гугл ищу датасатаниста. \
        плачу 300000 баксов\наносек. надо все и сразу - подготовка, обработка, написание моделей, \
        создание красивеньких графиков и продажа данных. Нужен мидл с запросами джуна и компетенциями сеньора. \
        тимлид и техлид обязанности обязательно. возможно придется искать клиентов, презентовать им работу и вести \
        документации предприятия. Иногда мыть полы. Не задаваться вопросом зачем я нужен в проекте.']
pipe.predict(wrd)

array(['project'], dtype=object)

## Add Word2Vec

In [38]:
%%capture
!pip install gensim

In [39]:
import gensim.downloader as api
from sklearn.base import BaseEstimator, TransformerMixin

In [40]:
#api.info()
word2vec_model = api.load('word2vec-ruscorpora-300')



In [41]:
convlist='''A       ADJ
ADV     ADV
ADVPRO  ADV
ANUM    ADJ
APRO    DET
COM     ADJ
CONJ    SCONJ
INTJ    INTJ
NONLEX  X
NUM     NUM
PART    PART
PR      ADP
S       NOUN
SPRO    PRON
UNKN    X
V       VERB'''.split()
conv_dict = {}
for i in range(0, len(convlist), 2):
    keyword = convlist[i]
    value = convlist[i + 1]
    conv_dict[keyword] = value


In [108]:
def tag(word='пожар'):
    processed = mystem.analyze(word)[0]
    if not ('analysis' in processed):
      return 'unknown'
    elif not processed['analysis']:
      return word
    lemma = processed["analysis"][0]["lex"].lower().strip()
    pos = processed["analysis"][0]["gr"].split(',')[0]
    pos = pos.split('=')[0].strip()
    tagged = lemma+'_'+conv_dict[pos]

    return tagged

In [80]:
tag('привет') in word2vec_model

[{'lex': 'привет', 'wt': 1, 'gr': 'S,муж,неод=(вин,ед|им,ед)'}]


True

In [92]:
tag('cpfsdm')

[]


'unknown'

In [72]:
word2vec_model.key_to_index['искать_VERB']

368

In [73]:
w0 = df['text'][0].split()[0]
print(f'"{w0}" in corpora: {w0 in word2vec_model}')
print(f'tag "{tag(w0)}" in corpora: {tag(w0) in word2vec_model}')

"Ищу" in corpora: False
[{'lex': 'искать', 'wt': 1, 'gr': 'V,несов,пе=непрош,ед,изъяв,1-л'}]
[{'lex': 'искать', 'wt': 1, 'gr': 'V,несов,пе=непрош,ед,изъяв,1-л'}]
tag "искать_VERB" in corpora: True


Might need to try and train w2v from zero or solve the _VERB _NOUN shyte of rus corpora

UPD: No need. converting the word using stemma into base form and adding parts of speech type (with conversion) worked

#### Add english corpora for english words

In [46]:
en_mbeddings = api.load("glove-wiki-gigaword-300")



In [47]:
sum(en_mbeddings['unknown'])

7.2751107837684685

#### Create custom class for using word2vec in pipeline

In [85]:
class Word2VecTrans(BaseEstimator, TransformerMixin):
    def __init__(self, word2vec_model):
        self.word2vec_model = word2vec_model

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        embeddings = []
        for text in X:
            tokens = text.split()  # Split text into tokens
            text_embeddings = []
            for token in tokens:
                token = tag(token)
                if token in self.word2vec_model:
                    text_embeddings.append(self.word2vec_model[token])
                elif token in en_mbeddings:
                    text_embeddings.append(en_mbeddings[token])
            embeddings.append(text_embeddings)
        return embeddings

#### Create custom class for preprocessing stuff before pass it to word2vec

In [49]:
class Preprocessor(BaseEstimator, TransformerMixin):
    def __init__(self, preprocessor):
        self.preprocessor = preprocessor

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.preprocessor(text) for text in X]


In [50]:
def preprocess(text):
  tk = tokenizer.tokenize(get_prep_text(text).lower())
  return ' '.join(tk)

#### Create a separate class for w2v pipeline

In [127]:
class W2VPipe(iPipe):
  def __init__(self, preprocessor, w2v, estimator):
    super().__init__(estimator)
    self.preprocessor = preprocessor
    self.w2v = w2v

  def train(self, X_train, y_train):
    self.model = Pipeline([
        #('preprocess', self.preprocessor),
        #('w2v', self.w2v),
        ('clf', self.estimator)])

    self.model.fit(X_train, y_train)
    return self.model

  def fit_predict(self, X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42, test_size=0.2)
    self.train(X_train, y_train)
    print('trained')
    self.evaluate(X_test, y_test)

#### Gather it all together

In [152]:
word2vec_model.vector_size

300

In [259]:
w2v_transformer = Word2VecTrans(word2vec_model)
preprop = Preprocessor(preprocessor=preprocess)
svm_w2v = SVC(kernel='linear', random_state=42, gamma="auto", probability=True, C=5000)

In [260]:
w2v = w2v_transformer.transform(preprop.transform(text_data['text']))

In [261]:
wmaxlen = max(len(x) for x in w2v)
for entry in w2v:
  while len(entry) < wmaxlen:
    entry.append(0.0)
avg_w2v = [np.mean(emb, axis=0) if emb else np.zeros(word2vec_model.vector_size) for emb in w2v]


  arr = asanyarray(a)


In [267]:
wX_train, wX_test, wy_train, wy_test = train_test_split(avg_w2v, text_data['category'], random_state=42, stratify=y, test_size=0.2)

In [263]:
wsvm = svm_w2v.fit(wX_train, wy_train)
wy_pred = svm_w2v.predict(wX_test)
waccuracy = accuracy_score(wy_test, wy_pred)
print("Accuracy:", waccuracy)
print("Classification Report:")
print(classification_report(wy_test, wy_pred))

Accuracy: 0.84375
Classification Report:
              precision    recall  f1-score   support

         ads       0.82      0.75      0.78        24
     message       0.88      0.96      0.92        24
     project       0.87      0.83      0.85        24
     vacancy       0.80      0.83      0.82        24

    accuracy                           0.84        96
   macro avg       0.84      0.84      0.84        96
weighted avg       0.84      0.84      0.84        96



#### Add fastText for multilanguage word2vec

In [264]:
fasttext_model = api.load("fasttext-wiki-news-subwords-300")



In [None]:
ft_transformer = Word2VecTrans(fasttext_model)
ftpreprop = Preprocessor(preprocessor=preprocess)

In [266]:
ft = w2v_transformer.transform(preprop.transform(text_data['text']))
ftmaxlen = max(len(x) for x in ft)
for entry in ft:
  while len(entry) < ftmaxlen:
    entry.append(np.zeros(fasttext_model.vector_size))
favg_w2v = [np.mean(emb, axis=0) if emb else np.zeros(fasttext_model.vector_size) for emb in ft]
ftX_train, ftX_test, fty_train, fty_test = train_test_split(favg_w2v, text_data['category'], random_state=42, stratify=y, test_size=0.2)


In [296]:
svm_ft = SVC(kernel='linear', random_state=42, gamma="auto", probability=True, C=6000)
ft_forest = RandomForestClassifier(max_depth=5, random_state=42, n_estimators=100)

In [297]:
for model_ft in (svm_ft, ft_forest):
  wsvm = model_ft.fit(ftX_train, fty_train)
  fty_pred = model_ft.predict(ftX_test)
  ftaccuracy = accuracy_score(fty_test, fty_pred)
  print("Accuracy:", ftaccuracy)
  print("Classification Report:")
  print(classification_report(fty_test, fty_pred))

Accuracy: 0.84375
Classification Report:
              precision    recall  f1-score   support

         ads       0.82      0.75      0.78        24
     message       0.88      0.96      0.92        24
     project       0.87      0.83      0.85        24
     vacancy       0.80      0.83      0.82        24

    accuracy                           0.84        96
   macro avg       0.84      0.84      0.84        96
weighted avg       0.84      0.84      0.84        96

Accuracy: 0.75
Classification Report:
              precision    recall  f1-score   support

         ads       0.71      0.62      0.67        24
     message       0.87      0.83      0.85        24
     project       0.69      0.75      0.72        24
     vacancy       0.73      0.79      0.76        24

    accuracy                           0.75        96
   macro avg       0.75      0.75      0.75        96
weighted avg       0.75      0.75      0.75        96



## Neural networks solution

#### import NN stuff

In [298]:
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import pad_sequences, to_categorical

### LSTM

#### Prerocessing

Split data

In [None]:
df

In [167]:
def k_process(df):
    X = df['text'].copy()
    y = df['category'].copy()
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    return train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

Tokenize

In [168]:
def k_tokenize(x1, x2):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x1)

  x_train = tokenizer.texts_to_sequences(x1)
  x_test = tokenizer.texts_to_sequences(x2)

  return tokenizer, x_train, x_test

Add padding

In [169]:
def k_padme(x1, x2, maxlen=100):
  x_train = pad_sequences(x1, maxlen=maxlen, padding='post')
  x_test = pad_sequences(x2, maxlen=maxlen, padding='post')
  return x_train, x_test

#### Create model

In [170]:
class LSTMClassifier:
  def __init__(self, vocab_size, embedding_dim, maxlen):
    self.vocab = vocab_size
    self.embedding_dim = embedding_dim
    self.maxlen = maxlen
    self.model = self._create_model()

  def _create_model(self)->Sequential:
    '''creates lstm with embedding, rnn and classificator'''
    model = Sequential()
    model.add(Embedding(input_dim=self.vocab, output_dim=self.embedding_dim, input_length=self.maxlen))
    model.add(LSTM(units=64))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


  def fit(self, X_train:np.array, y_train:np.array, batch_size:int=64, epochs:int=5)->None:
    '''trains model'''

    self.model.fit(X_train, y_train.astype(int), epochs=epochs, batch_size=batch_size)


  def predict(self, X_test:np.array)->np.array:
    '''predics values for validation and test'''
    print("PREDICTING")
    X_test_padded = pad_sequences(X_test, maxlen=self.maxlen)
    return self.model.predict(X_test_padded)


  def evaluate(self, pred:np.array, y_test:np.array)->None:
    '''reports statistics'''
    loss, accuracy = self.model.evaluate(pred, y_test)
    pred = (pred > 0.5).astype(int)
    print(f"Loss: {loss}\nAccuracy: {accuracy}")
    print(classification_report(y_test, pred))


  def fit_predict(self, X_train, X_test, y_train, y_test, batch=64, epochs=5):
    '''fits data into model, predicts values and reports sttistics'''

    self.fit(X_train, y_train, batch, epochs)
    y_pred = self.predict(X_test)
    print('\n\n\n\n\ntest binary\n\n\n\n')
    # Convert predicted values to binary values (0 or 1)
    y_pred_binary = (y_pred > 0.5).astype(int)

    self.evaluate(y_pred_binary, y_test)

#### Run it

In [171]:
kX_train, kX_test, ky_train, ky_test = k_process(df)
k_tokenizer, kX_train, kX_test = k_tokenize(kX_train, kX_test)
kX_train, kXtest = k_padme(kX_train, kX_test)

kX_train = np.array(kX_train)
kX_test = np.array(kX_test)
ky_train = np.array(ky_train)
ky_test = np.array(ky_test)


  kX_test = np.array(kX_test)


In [172]:
k_vocab_size = len(k_tokenizer.word_index) + 1    # num of unique words in all texts +1  for padding token
k_embedding_dim = 100                             #vectors representing a word
maxlen = 100                                      # sequence length after tokenization. too long - cut it

In [None]:
kLSTM = LSTMClassifier(k_vocab_size, k_embedding_dim, maxlen)
kLSTM.fit_predict(kX_train, kX_test, ky_train, ky_test, epochs=10)

## Try multiclass

In [299]:
class M_LSTMTextClassifier:
    def __init__(self, num_classes=4, vocab_size=10000, embedding_dim=128, lstm_units=128):
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.model = self.build_model()


    def build_model(self):
        model = Sequential()
        model.add(Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim))
        model.add(LSTM(self.lstm_units, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(self.num_classes, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model


    def preprocess_data(self, text_data, target):
        tokenizer = Tokenizer(num_words=self.vocab_size)
        tokenizer.fit_on_texts(text_data)
        sequences = tokenizer.texts_to_sequences(text_data)
        word_index = tokenizer.word_index

        max_sequence_length = max([len(seq) for seq in sequences])
        padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

        one_hot_target = tf.keras.utils.to_categorical(target, self.num_classes)

        return padded_sequences, one_hot_target


    def train(self, X_train, y_train, epochs=10, batch_size=32, validation_split=0.2):
        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history


    def evaluate(self, X_test, y_test):
        return self.model.evaluate(X_test, y_test)

In [300]:
def lstm_X_y(df):
  mtext_data = df['text']# .drop('category', axis=1) causes dimension problem when passed to the Sequential gotta find a way to solve it if added nontext cols
  mtarget = df['category']
  return mtext_data, mtarget

def get_encoded_lstm(mtarget):
  mlabel_encoder = LabelEncoder()
  y = mlabel_encoder.fit_transform(mtarget)
  return mlabel_encoder, y


def split_lstm(params, target):
  return train_test_split(params, target, test_size=0.2, random_state=42)

In [301]:
def train_lstm(X_train, y_train, classnum, epochs = 10):
  classifier = M_LSTMTextClassifier(num_classes=classnum, vocab_size=10000)
  # Preprocess and train the model
  X_train_processed, y_train_processed = classifier.preprocess_data(X_train, y_train)
  classifier.train(X_train_processed, y_train_processed, epochs=epochs)
  return classifier


def test_lstm(classifier, label_encoder, mX_test, my_test):
  X_test_processed, y_test_processed = classifier.preprocess_data(mX_test, my_test)
  loss, accuracy = classifier.evaluate(X_test_processed, y_test_processed)
  print(f"Test Loss: {loss:.4f}, Test Accuracy: {accuracy:.4f}")

  # Convert predicted labels to original class names
  y_pred = classifier.model.predict(X_test_processed)
  predicted_labels = np.argmax(y_pred, axis=1)
  predicted_class_names = label_encoder.inverse_transform(predicted_labels)

  # Generate classification report
  classification_rep = classification_report(y_test_processed.argmax(axis=1), predicted_labels, target_names=label_encoder.classes_)
  print("Classification Report:")
  print(classification_rep)
  return


In [302]:
mX, my = lstm_X_y(text_data)
multi_encoder, my_labels = get_encoded_lstm(my)
mX_train, mX_test, my_train, my_test = split_lstm(mX, my_labels)

mclassifier = train_lstm(mX_train, my_train, len(multi_encoder.classes_), 10)

test_lstm(mclassifier, multi_encoder, mX_test, my_test)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
Test Loss: 1.3704, Test Accuracy: 0.4688
Classification Report:
              precision    recall  f1-score   support

         ads       0.40      0.34      0.37        29
     message       0.50      0.39      0.44        18
     project       0.47      0.61      0.53        23
     vacancy       0.52      0.54      0.53        26

    accuracy                           0.47        96
   macro avg       0.47      0.47      0.47        96
weighted avg       0.47      0.47      0.46        96

