<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/JobsMessageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libs

In [1]:
!pip install nltk
!pip install emoji --upgrade
!pip install catboost

Collecting emoji
  Downloading emoji-2.8.0-py2.py3-none-any.whl (358 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m358.9/358.9 kB[0m [31m3.3 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: emoji
Successfully installed emoji-2.8.0
Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
import re
import emoji
import string

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from pymystem3 import Mystem

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Freeze seeds

In [4]:
np.random.seed(42)

## Get the dataset

In [7]:
text_data = pd.read_excel('msg_type.xlsx')

## Tokenization

In [85]:
tokenizer = TweetTokenizer()

## Stemming

In [9]:
stemmer = SnowballStemmer("russian")

## Lemmaization

In [10]:
mystem = Mystem()

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


## Vectorize using TFIDF

In [86]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))

## Split dataset to parameters and encode target labels

In [157]:
df = text_data.copy()
df['category'] = df['category'].replace({'ads': 'message', 'project': 'vacancy'})

In [158]:
df['category']

0      message
1      message
2      message
3      message
4      message
        ...   
475    vacancy
476    vacancy
477    vacancy
478    vacancy
479    vacancy
Name: category, Length: 480, dtype: object

###  Label encode categories

In [159]:
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df['category'])

### Split into params and target values

In [160]:
X = df['text']
y = encoded_target

## Perform transformation on df

In [161]:
def remove_emoji(text: str) -> str:
    return emoji.replace_emoji(text, " ")


def remove_links(text: str) -> str:
    return re.sub(r"http\S+", " ", text, flags=re.MULTILINE)


def remove_usernames_and_emails(text: str) -> str:
    """Удалеяет юзернеймы и email"""
    return re.sub(r"\S*@\S*", " ", text, flags=re.MULTILINE)


def remove_punctuation(text: str) -> str:
    """Удаляем символы пунктуации"""
    return "".join([ch if ch not in string.punctuation else " " for ch in text])


def remove_numbers(text: str) -> str:
    """Удаляем числа"""
    return "".join([i if not i.isdigit() else " " for i in text])


def remove_multiple_spaces(text: str) -> str:
    """Удаляем двойные (и более) пробелы"""
    return re.sub(r"\s+", " ", text, flags=re.I)

In [162]:
def get_prep_text(text: str) -> str:
  return remove_multiple_spaces(
      remove_numbers(
          remove_punctuation(
              remove_usernames_and_emails(
                  remove_links(
                      remove_emoji(text)
                      )
                  )
              )
          )
      )

In [163]:
#our new dataset with stemmed lemmatized and later vectorized texts
stemmed_lemma_txts = []

for text in X:
  tok = tokenizer.tokenize(get_prep_text(text).lower())
  stem_tok = [stemmer.stem(token) for token in tok]
  # lem_tok = [lem for lem in mystem.lemmatize(" ".join(stem_tok)) if not lem.isspace()]
  # stemmed_lemma_txts.append(' '.join(lem_tok))
  stemmed_lemma_txts.append(' '.join(stem_tok))

df['text_lemm'] = stemmed_lemma_txts

### TFIDF Vectorize

In [164]:
tfidfd = tfidf_vectorizer.fit_transform(stemmed_lemma_txts)

## Split dataset

pass tfidf'd and transfromed data instead of texts as X

In [165]:
X_train, X_test, y_train, y_test = train_test_split(tfidfd, y, stratify=y, test_size=0.25, random_state=42)

# Model

## Create and train baseline model

In [166]:
model = LogisticRegression(C=0.004)
model.fit(X_train, y_train)

## Predict

In [167]:
y_pred = model.predict(X_test)

## Evaluate

### Accuracy

In [168]:
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.9333333333333333


### Report

In [169]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93        60
           1       0.95      0.92      0.93        60

    accuracy                           0.93       120
   macro avg       0.93      0.93      0.93       120
weighted avg       0.93      0.93      0.93       120



## Sum it up

### Catboost

In [170]:
cat_model = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.04, loss_function='MultiClass', verbose=False)
cat_model.fit(X_train, y_train, eval_set=(X_test, y_test))


<catboost.core.CatBoostClassifier at 0x7bb432ea5900>

In [171]:
y_pred = cat_model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print(f"Accuracy: {accuracy:.2f}")
class_names = ['0','1']
report = classification_report(y_test, y_pred, target_names=class_names)
print("Classification Report:")
print(report)

Accuracy: 0.94
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.97      0.94        60
           1       0.96      0.92      0.94        60

    accuracy                           0.94       120
   macro avg       0.94      0.94      0.94       120
weighted avg       0.94      0.94      0.94       120



In [172]:
def test_model(model):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  acc = accuracy_score(y_test, y_pred)
  print('Accuracy:', acc)
  print("Classification Report:")
  print(classification_report(y_test, y_pred))


In [173]:
log_model = LogisticRegression(C=0.004)
nb_model = MultinomialNB()
svm_model = SVC(kernel='linear', random_state=42, gamma="auto", probability=True)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier()
knn_model = KNeighborsClassifier(n_neighbors=5)

ensemble = VotingClassifier(estimators=[
                              ('rf', rf_model),
                              ('svm', svm_model),
                              ('xgb', xgb_model)],
                            voting='soft')        # soft for probability-based voting

rfensemble = VotingClassifier(estimators=[
                              ('rf', rf_model),
                              ('svm', svm_model),
                              ('rf1', RandomForestClassifier(n_estimators=100, random_state=42))],
                            voting='hard')

models = [log_model, nb_model, svm_model, rf_model, xgb_model, knn_model, ensemble,rfensemble]
model_names = ['logreg', 'bayes', 'SVM', 'RandomForest', 'XGB', 'KNN', 'Ensemble', 'RF_ensemble']

In [174]:
for i in range(len(models)):
  print(model_names[i])
  print()
  test_model(models[i])
  print('\n\n')

logreg

Accuracy: 0.9333333333333333
Classification Report:
              precision    recall  f1-score   support

           0       0.92      0.95      0.93        60
           1       0.95      0.92      0.93        60

    accuracy                           0.93       120
   macro avg       0.93      0.93      0.93       120
weighted avg       0.93      0.93      0.93       120




bayes

Accuracy: 0.8916666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.96      0.82      0.88        60
           1       0.84      0.97      0.90        60

    accuracy                           0.89       120
   macro avg       0.90      0.89      0.89       120
weighted avg       0.90      0.89      0.89       120




SVM

Accuracy: 0.9416666666666667
Classification Report:
              precision    recall  f1-score   support

           0       0.93      0.95      0.94        60
           1       0.95      0.93      0.94        60



In [175]:
label_encoder.inverse_transform([0, 1])

array(['message', 'vacancy'], dtype=object)

#### Add Stacking of models

In [176]:
base_models = [
    ('rf', RandomForestClassifier(n_estimators=100, max_depth=5, random_state=42)),
    ('gb', GradientBoostingClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(kernel='linear', random_state=42, gamma="auto", probability=True)),
     ('logreg', LogisticRegression(C=0.0045)),
    ('xgb', xgb.XGBClassifier())]
final_model = RandomForestClassifier(n_estimators=100, max_depth=7, random_state=42)

stacking_model = StackingClassifier(estimators=base_models, final_estimator=final_model)
test_model(stacking_model)

Accuracy: 0.9583333333333334
Classification Report:
              precision    recall  f1-score   support

           0       0.95      0.97      0.96        60
           1       0.97      0.95      0.96        60

    accuracy                           0.96       120
   macro avg       0.96      0.96      0.96       120
weighted avg       0.96      0.96      0.96       120



## Neural networks solution

#### import NN stuff

In [177]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import pad_sequences

### LSTM

#### Prerocessing

Split data

In [178]:
df

Unnamed: 0,text,category,text_lemm
0,"Ищу экспертов, кто хочет продавать свои услуги...",message,ищ эксперт кто хочет продава сво услуг быстр и...
1,Продаю места в совсем свежем канале. \n\nНазва...,message,прода мест в совс свеж канал назван анекдотн с...
2,🗽Вам нужен Адвокат?\nBausat Union предлагает с...,message,вам нуж адвокат bausat union предлага след усл...
3,"Я юрист, но если нужен хороший адвокат, то дам...",message,я юрист но есл нуж хорош адвокат то дам конечн...
4,#ищу #продюсер #эксперт #запуски #прогревы\n\n...,message,ищ продюсер эксперт запуск прогрев ищ продюсер...
...,...,...,...
475,В кафе Чебуречная в Парке Сокольники открыта в...,vacancy,в каф чебуречн в парк сокольник открыт ваканс ...
476,"#ищу#копирайтер\n\n❗️ВАКАНСИЯ: ""Копирайтер / с...",vacancy,ищ копирайтер ваканс копирайтер создател стат ...
477,"#ищу#копирайтер\n\n❗️ВАКАНСИЯ: ""Копирайтер / с...",vacancy,ищ копирайтер ваканс копирайтер создател стат ...
478,#ищу #ассистент\n\nКомпания TURDZEN в поисках ...,vacancy,ищ ассистент компан turdzen в поиск ассистент ...


In [179]:
def k_process(df):
    X = df['text'].copy()
    y = df['category'].copy()
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    return train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

Tokenize

In [180]:
def k_tokenize(x1, x2):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x1)

  x_train = tokenizer.texts_to_sequences(x1)
  x_test = tokenizer.texts_to_sequences(x2)

  return tokenizer, x_train, x_test

Add padding

In [181]:
def k_padme(x1, x2, maxlen=100):
  x_train = pad_sequences(x1, maxlen=maxlen, padding='post')
  x_test = pad_sequences(x2, maxlen=maxlen, padding='post')
  return x_train, x_test

#### Create model

In [182]:
class LSTMClassifier:
  def __init__(self, vocab_size, embedding_dim, maxlen):
    self.vocab = vocab_size
    self.embedding_dim = embedding_dim
    self.maxlen = maxlen
    self.model = self._create_model()

  def _create_model(self)->Sequential:
    '''creates lstm with embedding, rnn and classificator'''
    model = Sequential()
    model.add(Embedding(input_dim=self.vocab, output_dim=self.embedding_dim, input_length=self.maxlen))
    model.add(LSTM(units=64))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


  def fit(self, X_train:np.array, y_train:np.array, batch_size:int=64, epochs:int=5)->None:
    '''trains model'''

    self.model.fit(X_train, y_train.astype(int), epochs=epochs, batch_size=batch_size)


  def predict(self, X_test:np.array)->np.array:
    '''predics values for validation and test'''
    print("PREDICTING")
    X_test_padded = pad_sequences(X_test, maxlen=self.maxlen)
    return self.model.predict(X_test_padded)


  def evaluate(self, pred:np.array, y_test:np.array)->None:
    '''reports statistics'''
    loss, accuracy = self.model.evaluate(pred, y_test)
    pred = (pred > 0.5).astype(int)
    print(f"Loss: {loss}\nAccuracy: {accuracy}")
    print(classification_report(y_test, pred))


  def fit_predict(self, X_train, X_test, y_train, y_test, batch=64, epochs=5):
    '''fits data into model, predicts values and reports sttistics'''

    self.fit(X_train, y_train, batch, epochs)
    y_pred = self.predict(X_test)
    print('\n\n\n\n\ntest binary\n\n\n\n')
    # Convert predicted values to binary values (0 or 1)
    y_pred_binary = (y_pred > 0.5).astype(int)

    self.evaluate(y_pred_binary, y_test)

#### Run it

In [183]:
kX_train, kX_test, ky_train, ky_test = k_process(df)
k_tokenizer, kX_train, kX_test = k_tokenize(kX_train, kX_test)
kX_train, kXtest = k_padme(kX_train, kX_test)

kX_train = np.array(kX_train)
kX_test = np.array(kX_test)
ky_train = np.array(ky_train)
ky_test = np.array(ky_test)


  kX_test = np.array(kX_test)


In [184]:
k_vocab_size = len(k_tokenizer.word_index) + 1    # num of unique words in all texts +1  for padding token
k_embedding_dim = 100                             #vectors representing a word
maxlen = 100                                      # sequence length after tokenization. too long - cut it

In [185]:
kLSTM = LSTMClassifier(k_vocab_size, k_embedding_dim, maxlen)
kLSTM.fit_predict(kX_train, kX_test, ky_train, ky_test, epochs=10)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10
PREDICTING





test binary








Loss: 0.6907978057861328
Accuracy: 0.5520833134651184
              precision    recall  f1-score   support

           0       0.86      0.12      0.22        48
           1       0.53      0.98      0.69        48

    accuracy                           0.55        96
   macro avg       0.69      0.55      0.45        96
weighted avg       0.69      0.55      0.45        96



## Try multiclass

In [None]:
from keras.utils import to_categorical


In [None]:
def mk_process(df):
    X = df['text'].copy()
    y = df['category'].copy()
    label_encoder = LabelEncoder()
    y = label_encoder.fit_transform(y)
    y = to_categorical(y, num_classes=4)
    return train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

In [203]:
class MultiLSTMClassifier(LSTMClassifier):

    def _create_model(self) -> Sequential:
        '''creates lstm with embedding, rnn and classificator'''
        model = Sequential()
        model.add(Embedding(input_dim=self.vocab, output_dim=self.embedding_dim, input_length=self.maxlen))
        model.add(LSTM(units=64))
        model.add(Dense(units=4, activation='softmax'))
        model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
        return model

    def fit(self, X_train: np.array, y_train: np.array, batch_size: int = 64, epochs: int = 5) -> None:
        y_train = mky_train.reshape(-1, 1)
        y_train = to_categorical(y_train, num_classes=4)
        '''trains model'''
        self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)

    def fit_predict(self, X_train, X_test, y_train, y_test, batch=64, epochs=5):
        '''fits data into model, predicts values and reports statistics'''
        self.fit(X_train, y_train, batch, epochs)
        print('fitted')
        y_pred = self.predict(X_test)
        # Convert predicted probabilities to predicted class indices
        y_pred_classes = np.argmax(y_pred, axis=1)

        self.evaluate(y_pred_classes, y_test)

    def evaluate(self, pred: np.array, y_test: np.array) -> None:
        '''reports statistics'''
        y_pred_classes = np.argmax(pred, axis=1)
        y_test_classes = np.argmax(y_test, axis=1)

        print(classification_report(y_test_classes, y_pred_classes))


In [204]:
mkX_train, mkX_test, mky_train, mky_test = mk_process(text_data)
mk_tokenizer, mkX_train, mkX_test = k_tokenize(mkX_train, mkX_test)
mkX_train, mkXtest = k_padme(mkX_train, mkX_test)

mk_vocab_size = len(mk_tokenizer.word_index) + 1    # num of unique words in all texts +1  for padding token
mk_embedding_dim = 100                             #vectors representing a word
mmaxlen = 100                                      # sequence length after tokenization. too long - cut it

mkX_train = np.array(mkX_train)
mkX_test = np.array(mkX_test)
mky_train = np.array(mky_train)
mky_test = np.array(mky_test)

mky_train = to_categorical(mky_train, num_classes=4)
mky_test = to_categorical(mky_test, num_classes=4)

multi_kLSTM = MultiLSTMClassifier(mk_vocab_size, mk_embedding_dim, mmaxlen)
multi_kLSTM.fit_predict(mkX_train, mkX_test, mky_train, mky_test)

  mkX_test = np.array(mkX_test)


ValueError: ignored

In [207]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from sklearn.model_selection import train_test_split

class LSTMTextClassifier:
    def __init__(self, num_classes=4, vocab_size=10000, embedding_dim=128, lstm_units=128):
        self.num_classes = num_classes
        self.vocab_size = vocab_size
        self.embedding_dim = embedding_dim
        self.lstm_units = lstm_units
        self.model = self.build_model()

    def build_model(self):
        model = Sequential()
        model.add(Embedding(input_dim=self.vocab_size, output_dim=self.embedding_dim))
        model.add(LSTM(self.lstm_units, dropout=0.2, recurrent_dropout=0.2))
        model.add(Dense(self.num_classes, activation='softmax'))

        model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
        return model

    def preprocess_data(self, text_data, target):
        tokenizer = Tokenizer(num_words=self.vocab_size)
        tokenizer.fit_on_texts(text_data)
        sequences = tokenizer.texts_to_sequences(text_data)
        word_index = tokenizer.word_index

        max_sequence_length = max([len(seq) for seq in sequences])
        padded_sequences = pad_sequences(sequences, maxlen=max_sequence_length)

        one_hot_target = tf.keras.utils.to_categorical(target, self.num_classes)

        return padded_sequences, one_hot_target

    def train(self, X_train, y_train, epochs=10, batch_size=32, validation_split=0.1):
        history = self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size, validation_split=validation_split)
        return history

    def evaluate(self, X_test, y_test):
        return self.model.evaluate(X_test, y_test)


mtext_data = text_data['text']
mtarget = text_data['category']

mlabel_encoder = LabelEncoder()
m_labels = mlabel_encoder.fit_transform(mtarget)

mX_train, mX_test, my_train, my_test = train_test_split(mtext_data, m_labels, test_size=0.2, random_state=42)

classifier = LSTMTextClassifier(num_classes=4, vocab_size=10000)
mX_train_processed, my_train_processed = classifier.preprocess_data(mX_train, my_train)
classifier.train(mX_train_processed, my_train_processed, epochs=10)


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7bb432f8fcd0>

In [209]:
mX_test_processed, my_test_processed = classifier.preprocess_data(mX_test, my_test)
mloss, maccuracy = classifier.evaluate(mX_test_processed, my_test_processed)
print(f"Test Loss: {mloss:.4f}, Test Accuracy: {maccuracy:.4f}")

# Convert predicted labels to original class names
my_pred = classifier.model.predict(mX_test_processed)
mpredicted_labels = np.argmax(my_pred, axis=1)
mpredicted_class_names = mlabel_encoder.inverse_transform(mpredicted_labels)

# Generate classification report
mclassification_rep = classification_report(my_test_processed.argmax(axis=1), mpredicted_labels, target_names=mlabel_encoder.classes_)
print("Classification Report:")
print(mclassification_rep)

Test Loss: 1.3737, Test Accuracy: 0.4792
Classification Report:
              precision    recall  f1-score   support

         ads       0.33      0.07      0.11        29
     message       0.45      0.50      0.47        18
     project       0.46      0.48      0.47        23
     vacancy       0.52      0.92      0.67        26

    accuracy                           0.48        96
   macro avg       0.44      0.49      0.43        96
weighted avg       0.44      0.48      0.42        96

