<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/JobsMessageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libs

In [1]:
!pip install nltk
!pip install catboost

Collecting catboost
  Downloading catboost-1.2-cp310-cp310-manylinux2014_x86_64.whl (98.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m98.6/98.6 MB[0m [31m8.1 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: catboost
Successfully installed catboost-1.2


In [2]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb

from catboost import CatBoostClassifier
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from nltk.stem.snowball import SnowballStemmer
from pymystem3 import Mystem

In [3]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

## Freeze seeds

In [4]:
np.random.seed(42)

## Get the dataset

In [6]:
text_data = pd.read_excel('text_data.xlsx')

In [8]:
text_data

Unnamed: 0,text,category
0,"Ищу экспертов, кто хочет продавать свои услуги...",ads
1,Продаю места в совсем свежем канале. \n\nНазва...,ads
2,🗽Вам нужен Адвокат?\nBausat Union предлагает с...,ads
3,"Я юрист, но если нужен хороший адвокат, то дам...",ads
4,#ищу #продюсер #эксперт #запуски #прогревы\n\n...,ads
...,...,...
475,В кафе Чебуречная в Парке Сокольники открыта в...,vacancy
476,"#ищу#копирайтер\n\n❗️ВАКАНСИЯ: ""Копирайтер / с...",vacancy
477,"#ищу#копирайтер\n\n❗️ВАКАНСИЯ: ""Копирайтер / с...",vacancy
478,#ищу #ассистент\n\nКомпания TURDZEN в поисках ...,vacancy


## Tokenization

In [7]:
tokenizer = TweetTokenizer()

## Stemming

In [9]:
stemmer = SnowballStemmer("russian")

## Lemmaization

In [10]:
mystem = Mystem()

Installing mystem to /root/.local/bin/mystem from http://download.cdn.yandex.net/mystem/mystem-3.1-linux-64bit.tar.gz


## Vectorize using TFIDF

In [11]:
tfidf_vectorizer = TfidfVectorizer(stop_words=stopwords.words('russian'))

## Split dataset to parameters and encode target labels

In [12]:
df = text_data.copy()

###  Label encode categories

In [14]:
label_encoder = LabelEncoder()
encoded_target = label_encoder.fit_transform(df['category'])

### Split into params and target values

In [16]:
X = df['text']
y = encoded_target

## Perform transformation on df

In [113]:
#our new dataset with stemmed lemmatized and later vectorized texts
stemmed_lemma_txts = []

for text in X:
  tok = tokenizer.tokenize(text.lower())
  stem_tok = [stemmer.stem(token) for token in tok]
  lem_tok = [lem for lem in mystem.lemmatize(" ".join(stem_tok)) if not lem.isspace()]
  stemmed_lemma_txts.append(' '.join(lem_tok))

### TFIDF Vectorize

In [114]:
tfidfd = tfidf_vectorizer.fit_transform(stemmed_lemma_txts)

## Split dataset

pass tfidf'd and transfromed data instead of texts as X

In [115]:
X_train, X_test, y_train, y_test = train_test_split(tfidfd, y, stratify=y, test_size=0.2, random_state=42)

# Model

## Create and train baseline model

In [20]:
model = LogisticRegression(C=0.004)
model.fit(X_train, y_train)

## Predict

In [21]:
y_pred = model.predict(X_test)

## Evaluate

### Accuracy

In [22]:
acc = accuracy_score(y_test, y_pred)
print('Accuracy:', acc)

Accuracy: 0.8645833333333334


### Report

In [23]:
print("Classification Report:")
print(classification_report(y_test, y_pred))

Classification Report:
              precision    recall  f1-score   support

           0       0.86      0.79      0.83        24
           1       0.82      0.96      0.88        24
           2       0.89      0.71      0.79        24
           3       0.89      1.00      0.94        24

    accuracy                           0.86        96
   macro avg       0.87      0.86      0.86        96
weighted avg       0.87      0.86      0.86        96



## Sum it up

### Catboost

In [128]:
model = CatBoostClassifier(iterations=400, depth=6, learning_rate=0.1, loss_function='MultiClass', verbose=False)
model.fit(X_train, y_train, eval_set=(X_test, y_test))


<catboost.core.CatBoostClassifier at 0x7aa23e526680>

In [129]:
y_pred = model.predict(X_test)
accuracy = accuracy_score(y_pred, y_test)
print(f"Accuracy: {accuracy:.2f}")
class_names = ['0','1',"2","3"]
report = classification_report(y_test, y_pred, target_names=class_names)
print("Classification Report:")
print(report)

Accuracy: 0.81
Classification Report:
              precision    recall  f1-score   support

           0       0.83      0.62      0.71        24
           1       0.70      0.88      0.78        24
           2       0.91      0.83      0.87        24
           3       0.85      0.92      0.88        24

    accuracy                           0.81        96
   macro avg       0.82      0.81      0.81        96
weighted avg       0.82      0.81      0.81        96



In [25]:
def test_model(model):
  model.fit(X_train, y_train)
  y_pred = model.predict(X_test)
  acc = accuracy_score(y_test, y_pred)
  print('Accuracy:', acc)
  print("Classification Report:")
  print(classification_report(y_test, y_pred))

In [120]:
log_model = LogisticRegression(C=0.004)
nb_model = MultinomialNB()
svm_model = SVC(kernel='linear', random_state=42, probability=True)
rf_model = RandomForestClassifier(n_estimators=100, random_state=42)
xgb_model = xgb.XGBClassifier()
knn_model = KNeighborsClassifier(n_neighbors=5)

ensemble = VotingClassifier(estimators=[
                              ('rf', rf_model),
                              ('svm', svm_model),
                              ('xgb', xgb_model)],
                            voting='soft')        # soft for probability-based voting

models = [log_model, nb_model, svm_model, rf_model, xgb_model, knn_model, ensemble]
model_names = ['LogReg', 'Nayive Bayes', 'SupportVectorMachine', 'RandomForest', 'XGB', 'KNN', 'Ensemble(rf, svm, xgb)']

In [121]:
for mod in range(len(models)):
  print(model_names[mod])
  print()
  test_model(models[mod])
  print('\n\n\n')

LogReg

Accuracy: 0.84375
Classification Report:
              precision    recall  f1-score   support

           0       0.81      0.71      0.76        24
           1       0.76      0.92      0.83        24
           2       0.90      0.75      0.82        24
           3       0.92      1.00      0.96        24

    accuracy                           0.84        96
   macro avg       0.85      0.84      0.84        96
weighted avg       0.85      0.84      0.84        96





Nayive Bayes

Accuracy: 0.78125
Classification Report:
              precision    recall  f1-score   support

           0       0.72      0.96      0.82        24
           1       0.86      0.50      0.63        24
           2       0.76      0.67      0.71        24
           3       0.83      1.00      0.91        24

    accuracy                           0.78        96
   macro avg       0.79      0.78      0.77        96
weighted avg       0.79      0.78      0.77        96





SupportVectorMachi

In [118]:
rf_model_1 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_2 = RandomForestClassifier(n_estimators=100, random_state=42)
rf_model_3 = RandomForestClassifier(n_estimators=100, random_state=42)

rf_model_1.fit(X_train, y_train)
rf_model_2.fit(X_train, y_train)
rf_model_3.fit(X_train, y_train)

ensemble = VotingClassifier(estimators=[
    ('rf_1', rf_model_1),
    ('rf_2', rf_model_2),
    ('rf_3', rf_model_3)],
    voting='hard')  # 'hard' for majority voting

test_model(ensemble)

Accuracy: 0.8854166666666666
Classification Report:
              precision    recall  f1-score   support

           0       1.00      0.71      0.83        24
           1       0.80      1.00      0.89        24
           2       0.91      0.88      0.89        24
           3       0.88      0.96      0.92        24

    accuracy                           0.89        96
   macro avg       0.90      0.89      0.88        96
weighted avg       0.90      0.89      0.88        96



## Neural networks solution

#### import NN stuff

In [124]:
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.utils import pad_sequences

### LSTM

#### Prerocessing

Split data

In [125]:
def k_process(df):
  X = df['text'].copy()
  y = df['category'].copy()
  return train_test_split(X, y, stratify=y, test_size=0.2, random_state=42)

Tokenize

In [126]:
def k_tokenize(x1, x2):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(x1)

  x_train = tokenizer.text_to_seq(x1)
  x_test = tokenizer.text_to_seq(x2)

  return tokenizer, x_train, x_test

Add padding

In [127]:
def k_padme(x1, x2, maxlen=100):
  x_train = pad_sequences(x1, maxlen=maxlen, padding='post')
  x_test = pad_sequences(x2, maxlen=maxlen, padding='post')
  return x_train, x_test

#### Create model

In [None]:
class LSTMClassifier:
  def __init__(self, vocab_size, embedding_dim=100, maxlen=100):
    self.vocab = vocab_size
    self.embedding_dim = embedding_dim
    self.maxlen = maxlen
    self.model = self._create_model()

  def _create_model(self)->Sequential:
    '''creates lstm with embedding, rnn and classificator'''
    model = Sequential()
    model.add(Embedding(input_dim=self.vocab), output_dim=self.embedding_dim, input_length=self.maxlen)
    model.add(LSTM(units=64))
    model.add(Dense(units=1, activation='sigmoid'))
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return model


  def fit(self, X_train:np.array, y_train:np.array, batch_size:int=64, epochs:int=5)->None:
    '''trains model'''
    self.model.fit(X_train, y_train, epochs=epochs, batch_size=batch_size)


  def predict(self, X_test:np.array)->np.array:
    '''predics values for validation and test'''
    return self.model.predict(X_test)


  def evaluate(self, pred:np.array, y_test:np.array)->None:
    '''reports statistics'''
    loss, accuracy = self.model.evaluate(pred, y_test)
    pred = (pred > 0.5).astype(int)
    print(f"Loss: {loss}\nAccuracy: {accuracy}")
    print(classification_report(y_test, pred))


  def fit_predict(self, X_train, X_test, y_train, y_test, batch=64, epochs=5):
    '''fits data into model, predicts values and reports sttistics'''
    self.fit(X_train, y_train, batch, epochs)
    y_pred = self.predcit(X_test)
    self.evaluate(y_pred, y_test)

#### Run it

In [None]:
kX_train, kX_test, ky_train, ky_test = k_process(text_data)
k_tokenizer, kX_train, kX_test = k_tokenize(kX_train, kX_test)
kX_train, kXtest = k_padme(kX_train, kX_test)

In [None]:
k_vocab_size = len(k_tokenizer.word_index) + 1    # num of unique words in all texts +1  for padding token
k_embedding_dim = 100                             #vectors representing a word
maxlen = 200                                      # sequence length after tokenization. too long - cut it

In [None]:
kLSTM = LSTMClassifier(k_vocab_size, k_embedding_dim, maxlen)
kLSTM.fit_predict(kX_train, kX_test, ky_train, ky_test)