<a href="https://colab.research.google.com/github/Dimildizio/DS_course/blob/main/Neural_networks/NLP/Text_classification/JobsMessageClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import libs

In [6]:
import matplotlib.pyplot as plt
import nltk
import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix, accuracy_score, classification_report
from sklearn.model_selection import train_test_split
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
from snowballstemmer import stemmer
from pymystem3 import Mystem

## Get the dataset

In [None]:
# Load the raw dataset from a CSV file located relative to the working directory.
# Later cells read its 'message' and 'label' columns.
text_data = pd.read_csv('textdata.csv')

## Tokenization

In [None]:
# NLTK TweetTokenizer instance; used later to split each lowercased message into tokens.
tokenizer = TweetTokenizer()

## Stemming

In [None]:
# Snowball stemmer instance configured for Russian (built by the `stemmer` factory).
stemmer_russian = stemmer("russian")

## Lemmatization

In [None]:
# Mystem instance; its lemmatize() method is applied to the stemmed text in a later cell.
mystem = Mystem()

## Vectorize using TFIDF

In [None]:
# The NLTK stop-word corpus is a separate download and is not shipped with the
# nltk package itself. Fetch it on first use so this cell survives
# Restart & Run All on a fresh environment (e.g. a new Colab runtime).
try:
  russian_stopwords = stopwords.words('russian')
except LookupError:
  nltk.download('stopwords', quiet=True)
  russian_stopwords = stopwords.words('russian')

# TF-IDF vectorizer configured with the Russian stop-word list.
tfidf_vectorizer = TfidfVectorizer(stop_words=russian_stopwords)

## Split dataset into features and labels

In [None]:
# Work on a copy so the frame loaded from disk stays untouched.
df = text_data.copy()
# Message texts are the model input; labels are the prediction target.
X, y = df['message'], df['label']

## Perform transformation on df

In [None]:
# Normalized corpus: each message is lowercased, tokenized, stemmed and
# lemmatized, then re-joined into one space-separated string.
# These strings are vectorized with TF-IDF in a later cell.
stemmed_lemma_txts = []

for text in X:
  tok = tokenizer.tokenize(text.lower())
  # Use the Russian stemmer instance: `stemmer` is the snowballstemmer factory
  # function imported at the top and has no `stemWord` method.
  stem_tok = [stemmer_russian.stemWord(token) for token in tok]
  # Drop whitespace-only items from Mystem's output before re-joining.
  lem_tok = [lem for lem in mystem.lemmatize(" ".join(stem_tok)) if not lem.isspace()]
  stemmed_lemma_txts.append(' '.join(lem_tok))

## TFIDF Vectorize

In [None]:
# Learn the vocabulary/IDF weights from the normalized texts and convert them
# into a TF-IDF feature matrix (one row per message, in the same order as X).
# NOTE(review): the vectorizer is fit on the full corpus, before the train/test
# split below, so test messages influence the IDF weights — confirm this is acceptable.
tfidfd = tfidf_vectorizer.fit_transform(stemmed_lemma_txts)

## Split dataset into train and test sets

In [None]:
# Split the TF-IDF feature matrix, not the raw message strings in X: the model
# needs numeric features, and the rows of `tfidfd` are in the same order as `y`.
# Stratify on the labels to keep class proportions equal in both parts;
# 20% of the rows are held out for testing, with a fixed seed for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(tfidfd, y, stratify=y, test_size=0.2, random_state=42)

# Model

## Create and train baseline model

In [None]:
# Baseline classifier: logistic regression with scikit-learn's default settings,
# trained on the training portion of the split.
model = LogisticRegression()
model.fit(X=X_train, y=y_train)

## Predict

In [None]:
# Predict labels for the held-out test set (fixes the `predcit` typo, which
# raised AttributeError).
y_pred = model.predict(X_test)

## Evaluate

### Accuracy

In [None]:
# Share of test messages whose predicted label equals the true label.
acc = accuracy_score(y_true=y_test, y_pred=y_pred)
print('Accuracy:', acc)

### Report

In [None]:
# Per-class precision / recall / F1 summary for the test-set predictions,
# printed under a heading line.
report = classification_report(y_test, y_pred)
print("Classification Report:", report, sep="\n")

### Confusion matrix

In [None]:
# Counts of (true label, predicted label) pairs on the test set.
conf_matrix = confusion_matrix(y_test, y_pred)

# Tick labels shared by both axes.
# NOTE(review): assumes the first class in sorted label order is "Other" and
# the second is "Job Message" — confirm against the dataset's label encoding.
class_names = ["Other", "Job Message"]

plt.figure(figsize=(8, 6))
sns.heatmap(
    conf_matrix,
    annot=True,   # write the count inside every cell
    fmt='d',      # format counts as integers
    cmap='Blues',
    xticklabels=class_names,
    yticklabels=class_names,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("True")
plt.show()