Случайное предсказание в нашем случае не имеет значения, тк у нас многоклассовая классификация на 100 классов и вероятность предсказания класса стремится к 1/100. По этой же причине не имеет смысла предсказание самого частого класса.

## TF-IDF

In [93]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import MinMaxScaler
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import FunctionTransformer
from sklearn.preprocessing import StandardScaler
import numpy as np

In [2]:
df_train = pd.read_parquet('df_train.pq')
df_test = pd.read_parquet('df_test.pq')

In [71]:
df_train['text_cleaned'] = df_train['text'].str.replace(r'\r|\n', ' ', regex=True)
df_test['text_cleaned'] = df_test['text'].str.replace(r'\r|\n', ' ', regex=True)

In [75]:
df_train.shape, df_test.shape

((100, 40), (50, 40))

In [76]:
df_train.columns

Index(['author', 'text_', 'cnt_sent', 'text_len', 'text', 'text_len2',
       'words_cnt', 'wrds_sent_cnt', 'cnt_words_unique', 'unwords_words',
       'median_word_length', 'mean_word_length', 'max_word_length',
       'words_symbols', 'words_dots', 'words_commas', 'words_excls',
       'words_questions', 'words_semicolons', 'words_colons', 'words_dashs',
       'words_aposts', 'words_ellipsis', 'words_quots', 'cnt_adv_freq',
       'cnt_swadesh_freq', 'cnt_word_eng', 'prc_wrds_not_eng', 'uniq_word_cnt',
       'cnt_punct_frq', 'lex_div', 'tfidf_keywords', 'pos_frq', 'pos_cnt',
       'ent_frq', 'ent_cnt', 'uchars_frq', 'uchars_cnt', 'fk_score',
       'text_cleaned'],
      dtype='object')

In [97]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100 entries, 0 to 99
Data columns (total 41 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   author              100 non-null    object 
 1   text_               100 non-null    object 
 2   cnt_sent            100 non-null    int64  
 3   text_len            100 non-null    int64  
 4   text                100 non-null    object 
 5   text_len2           100 non-null    int64  
 6   words_cnt           100 non-null    int64  
 7   wrds_sent_cnt       100 non-null    float64
 8   cnt_words_unique    100 non-null    int64  
 9   unwords_words       100 non-null    float64
 10  median_word_length  100 non-null    float64
 11  mean_word_length    100 non-null    float64
 12  max_word_length     100 non-null    int64  
 13  words_symbols       100 non-null    float64
 14  words_dots          100 non-null    float64
 15  words_commas        100 non-null    float64
 16  words_exc

In [77]:
df_test = df_test[df_test['author'].isin(df_train['author'])]

In [78]:
vectorizer = TfidfVectorizer(min_df=0.05, max_df=0.9, stop_words='english')
tfidf_matr_train = vectorizer.fit_transform(df_train['text_cleaned'])
tfidf_matr_test = vectorizer.transform(df_test['text_cleaned'])
tfidf_matr_train.shape, tfidf_matr_test.shape

((100, 31072), (50, 31072))

In [79]:
X_train = tfidf_matr_train
X_test = tfidf_matr_test
label_encoder = LabelEncoder()
y_train = label_encoder.fit_transform(df_train['author'])
y_test = label_encoder.transform(df_test['author'])

In [80]:
model_1 = LogisticRegression(max_iter=5000, random_state=5)

In [81]:
model_1.fit(X_train, y_train)

In [82]:
y_pred = model_1.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
accuracy

0.52

In [27]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.50      1.00      0.67         1
           1       0.00      0.00      0.00         1
           6       1.00      1.00      1.00         1
           7       0.50      1.00      0.67         1
           9       1.00      1.00      1.00         1
          10       1.00      1.00      1.00         1
          11       0.00      0.00      0.00         1
          13       0.00      0.00      0.00         0
          15       0.00      0.00      0.00         0
          18       0.00      0.00      0.00         1
          20       0.00      0.00      0.00         1
          21       0.50      1.00      0.67         1
          24       0.00      0.00      0.00         1
          27       1.00      1.00      1.00         1
          28       1.00      1.00      1.00         1
          30       1.00      1.00      1.00         1
          31       0.00      0.00      0.00         0
          32       0.00    

  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))
  _warn_prf(average, modifier, f"{metric.capitalize()} is", len(result))


### Добавим масштабирование



In [83]:
pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer(stop_words='english')),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', LogisticRegression(max_iter=5000, random_state=5))
])

In [84]:
pipeline.fit(df_train['text_cleaned'], y_train)

In [85]:
y_pred_scaled = pipeline.predict(df_test['text_cleaned'])
accuracy_scaled = accuracy_score(y_test, y_pred_scaled)
accuracy_scaled

0.54

Попробуем с неочищенным текстом, без удаления стоп слов

In [52]:
pipeline_2 = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('scaler', StandardScaler(with_mean=False)),
    ('classifier', LogisticRegression(max_iter=10000))
])

In [53]:
pipeline_2.fit(df_train['text'], y_train)

In [55]:
y_pred_scaled_2 = pipeline_2.predict(df_test['text'])
accuracy_scaled_2 = accuracy_score(y_test, y_pred_scaled_2)
accuracy_scaled_2

0.62

Поменяла способ масштабирования и поигралась с параметрами tf-idf

In [60]:
pipeline_3 = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),  # Преобразую разреженную марицу в плотную
    ('scaler', MinMaxScaler()),
    ('classifier', LogisticRegression(max_iter=5000, random_state=5))
])

In [61]:
pipeline_3.fit(df_train['text'], y_train)

In [62]:
y_pred_scaled_3 = pipeline_3.predict(df_test['text'])
accuracy_scaled_3 = accuracy_score(y_test, y_pred_scaled_3)
accuracy_scaled_3

0.7

Прибавим к tf-idf числовые признаки

In [101]:
text_column = 'text_cleaned'
numeric_columns = df_train.select_dtypes(include='int').columns

text_transformer = Pipeline([
    ('vectorizer', TfidfVectorizer(max_features=5000, stop_words='english')),
    ('to_dense', FunctionTransformer(lambda x: x.toarray(), accept_sparse=True)),
    ('scaler', MinMaxScaler())
])

numeric_transformer = Pipeline([
    ('scaler', StandardScaler())
])

preprocessor = ColumnTransformer(
    transformers=[
        ('text', text_transformer, text_column),
        ('num', numeric_transformer, numeric_columns)
    ]
)

pipeline_4 = Pipeline([
    ('preprocessor', preprocessor),
    ('classifier', LogisticRegression(max_iter=5000, random_state=5))
])

In [102]:
pipeline_4.fit(df_train, y_train)

In [103]:
y_pred_4 = pipeline_4.predict(df_test)

In [104]:
accuracy_4 = accuracy_score(y_test, y_pred_4)
accuracy_4

0.64

##Пробую sentence-BERT

In [63]:
!pip install sentence-transformers
from sentence_transformers import SentenceTransformer



  from tqdm.autonotebook import tqdm, trange


In [64]:
model = SentenceTransformer('all-MiniLM-L6-v2')

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.7k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [86]:
df_train['embeddings'] = df_train['text_cleaned'].apply(lambda x: model.encode(x))

In [87]:
df_test['embeddings'] = df_test['text_cleaned'].apply(lambda x: model.encode(x))

In [94]:
X_train_bert = np.vstack(df_train['embeddings'].values)
X_test_bert = np.vstack(df_test['embeddings'].values)

In [90]:
model_bert = LogisticRegression(max_iter=5000, random_state=5)

In [95]:
model_bert.fit(X_train_bert, y_train)

In [96]:
y_pred_bert = model_bert.predict(X_test_bert)

accuracy_bert = accuracy_score(y_test, y_pred_bert)
accuracy_bert

0.54

Метрики даже ухудшились.  
В итоге лучшее качество было с использованием tf-idf и minmaxscaler.  

У меня не получилось определить тональность текстов ни с помощью VADER ни с помощью BERT, тк ячейка кода крутилась слишком долго.