In [1]:
import pandas as pd
import numpy as np

In [2]:
import nltk
from nltk.corpus import stopwords

from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import TruncatedSVD
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB, MultinomialNB
from sklearn.metrics import classification_report
from transformers import BertTokenizer, BertModel
import torch

In [3]:
tweet_train = pd.read_csv('train.csv')
tweet_test = pd.read_csv('test.csv')

In [4]:
tweet_train.shape

(31962, 3)

In [5]:
tweet_test.shape

(17197, 2)

подсчет слов

In [6]:
tweet_train['word_count'] = tweet_train['tweet'].apply(lambda x: len(str(x).split(' ')))

In [7]:
tweet_test['word_count'] = tweet_test['tweet'].apply(lambda x: len(str(x).split(' ')))

In [8]:
tweet_train.head()

Unnamed: 0,id,label,tweet,word_count
0,1,0,@user when a father is dysfunctional and is s...,21
1,2,0,@user @user thanks for #lyft credit i can't us...,22
2,3,0,bihday your majesty,5
3,4,0,#model i love u take with u all the time in ...,17
4,5,0,factsguide: society now #motivation,8


In [9]:
tweet_test.head()

Unnamed: 0,id,tweet,word_count
0,31963,#studiolife #aislife #requires #passion #dedic...,12
1,31964,@user #white #supremacists want everyone to s...,20
2,31965,safe ways to heal your #acne!! #altwaystohe...,15
3,31966,is the hp and the cursed child book up for res...,24
4,31967,"3rd #bihday to my amazing, hilarious #nephew...",18


число символов в строке

In [10]:
tweet_train['char_count'] = tweet_train['tweet'].str.len() #с пробелами
tweet_test['char_count'] = tweet_test['tweet'].str.len()

нужно избавиться от мелких слов из-за высокой значимости в них - stop слов с помощью nltk

In [11]:
nltk.download('stopwords')
set(stopwords.words('english'))

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\A\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


{'a',
 'about',
 'above',
 'after',
 'again',
 'against',
 'ain',
 'all',
 'am',
 'an',
 'and',
 'any',
 'are',
 'aren',
 "aren't",
 'as',
 'at',
 'be',
 'because',
 'been',
 'before',
 'being',
 'below',
 'between',
 'both',
 'but',
 'by',
 'can',
 'couldn',
 "couldn't",
 'd',
 'did',
 'didn',
 "didn't",
 'do',
 'does',
 'doesn',
 "doesn't",
 'doing',
 'don',
 "don't",
 'down',
 'during',
 'each',
 'few',
 'for',
 'from',
 'further',
 'had',
 'hadn',
 "hadn't",
 'has',
 'hasn',
 "hasn't",
 'have',
 'haven',
 "haven't",
 'having',
 'he',
 "he'd",
 "he'll",
 "he's",
 'her',
 'here',
 'hers',
 'herself',
 'him',
 'himself',
 'his',
 'how',
 'i',
 "i'd",
 "i'll",
 "i'm",
 "i've",
 'if',
 'in',
 'into',
 'is',
 'isn',
 "isn't",
 'it',
 "it'd",
 "it'll",
 "it's",
 'its',
 'itself',
 'just',
 'll',
 'm',
 'ma',
 'me',
 'mightn',
 "mightn't",
 'more',
 'most',
 'mustn',
 "mustn't",
 'my',
 'myself',
 'needn',
 "needn't",
 'no',
 'nor',
 'not',
 'now',
 'o',
 'of',
 'off',
 'on',
 'once',
 'on

In [12]:
stop = stopwords.words('english')

In [13]:
tweet_train['stopwords'] = tweet_train['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))
tweet_test['stopwords'] = tweet_test['tweet'].apply(lambda x: len([x for x in x.split() if x in stop]))

удаление стоп слов

In [14]:
tweet_train['tweet'] = tweet_train['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
tweet_test['tweet'] = tweet_test['tweet'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))

удаление цифр

In [15]:
tweet_train['tweet'] = tweet_train['tweet'].str.replace('\d+','', regex=True)
tweet_test['tweet'] = tweet_test['tweet'].str.replace('\d+','', regex=True)

надо удалить пунктуацию и лишние символы

In [16]:
tweet_train['tweet'] = tweet_train['tweet'].str.replace('\d+','', regex=True)
tweet_test['tweet'] = tweet_test['tweet'].str.replace('\d+','', regex=True)

In [17]:
tweet_train['tweet'] = tweet_train['tweet'].str.replace('[^\w\s]','', regex=True)
tweet_test['tweet'] = tweet_test['tweet'].str.replace('[^\w\s]','', regex=True)

In [18]:
tf1 = (tweet_train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(' '))).sum(axis=0).reset_index()
tf1.columns = ['words', 'tf']

  tf1 = (tweet_train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(' '))).sum(axis=0).reset_index()
  tf1 = (tweet_train['tweet'][1:2]).apply(lambda x: pd.value_counts(x.split(' '))).sum(axis=0).reset_index()


In [19]:
for i, word in enumerate(tf1['words']):
    tf1.loc[i, 'idf'] = np.log(tweet_train.shape[0]/(len(tweet_train[tweet_train['tweet'].str.contains(word)])))

In [20]:
tf1['tfidf'] = tf1['tf'] * tf1['idf']
tf1

Unnamed: 0,words,tf,idf,tfidf
0,user,2,1.054454,2.108909
1,thanks,1,4.597751,4.597751
2,lyft,1,8.762865,8.762865
3,credit,1,7.327781,7.327781
4,cant,1,3.538194,3.538194
5,use,1,1.005985,1.005985
6,cause,1,5.610129,5.610129
7,offer,1,6.522155,6.522155
8,wheelchair,1,9.273691,9.273691
9,vans,1,8.426393,8.426393


делю тренировочные данные на трейн и тест

In [21]:
X_full = tweet_train['tweet']
y_full = tweet_train['label']

In [22]:
X_train_text, X_val_text, y_train, y_val = train_test_split(
    X_full, y_full, test_size=0.2, random_state=42
)

TF-IDF

In [23]:
tfidf = TfidfVectorizer(max_features=1000, lowercase=True, analyzer='word', stop_words = 'english', ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train_text)
X_val_tfidf = tfidf.transform(X_val_text)

____________________________________________________________________________________________________________________________________________
Мультиномиальный Байес тк после TruncatedSVD генерятся отрицательные значения, поэтому делаю его до SVD

In [24]:
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_val_tfidf)

In [25]:
print(classification_report(y_val, y_pred, zero_division=1))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97      5937
           1       0.87      0.25      0.39       456

    accuracy                           0.94      6393
   macro avg       0.91      0.62      0.68      6393
weighted avg       0.94      0.94      0.93      6393



_______________________________________________________________________________________________________________________________________________________

In [26]:
svd = TruncatedSVD(n_components=100, random_state=42)
X_svd_train = svd.fit_transform(X_train_tfidf)
X_svd_val = svd.transform(X_val_tfidf)

In [27]:
models = {'Logistic_Regression':LogisticRegression(),
          'Random_Forest':RandomForestClassifier(),
          'Naive_Bayes':GaussianNB()}

In [28]:
for name, model in models.items():

    model.fit(X_svd_train, y_train)
    y_pred = model.predict(X_svd_val)

    print(f'\n--- TF-IDF + TruncatedSVD + {name} ---')
    print(classification_report(y_val, y_pred, zero_division=1))


--- TF-IDF + TruncatedSVD + Logistic_Regression ---
              precision    recall  f1-score   support

           0       0.94      1.00      0.97      5937
           1       0.81      0.18      0.30       456

    accuracy                           0.94      6393
   macro avg       0.87      0.59      0.63      6393
weighted avg       0.93      0.94      0.92      6393


--- TF-IDF + TruncatedSVD + Random_Forest ---
              precision    recall  f1-score   support

           0       0.95      0.99      0.97      5937
           1       0.75      0.30      0.43       456

    accuracy                           0.94      6393
   macro avg       0.85      0.65      0.70      6393
weighted avg       0.93      0.94      0.93      6393


--- TF-IDF + TruncatedSVD + Naive_Bayes ---
              precision    recall  f1-score   support

           0       0.98      0.59      0.74      5937
           1       0.14      0.87      0.24       456

    accuracy                         

Далее я пробовал реализовать эмбендинг взвешенных по attention используя сначала сберовскую нейросеть, после нашел как использовать Bert 

эмбендинг взвешенных по attention

In [None]:
sentences = tweet_train['tweet'].tolist()

In [None]:
sentences

['user father dysfunctional selfish drags kids dysfunction run',
 'user user thanks lyft credit cant use cause offer wheelchair vans pdx disapointed getthanked',
 'bihday majesty',
 'model love u take u time urð ðððð ððð',
 'factsguide society motivation',
 ' huge fan fare big talking leave chaos pay disputes get there allshowandnogo',
 'user camping tomorrow user user user user user user user dannyâ',
 'next school year year examsð cant think ð school exams hate imagine actorslife revolutionschool girl',
 'won love land allin cavs champions cleveland clevelandcavaliers â',
 'user user welcome  gr ',
 'â ireland consumer price index mom climbed previous   may blog silver gold forex',
 'selfish orlando standwithorlando pulseshooting orlandoshooting biggerproblems selfish heabreaking values love ',
 'get see daddy today days gettingfed',
 'user cnn calls michigan middle school build wall chant  tcot',
 'comment australia opkillingbay seashepherd helpcovedolphins thecove helpcovedolphins'

Mean Pooling - Take attention mask into account for correct averaging

In [None]:
def mean_pooling(model_output, attention_mask):
    token_embeddings = model_output[0] #First element of model_output contains all token embeddings
    input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
    sum_mask = torch.clamp(input_mask_expanded.sum(1), min=1e-9)
    return sum_embeddings / sum_mask

Load AutoModel from huggingface model repository

In [None]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
#Tokenize sentences
encoded_input = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')

In [None]:
#get the embeddings
with torch.no_grad():
    outputs = model(**encoded_input)

sentence_embeddings = mean_pooling(outputs, encoded_input['attention_mask'])

print('Sentence Embeddings: \n', sentence_embeddings)