In [1]:
import pandas as pd
import re

from sklearn.decomposition import PCA, TruncatedSVD 
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier

from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import wordnet
from nltk.corpus import stopwords
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Daria\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [2]:
from gensim.sklearn_api import W2VTransformer
from gensim.models import Word2Vec
from gensim.test.utils import common_texts

In [3]:
df = pd.read_csv("processed_external_new.csv")[['text', 'y']]
df

Unnamed: 0,text,y
0,дворник надо тоже уничтожать,1
1,мой старший неделя шипеть не принима...,0
2,полностью с вы согласный,0
3,хоть нога вверх ничто не изменяться,0
4,а что значить левый ребенок,0
...,...,...
390016,но не каждый хотеть что то исправл...,1
390017,скучать так только юзернейм вправлять ...,1
390018,вот и в школа в гавно это идти ...,1
390019,rt юзернейм юзернейм тауриэль не гру...,1


In [5]:
pulse = pd.read_csv("pulse_processed.csv")[['message', 'y']]
pulse.columns = ['text',  'y']
# df = pd.concat([df, pulse])
df = pulse

In [6]:
text = [str(t).split() for t in df['text']]
w2v = Word2Vec(text, size=10, seed=0, workers=10)

In [205]:
w2v.train(text, total_examples=len(text), epochs=100)

(441074507, 586083900)

In [214]:
w2v.wv.most_similar('пидорас', topn=10)

[('пидарас', 0.9854263067245483),
 ('пидор', 0.9852038621902466),
 ('провительство', 0.9812092185020447),
 ('прихвостень', 0.9758775234222412),
 ('проститутка', 0.9749099612236023),
 ('пидар', 0.9737890958786011),
 ('единорос', 0.9728310108184814),
 ('пидарюга', 0.9631924629211426),
 ('стадо', 0.9605698585510254),
 ('холоп', 0.9604759216308594)]

In [207]:
dct = dict()
for w in w2v.wv.vocab.keys():
    dct[w] = list(float(x) for x in w2v[w])

  This is separate from the ipykernel package so we can avoid doing imports until


In [215]:
import json  

with open("w2v10.json", "w") as outfile: 
    json.dump(dct, outfile)

In [208]:
import numpy as np


pulse = pd.read_csv("pulse_processed.csv")
pulse = pulse.drop(columns=['text', 'label', 'Unnamed: 0', 'Unnamed: 0.1'])

def w2vtext(text):
    result = np.array([0.0] * 10)
    n_elements = 0
    if type(text) == float:
        print(text)
        text = str(text)
    for w in text.split():
        if w in dct.keys():
            result += np.array(dct[w]).astype(float)
            n_elements += 1
    if n_elements > 0:
        result /= n_elements
    return list(result)


def transform_dataset(data):
    X = []
    y = []
    cols = ['n_words', 'caps_lock', 'mentions', 'actions', 'length', 'message_grammar', 'pr1', 'pr2', 'pr3', 'pr4', 'pr5', 'pr6']
    for i in range(len(data)):
        row = [data[c].iloc[i] for c in cols]
        row += w2vtext(data['message'].iloc[i])
        X.append(row)
        y.append(data['y'].iloc[i])
    return np.array(X), np.array(y)
    
pulse = pulse.dropna()
X, y = transform_dataset(pulse)

In [164]:
def test_model(model, X, y, k=5):
    skf = StratifiedKFold(n_splits=k, shuffle=True)
    score = []
    roc_auc = []
    train_score = []
    train_roc_auc = []
    for train_index, test_index in skf.split(X, y):
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]
        model.fit(X_train, y_train)
        preds = model.predict(X_test)
        train_preds = model.predict(X_train)
        score.append(f1_score(y_test, preds))
        roc_auc.append(roc_auc_score(y_test, preds))
        train_score.append(f1_score(y_train, train_preds))
        train_roc_auc.append(roc_auc_score(y_train, train_preds))
        
    return score, roc_auc, train_score, train_roc_auc

In [158]:
X

array([[ 3.        ,  0.33333333,  0.        , ...,  1.26353675,
        -0.36339507, -0.90562579],
       [ 9.        ,  0.11111111,  0.        , ..., -0.27082732,
        -0.38239955, -0.34172625],
       [ 4.        ,  0.25      ,  0.        , ...,  0.7785173 ,
         0.12502886, -1.15698469],
       ...,
       [18.        ,  0.22222222,  0.        , ..., -0.38323065,
         0.73010988, -0.33414391],
       [21.        ,  0.0952381 ,  0.        , ..., -0.77521807,
         0.33600673, -1.11845462],
       [32.        ,  0.0625    ,  0.        , ..., -0.9756875 ,
        -0.32090214, -0.99223099]])

In [160]:
np.where(np.isnan(np.array(X)))

(array([], dtype=int64), array([], dtype=int64))

In [165]:
test_model(RandomForestClassifier(), X, y)

([0.2857142857142857,
  0.28975265017667845,
  0.2518518518518518,
  0.296028880866426,
  0.30927835051546393],
 [0.5865143676735828,
  0.5888664118835861,
  0.5739637641841043,
  0.5893146413770868,
  0.5973395220458139],
 [0.9994520547945205,
  0.9994520547945205,
  0.9989035087719299,
  0.9989035087719299,
  1.0],
 [0.9999813237711042,
  0.9994523548740416,
  0.9989047097480832,
  0.9989047097480832,
  1.0])

Вадим просил какую-то хрень:

In [7]:
text = [str(t) for t in df['text']]
text = ' '.join(text).replace('   ', ' ').replace('  ', ' ').split(' ')
text = set(text)

In [8]:
len(text)

20833

In [9]:
word_id = dict()
for t, i in enumerate(text):
    word_id[i] = t
    
list(word_id.keys())[0]

'неофициально'

In [10]:
from collections import defaultdict
w = defaultdict(int)
p = defaultdict(int)
pairwise_w = defaultdict(lambda: defaultdict(int))
pairwise_p = defaultdict(lambda: defaultdict(int))
w_over = 0

PMI = defaultdict(lambda: defaultdict(int))

In [11]:
pairwise_p

defaultdict(<function __main__.<lambda>()>, {})

In [13]:
import numpy as np

length = 3

for kkk, t in enumerate(df['text']):
    t = str(t)
    t = t.replace('  ', ' ').replace('  ', ' ').replace('  ', ' ')
    left = 0
    right = left + 3
    
    while right < len(t.split(' ')):
        w_over += 1
        window = t.split(' ')[left:right]
        
#         print(window)
        
        for i in range(len(window)):
            w[window[i]] += 1
            for j in range(len(window)):
                if i != j:
                    pairwise_w[window[i]][window[j]] += 1
                    pairwise_w[window[j]][window[i]] += 1
        
        right += 1
        left += 1
        
    if kkk % 10000 == 0:
        print(kkk)

for i in w:
    p[i] = w[i] / w_over

for i in w:
    for j in pairwise_w[i]:
        pairwise_p[i][j] = pairwise_w[i][j] / w_over
        PMI[i][j] = np.log(pairwise_p[i][j] / (p[i] * p[j]))

0
10000
20000
30000


In [14]:
edges = []
pmis = []

for i in PMI:
    for j in PMI[i]:
        if PMI[i][j] > 0:
            edges.append((word_id[i], word_id[j]))
            pmis.append(max(PMI[i][j],0))

In [15]:
id_word = dict()
for word in word_id:
    id_word[word_id[word]] = word

In [16]:
id_word

{0: 'неофициально',
 1: 'счесть',
 2: 'mercado',
 3: 'участвовать',
 4: 'позитивный',
 5: 'следующий',
 6: 'носик',
 7: 'физ',
 8: 'возвращаться',
 9: 'приглашать',
 10: 'msc',
 11: 'угрюмый',
 12: 'отторовывать',
 13: 'недогревать',
 14: 'фиксанутта',
 15: 'подхалим',
 16: 'перекур',
 17: 'обрызгивать',
 18: 'средст',
 19: 'неприятный',
 20: 'buterzone',
 21: 'states',
 22: 'бассейн',
 23: 'поправочка',
 24: 'заграничный',
 25: 'компостный',
 26: 'оксана',
 27: 'противоречие',
 28: 'еврооблиг',
 29: 'пометка',
 30: 'поровну',
 31: 'amd',
 32: 'пускаться',
 33: 'lithium',
 34: 'хребтина',
 35: 'lrn',
 36: 'айтишный',
 37: 'сплит',
 38: 'изрядно',
 39: 'крыться',
 40: 'шорты',
 41: 'событие',
 42: 'халтурка',
 43: 'юрисдикция',
 44: 'отираться',
 45: 'химиотерапия',
 46: 'глобализация',
 47: 'смотря',
 48: 'предпоследний',
 49: 'познакомиться',
 50: 'dcf',
 51: 'закупишся',
 52: 'числиться',
 53: 'программировать',
 54: 'опускать',
 55: 'гудеть',
 56: 'коммуникация',
 57: 'рапорт',
 58:

In [18]:
import json

with open("id_word_small.json", "w") as outfile: 
    json.dump(id_word, outfile)
    
with open("word_id_small.json", "w") as outfile: 
    json.dump(word_id, outfile)

In [19]:
with open("edges_small.txt", "w") as f:
    for edge in edges:
        f.write(str(edge[0]) + " " + str(edge[1]) + "\n")

In [20]:
with open("pmi_small.txt", "w") as f:
    for i in pmis:
        f.write(str(i) + "\n")

In [None]:
def prepare_preproc_and_features(pulse, pulse_data=False):

    mystem = Mystem()
    
    def remove_mentions(s: str):
        return ' '.join(i if not i.startswith('@') else 'юзернейм' for i in s.split(' '))


    def remove_stocks(s: str):
        if not pulse_data:
            return s
        return ' '.join(i if not i.startswith('{$') else 'стокнейм' for i in s.split(' '))


    def get_caps_feature(text):
        upper = re.sub('[^А-Я]+', '', text)
        return len(upper)


    def get_actions(text):
        return text.count('стокнейм')


    def get_mentions(text):
        return text.count('юзернейм')


    def text_preprocessing(text):
        if type(text) == str:
            text = re.sub(re.compile('<.*?>'), '', text)
            text = re.sub('[^A-Za-zА-Яа-я]+', ' ', text)
            text = text.lower()
        else: 
            text = ''
        return text

    
    def spellcheck_message(s):
        return spellcheck_model([s])[0]
    
    def compare_messages(before, after):
        return damerauLevenshtein(before, after, similarity=True)

    def lemmatizer(x):
        return " ".join(mystem.lemmatize(x)).strip()


    pronomens = {
        'pr1': ['я', 'мой', 'меня'],
        'pr2': ['ты', 'твой', 'тебя'],
        'pr3': ['себя'],
        'pr4': ['мы', 'наш', 'нас'],
        'pr5': ['вы', 'ваш', 'вас'],
        'pr6': ['он', 'она', 'они', 'оно', 'её', 'его', 'их']
    }

    def compute_pronomens(text):
        result = []
        words = text.split()
        for key in pronomens.keys():
            pr_count = 0
            for word in pronomens[key]:
                pr_count += text.count(word)
            result.append(pr_count)
        return result


    def count_words(text):
        return len(text.split())

    # Замена имён и акций, добавить замену ссылок и смайлов на их названия, подсчёт количества символов
    pulse['n_words'] = pulse['message'].apply(count_words)
    pulse['message'] = pulse['message'].apply(remove_mentions).apply(remove_stocks)

    pulse['caps_lock'] = pulse['message'].apply(get_caps_feature)
    pulse['mentions'] = pulse['message'].apply(get_mentions)
    pulse['actions'] = pulse['message'].apply(get_actions)
    # Добавить ссылки и смайлики

    # Удаление лишних символов и вычисление длины строки
    pulse['message'] = pulse['message'].apply(text_preprocessing)
    pulse['length'] = pulse['message'].str.len()

    
    pulse['message_no_spellcheck'] = pulse['message']
    # Здесь добавить исправление грамотности и сравнение грамотных сообщений с предыдущими
    if pulse_data:
        pulse['message'] = pulse['message'].apply(spellcheck_message)
        
    pulse['message_grammar'] = pulse.apply(lambda x: compare_messages(x['message_no_spellcheck'], x['message']), axis=1)

    # Леммы
    pulse['message'] = pulse['message'].apply(lemmatizer)

    # Удалить пустые
    pulse = pulse[pulse['n_words'] != 0]

    # Подсчёт местоимений
    pr_features = pd.DataFrame(pulse['message'].apply(compute_pronomens).to_list(), columns=pronomens.keys())
    for feature in pr_features.columns:
        pulse[feature] = list(pr_features[feature])
    
    # Деление на количество слов нужных колонок
    columns = ['mentions', 'actions', 'pr1', 'pr2', 'pr3', 'pr4', 'pr5', 'pr6', 'caps_lock']
    for c in columns:
        pulse[c] = pulse[c] / pulse['n_words']
    
    return pulse


# processed_df = prepare_preproc_and_features(pulse, pulse_data=False)
pulse_processed_df = prepare_preproc_and_features(pulse, pulse_data=True)

In [267]:
edges

[(55357, 160831),
 (55357, 89735),
 (55357, 35019),
 (55357, 139057),
 (55357, 118609),
 (55357, 69149),
 (55357, 13484),
 (55357, 108574),
 (55357, 14778),
 (55357, 156307),
 (55357, 134116),
 (55357, 128454),
 (55357, 130458),
 (55357, 175799),
 (55357, 89984),
 (55357, 173357),
 (55357, 41091),
 (55357, 205209),
 (55357, 104350),
 (55357, 176664),
 (55357, 206446),
 (55357, 163709),
 (55357, 145483),
 (55357, 133365),
 (55357, 166953),
 (55357, 59642),
 (55357, 104627),
 (55357, 66064),
 (55357, 138805),
 (55357, 59684),
 (55357, 101834),
 (55357, 177193),
 (55357, 30185),
 (55357, 54046),
 (55357, 43990),
 (55357, 193088),
 (55357, 137252),
 (55357, 138659),
 (55357, 121690),
 (55357, 185929),
 (55357, 136501),
 (55357, 31596),
 (55357, 37551),
 (55357, 124740),
 (55357, 68999),
 (55357, 60001),
 (55357, 23800),
 (55357, 29932),
 (55357, 65881),
 (55357, 179408),
 (55357, 167183),
 (55357, 28114),
 (55357, 102509),
 (55357, 45202),
 (55357, 21816),
 (55357, 152069),
 (55357, 73310)

In [270]:
for i in edges:
    print(i)
    break

(55357, 160831)


In [260]:
PMI['мудак']['нарик']

6.2835350195401

In [261]:
PMI['нарик']['мудак']

6.2835350195401

In [220]:
for t in df['text']:
    print(t)
    break

дворник   надо   тоже   уничтожать
