In [1]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import numpy as np

In [25]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    data_prefix = '/content/drive/My Drive/NLP'
except ModuleNotFoundError:
    data_prefix = 'data'

In [26]:
raw_train = pd.read_parquet(os.path.join(data_prefix, 'train.parquet'))


# Разделение данных на тренировочную и валидационную выборки 

В курсе по NLP от ШАДа рекомендуют делать это разделение до начала предварительной обработки, чтобы нигде не накосячить, поэтому мы тоже так сделаем.

In [27]:
raw_target = raw_train['target']
raw_data   = raw_train.drop(['target'], axis = 1)

In [28]:
train_data, test_data, train_target, test_target = train_test_split(raw_data, raw_target,
                                                                    test_size=0.25, random_state=47)

# Предварительная обработка данных

In [29]:
import nltk
nltk.download('stopwords')

from data_preprocessing import remove_html_tags, stay_only_a_z, tokenize_by_word, remove_stop_words

[nltk_data] Downloading package stopwords to /home/droman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Препроцессинг для Body

In [30]:
def body_preprocess(text):
#     text = remove_html_tags(text)
    text = stay_only_a_z(text)
    text = tokenize_by_word(text)
#     text = remove_stop_words(text)
    return text.lower()

### Выполним препроцессинг для данных

In [31]:
train_data['Body'] = train_data['Body'].apply(body_preprocess)
test_data ['Body'] = test_data ['Body'].apply(body_preprocess)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Body'] = train_data['Body'].apply(body_preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data ['Body'] = test_data ['Body'].apply(body_preprocess)


## Препроцессинг для Title

In [32]:
def title_preprocessing(text):
    text = stay_only_a_z(text)
    text = tokenize_by_word(text)
    return text.lower()

In [33]:
train_data['Title'] = train_data['Title'].apply(title_preprocessing)
test_data ['Title'] = test_data ['Title'].apply(title_preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Title'] = train_data['Title'].apply(title_preprocessing)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data ['Title'] = test_data ['Title'].apply(title_preprocessing)


# Секция для определения исходных фичей

In [34]:
def get_raw_features(__data):
    __data['text'] = __data['Title'] + ' ' + __data['Body']
    return __data.drop(['Tags', 'Title', 'Body'], axis=1)

In [35]:
train_data = get_raw_features(train_data)
test_data = get_raw_features(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  __data['text'] = __data['Title'] + ' ' + __data['Body']


# Не забыть уйти от Pandas

In [36]:
train_data = train_data['text']
test_data = test_data['text']

train_target = train_target.values
test_target  = test_target.values

In [73]:
# words = train_target[0].split()

def get_sentences(series_data):
    list_of_lists_data = [i.split() for i in series_data]
    return list_of_lists_data

In [59]:
def get_w2v_str_representation_mean(text, model):
    words = text.split()
    words_representation = [model[word] for word in words if word in model.wv.vocab]
    return np.mean(words_representation, axis=0)

def get_w2v_str_representation_sum(text, model):
    words = text.split()
    words_representation = [model[word] for word in words if word in model.wv.vocab]
    return np.sum(words_representation, axis=0)



In [60]:
def w2v_vectorizer_sum(train, test, model):
    return ([get_w2v_str_representation_sum(text, model) for text in train],
            [get_w2v_str_representation_sum(text, model) for text in test])

def w2v_vectorizer_mean(train, test, model):
    return ([get_w2v_str_representation_mean(text, model) for text in train],
            [get_w2v_str_representation_mean(text, model) for text in test])

OWN W2V

In [74]:
from gensim.models import Word2Vec
# sentences = [["cat", "say", "meow"], ["dog", "say", "woof"]]
sentences = get_sentences(train_data.append(test_data))

try:
    model = Word2Vec.load("own_w2v")
except Exception:
    model = Word2Vec(min_count=1)
    model.build_vocab(sentences)  # prepare the model vocabulary
    model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)  # train word vectors

    model.save("own_w2v")

  model.train(sentences, total_examples=model.corpus_count, epochs=model.iter)  # train word vectors


(21341324, 25694735)

# Построение классификаторов


## 3. Логистическая регрессия

In [53]:
def lr_score(trainX, trainY, testX, testY):
    lr_classifier = LogisticRegression()
    lr_classifier.fit(trainX, trainY)
    return lr_classifier.score(testX, testY)

In [57]:
'''========================================================================================================================
========================================================================================================================
========================================================================================================================
'''



In [51]:
def visualize_results(score):
    print(score)

def base_pipeline(train_x, train_y, test_x, test_y,
                  vectorizer, scorer):
    vectorized_train_x, vectorized_test_x = vectorizer(train_x, test_x, model)
    score = scorer(vectorized_train_x, train_y, vectorized_test_x, test_y)
    visualize_results(score)

In [77]:
base_pipeline(train_data, train_target, test_data, test_target,
              w2v_vectorizer_mean, lr_score)

  words_representation = [model[word] for word in words if word in model.wv.vocab]


0.8088333333333333
