In [1]:
import pandas as pd
import os
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import numpy as np

In [2]:
try:
    from google.colab import drive
    drive.mount('/content/drive')
    data_prefix = '/content/drive/My Drive/NLP'
except ModuleNotFoundError:
    data_prefix = 'data'

In [62]:
raw_train = pd.read_parquet(os.path.join(data_prefix, 'train.parquet'))


# Разделение данных на тренировочную и валидационную выборки 

В курсе по NLP от ШАДа рекомендуют делать это разделение до начала предварительной обработки, чтобы нигде не накосячить, поэтому мы тоже так сделаем.

In [9]:
raw_target = raw_train['target']
raw_data   = raw_train.drop(['target'], axis = 1)

In [12]:
train_data, test_data, train_target, test_target = train_test_split(raw_data, raw_target,
                                                                    test_size=0.25, random_state=47)

# Предварительная обработка данных

In [18]:
import nltk
nltk.download('stopwords')

from data_preprocessing import remove_html_tags, stay_only_a_z, tokenize_by_word, remove_stop_words

[nltk_data] Downloading package stopwords to /home/droman/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Препроцессинг для Body

In [24]:
def body_preprocess(text):
#     text = remove_html_tags(text)
    text = stay_only_a_z(text)
    text = tokenize_by_word(text)
#     text = remove_stop_words(text)
    return text.lower()

### Выполним препроцессинг для данных

In [26]:
train_data['Body'] = train_data['Body'].apply(body_preprocess)
test_data ['Body'] = test_data ['Body'].apply(body_preprocess)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Body'] = train_data['Body'].apply(body_preprocess)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data ['Body'] = test_data ['Body'].apply(body_preprocess)


## Препроцессинг для Title

In [28]:
def title_preprocessing(text):
    text = stay_only_a_z(text)
    text = tokenize_by_word(text)
    return text.lower()

In [29]:
train_data['Title'] = train_data['Title'].apply(title_preprocessing)
test_data ['Title'] = test_data ['Title'].apply(title_preprocessing)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  train_data['Title'] = train_data['Title'].apply(title_preprocessing)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  test_data ['Title'] = test_data ['Title'].apply(title_preprocessing)


# Секция для определения исходных фичей

In [31]:
def get_raw_features(__data):
    __data['text'] = __data['Title'] + ' ' + __data['Body']
    return __data.drop(['Tags', 'Title', 'Body'], axis=1)

In [32]:
train_data = get_raw_features(train_data)
test_data = get_raw_features(test_data)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  __data['text'] = __data['Title'] + ' ' + __data['Body']


# Не забыть уйти от Pandas

In [35]:
train_data = train_data['text']
test_data = test_data['text']

train_target = train_target.values
test_target  = test_target.values

## W2V

In [21]:
import gensim.downloader as api
word2vec_model300 = api.load('word2vec-google-news-300')
word2vec_model300['upload']

array([-0.08251953, -0.03393555, -0.05200195,  0.08691406,  0.04663086,
        0.10205078,  0.11523438,  0.14941406,  0.07617188, -0.02319336,
       -0.28515625,  0.06298828, -0.06591797,  0.08105469, -0.18261719,
        0.34765625,  0.25390625,  0.21386719, -0.03833008, -0.20800781,
       -0.01513672,  0.05859375, -0.31054688,  0.3125    , -0.046875  ,
       -0.21386719,  0.09960938, -0.140625  ,  0.13964844, -0.31445312,
       -0.25      , -0.00430298, -0.02282715, -0.15625   , -0.19921875,
       -0.33398438,  0.00476074, -0.20996094,  0.18164062,  0.16015625,
       -0.11425781,  0.23535156,  0.31445312,  0.15722656,  0.01086426,
       -0.2890625 , -0.08105469,  0.12988281,  0.0213623 , -0.0625    ,
       -0.17871094, -0.30859375,  0.26953125, -0.16308594, -0.00300598,
       -0.01782227, -0.05444336,  0.05200195, -0.20507812,  0.31445312,
        0.48828125, -0.16601562, -0.23828125, -0.16992188, -0.1484375 ,
        0.05541992, -0.19140625,  0.00189209,  0.26171875,  0.34

In [48]:
w2v_test_str = 'algorithmm input graph g output set of msts t begin tnull egedges for all vertices in g create a tree t having single vertex b add t to t end for repeat find an edge e e having minimum weight such that one end belongs to t t and the other end does not belongs to any of the trees in t add e to t until e null im stuck on the logic for the highlighted block ive used simple objects for vertexedge and tree and for their sets used array of objects'

In [40]:
def get_w2v_str_representation_mean(text, model):
    words = text.split()
    words_representation = [model[word] for word in words if word in model.vocab]
    return np.mean(words_representation, axis=0)

def get_w2v_str_representation_sum(text, model):
    words = text.split()
    words_representation = [model[word] for word in words if word in model.vocab]
    return np.sum(words_representation, axis=0)

In [None]:
# !curl https://zenodo.org/record/1199620/files/SO_vectors_200.bin?download=1 

In [46]:
from gensim.models.keyedvectors import KeyedVectors
so_w2v = KeyedVectors.load_word2vec_format("SO_vectors_200.bin", binary=True)

In [61]:
get_w2v_str_representation_sum(w2v_test_str, so_w2v)[:10]

array([  8.154484,  12.033579, -41.343773,  38.206593, -77.40206 ,
        46.77203 ,  28.464117,  15.215707, -20.109539,  -9.964755],
      dtype=float32)

In [41]:
def w2v_vectorizer_sum_news(train, test):
    return ([get_w2v_str_representation_sum(text, word2vec_model300) for text in train],
            [get_w2v_str_representation_sum(text, word2vec_model300) for text in test])

def w2v_vectorizer_mean_news(train, test):
    return ([get_w2v_str_representation_mean(text, word2vec_model300) for text in train],
            [get_w2v_str_representation_mean(text, word2vec_model300) for text in test])

def w2v_vectorizer_sum_so(train, test):
    return ([get_w2v_str_representation_sum(text, so_w2v) for text in train],
            [get_w2v_str_representation_sum(text, so_w2v) for text in test])

def w2v_vectorizer_mean_so(train, test):
    return ([get_w2v_str_representation_mean(text, so_w2v) for text in train],
            [get_w2v_str_representation_mean(text, so_w2v) for text in test])

# D2V

In [45]:
from gensim.models.doc2vec import Doc2Vec

model= Doc2Vec.load("./d2v/doc2vec.bin")


In [60]:

def d2v_vectorizer(train, test):
    return ([model.infer_vector([text]) for text in train],
            [model.infer_vector([text]) for text in test])

# Построение классификаторов


## 3. Логистическая регрессия

In [56]:
def lr_score(trainX, trainY, testX, testY):
    lr_classifier = LogisticRegression()
    lr_classifier.fit(trainX, trainY)
    return lr_classifier.score(testX, testY)

In [57]:
'''========================================================================================================================
========================================================================================================================
========================================================================================================================
'''



In [58]:
def visualize_results(score):
    print(score)

def base_pipeline(train_x, train_y, test_x, test_y,
                  vectorizer, scorer):
    vectorized_train_x, vectorized_test_x = vectorizer(train_x, test_x)
    score = scorer(vectorized_train_x, train_y, vectorized_test_x, test_y)
    visualize_results(score)


In [41]:
base_pipeline(train_data, train_target, test_data, test_target,
              bow, lr_score)

0.8834166666666666


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


In [65]:
base_pipeline(train_data, train_target, test_data, test_target,
              tf_idf, lr_score)

0.8795


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [64]:
base_pipeline(train_data, train_target, test_data, test_target,
              w2v_vectorizer_sum_so, lr_score)

0.7144166666666667


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [26]:
base_pipeline(train_data, train_target, test_data, test_target,
              w2v_vectorizer_mean_news, lr_score)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


0.7126666666666667


In [61]:
base_pipeline(train_data, train_target, test_data, test_target,
              d2v_vectorizer, lr_score)



0.3278333333333333
