In [1]:
import numpy as np
import pandas as pd
import nltk

In [333]:
data_df = pd.read_csv('news.csv')
data_df.head(10)

Unnamed: 0,text,label
0,Here are Thursday's biggest analyst calls: App...,0
1,Buy Las Vegas Sands as travel to Singapore bui...,0
2,"Piper Sandler downgrades DocuSign to sell, cit...",0
3,"Analysts react to Tesla's latest earnings, bre...",0
4,Netflix and its peers are set for a ‘return to...,0
5,Barclays believes earnings for these underperf...,0
6,"Bernstein upgrades Alibaba, says shares can ra...",0
7,"Analysts react to Netflix's strong quarter, wi...",0
8,Buy Chevron as shares look attractive at these...,0
9,Morgan Stanley says these global stocks are se...,0


In [334]:
total_nulls = data_df[data_df.text.str.strip() == ''].shape[0]
print("Порожні документи:", total_nulls)

Порожні документи: 0


In [335]:
data_df.shape

(16990, 2)

In [336]:
import re

wpt = nltk.WordPunctTokenizer()
stop_words = nltk.corpus.stopwords.words('english')


def preproc_doc(doc):
    doc = re.sub(r'[^a-zA-Z\s]', '', doc, re.I | re.A)
    doc = re.sub(r'http\S+', '', doc)
    doc = doc.lower()
    doc = doc.strip()
    tokens = wpt.tokenize(doc)
    filtered_tokens = [token for token in tokens if token not in stop_words]
    doc = ' '.join(filtered_tokens)
    return doc


preproc_corpus = np.vectorize(preproc_doc)
p_corpus = preproc_corpus(data_df['text'])
data_df['clean text'] = p_corpus
data_df

Unnamed: 0,text,label,clean text
0,Here are Thursday's biggest analyst calls: App...,0,thursdays biggest analyst calls apple amazon t...
1,Buy Las Vegas Sands as travel to Singapore bui...,0,buy las vegas sands travel singapore builds we...
2,"Piper Sandler downgrades DocuSign to sell, cit...",0,piper sandler downgrades docusign sell citing ...
3,"Analysts react to Tesla's latest earnings, bre...",0,analysts react teslas latest earnings break wh...
4,Netflix and its peers are set for a ‘return to...,0,netflix peers set return growth analysts say g...
...,...,...,...
16985,KfW credit line for Uniper could be raised to ...,3,kfw credit line uniper could raised bln eur ha...
16986,KfW credit line for Uniper could be raised to ...,3,kfw credit line uniper could raised bln eur ha...
16987,Russian https://t.co/R0iPhyo5p7 sells 1 bln r...,3,russian sells bln roubles oneyear repo auction
16988,Global ESG bond issuance posts H1 dip as supra...,3,global esg bond issuance posts h dip supranati...


In [337]:
total_nulls = data_df[data_df.text.str.strip() == ''].shape[0]
print("Порожні документи:", total_nulls)

Порожні документи: 0


In [338]:
from sklearn.model_selection import train_test_split

train_text, test_text, train_label, test_label = train_test_split(np.array(data_df['clean text']),
                                                                  np.array(data_df['label']), test_size=0.3,
                                                                  random_state=0)
train_text.shape, test_text.shape

((11893,), (5097,))

In [339]:
from collections import Counter

trd = dict(Counter(train_label))
tsd = dict(Counter(test_label))

(pd.DataFrame([[key, trd[key], tsd[key]] for key in trd], columns=['label', 'train', 'test']).sort_values(by=['label'],
                                                                                                          ascending=True))

Unnamed: 0,label,train,test
9,0,174,81
8,1,584,253
3,2,2533,1012
13,3,221,100
5,4,256,103
10,5,705,282
11,6,380,144
4,7,410,214
17,8,114,52
0,9,1082,475


In [340]:
from gensim.models import word2vec

# wpt = nltk.WordPunctTokenizer()
tokenized_train = [wpt.tokenize(document) for document in train_text]
tokenized_test = [wpt.tokenize(document) for document in test_text]

w2v_num_features = 150
w2v_model = word2vec.Word2Vec(tokenized_train, vector_size=w2v_num_features, window=30, min_count=7, sample=1e-3)

In [341]:
def average_word_vectors(words, model, vocabulary, num_features):
    feature_vector = np.zeros((num_features,), dtype="float64")
    n_words = 0
    for word in words:
        if word in vocabulary:
            n_words = n_words + 1.
            feature_vector = np.add(feature_vector, model.wv[word])
    if n_words:
        feature_vector = np.divide(feature_vector, n_words)
    return feature_vector

In [342]:
def document_vectorize(corpus, model, num_features):
    vocabulary = set(model.wv.index_to_key)
    features = [average_word_vectors(tokenized_sentence, model, vocabulary, num_features) for tokenized_sentence in
                corpus]
    return np.array(features)

In [343]:
avg_wv_train_features = document_vectorize(corpus=tokenized_train, model=w2v_model, num_features=w2v_num_features)
avg_wv_test_features = document_vectorize(corpus=tokenized_test, model=w2v_model, num_features=w2v_num_features)
print('Word2Vec model:> Train features shape:', avg_wv_train_features.shape,
      'Test features shape:', avg_wv_test_features.shape)

Word2Vec model:> Train features shape: (11893, 150) Test features shape: (5097, 150)


#### Градієнтний бустинг

In [352]:
from sklearn.model_selection import cross_val_score
from sklearn.ensemble import GradientBoostingClassifier

gbc = GradientBoostingClassifier(n_estimators=5, random_state=0)
gbc.fit(avg_wv_train_features, train_label)
gbc_bow_scores = cross_val_score(gbc, avg_wv_train_features, train_label, cv=5)
gbc_bow_mean_score = np.mean(gbc_bow_scores)

In [353]:
print('Accuracy (5-fold):', gbc_bow_scores)
print('Mean Accuracy:', gbc_bow_mean_score)
svm_bow_test_score = gbc.score(avg_wv_test_features, test_label)
print('Test Accuracy:', svm_bow_test_score)

Accuracy (5-fold): [0.48844052 0.48507776 0.47078604 0.48149706 0.46888141]
Mean Accuracy: 0.47893655977043315
Test Accuracy: 0.45830880910339417


#### Опорні вектори

In [354]:
from sklearn import svm

clf = svm.SVC(kernel='linear', C=1.0)
clf.fit(avg_wv_train_features, train_label)

svm_bow_scores = cross_val_score(clf, avg_wv_train_features, train_label, cv=5)
svm_bow_mean_score = np.mean(svm_bow_scores)

In [355]:
print('Accuracy (5-fold):', svm_bow_scores)
print('Mean Accuracy:', svm_bow_mean_score)
svm_bow_test_score = clf.score(avg_wv_test_features, test_label)
print('Test Accuracy:', svm_bow_test_score)

Accuracy (5-fold): [0.52038672 0.5136612  0.49894914 0.50378469 0.49579479]
Mean Accuracy: 0.5065153072281255
Test Accuracy: 0.5000980969197567


#### GridSearchCV

In [367]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer

gb_pipeline = Pipeline([('tfidf', TfidfVectorizer()), ('clf', GradientBoostingClassifier())])
param_grid = {'tfidf__ngram_range': [(1, 1), (1, 2)],
    'clf__learning_rate': [0.1],
    'clf__n_estimators': [10, 20]}

gs_gb = GridSearchCV(gb_pipeline, param_grid, cv=5, verbose=2)
gs_gb = gs_gb.fit(train_text, train_label)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 1); total time=  15.7s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 1); total time=  15.9s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 1); total time=  15.6s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 1); total time=  15.5s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 1); total time=  15.5s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 2); total time=  54.7s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 2); total time=  54.7s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 2); total time=  54.5s
[CV] END clf__learning_rate=0.1, clf__n_estimators=10, tfidf__ngram_range=(1, 2); total time=  54.1s
[CV] END clf__learning_rate=0.1

In [368]:
score = gs_gb.score(test_text, test_label)
print('Test Accuracy :', score)

Test Accuracy : 0.6231116342946832
