In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.datasets import fetch_20newsgroups
from collections import Counter
from sklearn.model_selection import StratifiedKFold, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import pandas as pd
import numpy as np
from sklearn.metrics import accuracy_score, classification_report
import re

In [2]:
train = fetch_20newsgroups()
test = fetch_20newsgroups(subset="test")

In [3]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# CountVectorizer

In [9]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression()),
])

In [10]:
params = dict(clf__C=[10, 1, 0.1, 0.01])
grid_search = GridSearchCV(pipeline, params, scoring="accuracy", cv=skf, n_jobs=-1)

In [11]:
grid_search.fit(train["data"], train["target"], )



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('bow',
                                        CountVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.int64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                    

In [12]:
grid_search.best_score_, grid_search.best_estimator_

(0.8927876966590066, Pipeline(memory=None,
          steps=[('bow',
                  CountVectorizer(analyzer='word', binary=False,
                                  decode_error='strict',
                                  dtype=<class 'numpy.int64'>, encoding='utf-8',
                                  input='content', lowercase=True, max_df=1.0,
                                  max_features=None, min_df=1,
                                  ngram_range=(1, 1), preprocessor=None,
                                  stop_words=None, strip_accents=None,
                                  token_pattern='(?u)\\b\\w\\w+\\b',
                                  tokenizer=None, vocabulary=None)),
                 ('clf',
                  LogisticRegression(C=1, class_weight=None, dual=False,
                                     fit_intercept=True, intercept_scaling=1,
                                     l1_ratio=None, max_iter=100,
                                     multi_class='warn', n_jobs

In [13]:
pipeline = Pipeline([
    ('bow', CountVectorizer()),
    ('clf', LogisticRegression(C=1)),
])
pipeline.fit(train["data"], train["target"])



Pipeline(memory=None,
         steps=[('bow',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('clf',
                 LogisticRegression(C=1, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=None,
                             

In [16]:
predictions = pipeline.predict(test["data"])
accuracy_score(test["target"], predictions)

0.8068242166755177

In [17]:

print(classification_report(test["target"], predictions, target_names=test["target_names"]))

                          precision    recall  f1-score   support

             alt.atheism       0.76      0.76      0.76       319
           comp.graphics       0.68      0.76      0.72       389
 comp.os.ms-windows.misc       0.72      0.69      0.70       394
comp.sys.ibm.pc.hardware       0.67      0.69      0.68       392
   comp.sys.mac.hardware       0.76      0.81      0.78       385
          comp.windows.x       0.83      0.70      0.76       395
            misc.forsale       0.81      0.90      0.86       390
               rec.autos       0.84      0.86      0.85       396
         rec.motorcycles       0.94      0.93      0.94       398
      rec.sport.baseball       0.86      0.90      0.88       397
        rec.sport.hockey       0.93      0.93      0.93       399
               sci.crypt       0.93      0.88      0.91       396
         sci.electronics       0.70      0.74      0.72       393
                 sci.med       0.86      0.78      0.82       396
         

# Tfidf

In [34]:
pipeline_tfidf = Pipeline([
    ('bow', TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2), 
                    vocabulary=None,
                    max_df=1.0, 
                    stop_words={'english'},
                    smooth_idf=True
                    )),
    ('clf', LogisticRegression()),
])

In [35]:
params = dict(clf__C=[10, 1, 0.1, 0.01])
grid_search_tfidf = GridSearchCV(pipeline_tfidf, params, scoring="accuracy", cv=skf, n_jobs=-1)

In [36]:
grid_search_tfidf.fit(train["data"], train["target"], )



GridSearchCV(cv=StratifiedKFold(n_splits=5, random_state=0, shuffle=True),
             error_score='raise-deprecating',
             estimator=Pipeline(memory=None,
                                steps=[('bow',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                  

In [37]:
grid_search_tfidf.best_score_, grid_search_tfidf.best_estimator_

(0.9192151316952448, Pipeline(memory=None,
          steps=[('bow',
                  TfidfVectorizer(analyzer='word', binary=False,
                                  decode_error='strict',
                                  dtype=<class 'numpy.float64'>,
                                  encoding='utf-8', input='content',
                                  lowercase=True, max_df=1.0, max_features=None,
                                  min_df=1, ngram_range=(1, 2), norm='l2',
                                  preprocessor=None, smooth_idf=True,
                                  stop_words={'english'}, strip_accents=None,
                                  sublinear_tf=False,
                                  token_pattern='(?u)\\b\\w\\w+\\b',
                                  tokenizer=None, use_idf=True,
                                  vocabulary=None)),
                 ('clf',
                  LogisticRegression(C=10, class_weight=None, dual=False,
                                 

In [38]:
pipeline_tfidf = Pipeline([
    ('bow', TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2), 
                    vocabulary=None,
                    max_df=1.0, 
                    smooth_idf=True
                    )),
    
    ('clf', LogisticRegression(C=10)),])
pipeline_tfidf.fit(train["data"], train["target"])



Pipeline(memory=None,
         steps=[('bow',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=1.0, max_features=None,
                                 min_df=1, ngram_range=(1, 2), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=None, strip_accents=None,
                                 sublinear_tf=False,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, use_idf=True,
                                 vocabulary=None)),
                ('clf',
                 LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,

In [39]:
predictions = pipeline_tfidf.predict(test["data"])
accuracy_score(test["target"], predictions)

0.846388741370154

In [None]:
pipeline_tfidf.

In [269]:
import gc
gc.collect()

117

# features engineering

In [210]:
# контрастные соединения
contrast_conj = set([
'alternatively','anyway','but','by contrast','differ from','elsewhere','even so','however','in contrast','in fact',
'in other respects','in spite of','in that respect','instead','nevertheless','on the contrary','on the other hand',
'rather','though','whereas','yet'])

# чтобы получить "чистоту" отзыва ~ показывает ту же тональность к обзору (~ 1) или изменение настроения (~ 0)
def purity(sentences):
    # получает полярность по всем предложениям
    polarities = np.array([TextBlob(x).sentiment.polarity for x in sentences])
    return polarities.sum() / np.abs(polarities).sum()

# uppercase pattern
uppercase_pattern = re.compile(r'(\b[0-9]*[A-Z]+[0-9]*[A-Z]+[0-9]*\b)')

# регулярное выражение для разделения отзыва на предложения, вы можете использовать метод из textblob: TextBlob(x).sentences_
sentence_splitter = re.compile('(?<!\w\.\w.)(?<![A-Z][a-z]\.)(?<![A-Z]\.)(?<=\!|\?|\.)\s')
# you can https://regex101.com/ for regex creation/checking, very convenient

In [84]:
Subject=train['data'][1].split('Subject:')[1].split('\n')[0]


In [100]:
positive_smiles = set([
":‑)",":)",":-]",":]",":-3",":3",":->",":>","8-)","8)",":-}",":}",":o)",":c)",":^)","=]","=)",":‑D",":D","8‑D","8D",
"x‑D","xD","X‑D","XD","=D","=3","B^D",":-))",";‑)",";)","*-)","*)",";‑]",";]",";^)",":‑,",";D",":‑P",":P","X‑P","XP",
"x‑p","xp",":‑p",":p",":‑Þ",":Þ",":‑þ",":þ",":‑b",":b","d:","=p",">:P", ":'‑)", ":')",  ":-*", ":*", ":×"
])
negative_smiles = set([
":‑(",":(",":‑c",":c",":‑<",":<",":‑[",":[",":-||",">:[",":{",":@",">:(","D‑':","D:<","D:","D8","D;","D=","DX",":‑/",
":/",":‑.",'>:\\', ">:/", ":\\", "=/" ,"=\\", ":L", "=L",":S",":‑|",":|","|‑O","<:‑|"
])

def get_token_level_features(texts, visualize=True):

    tdf = pd.DataFrame()
    tdf['text'] = texts # this is our review
    
    tdf['subject'] = tdf.text.apply(lambda s: s.split('Subject:')[1].split('\n')[0])
    tdf['imail_cnt'] = tdf.text.apply(lambda s: len([x for x in s.split('@')]))
 
    tdf['positive_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in positive_smiles]))
    tdf['negative_smiles'] = tdf.text.apply(lambda s: len([x for x in s.split() if x in negative_smiles]))
    
    if visualize:
        # this is used for visual clarity, return pd.DataFrame
        return tdf 
    else:
        # get correct (and sparse) representation of feature matrix F
        from scipy.sparse import csr_matrix 
        return csr_matrix(tdf[tdf.columns[2:]].values)
    

In [116]:
df_features=get_token_level_features(train['data'])
df_features[['subject']] = df_features[['subject']].fillna('NoSubject')
df_features.shape

(11314, 5)

In [3]:
t1=TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2), 
                    vocabulary=None,
                    max_df=1.0, 
                    smooth_idf=True
                    )

t2=TfidfVectorizer(
                    analyzer='word',
                    ngram_range=(1,2), 
                    vocabulary=None,
                    max_df=1.0, 
                    smooth_idf=True
                    )

In [4]:
core_fea=t1.fit_transform(train["data"])

In [140]:
subject_fea=t2.fit_transform(df_features["subject"])

In [123]:
core_fea

<11314x1181803 sparse matrix of type '<class 'numpy.float64'>'
	with 4696188 stored elements in Compressed Sparse Row format>

In [124]:
subject_fea

<11314x24774 sparse matrix of type '<class 'numpy.float64'>'
	with 113711 stored elements in Compressed Sparse Row format>

In [125]:
add_fea=get_token_level_features(train['data'], False)

In [187]:
add_fea

<11314x3 sparse matrix of type '<class 'numpy.int64'>'
	with 11762 stored elements in Compressed Sparse Row format>

In [127]:
from scipy.sparse import hstack
train_fea=hstack((core_fea, subject_fea, add_fea))

In [128]:
train_fea

<11314x1206580 sparse matrix of type '<class 'numpy.float64'>'
	with 4821661 stored elements in COOrdinate format>

In [131]:
logReg=LogisticRegression(C=10, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='warn', n_jobs=-1,
                                    penalty='l2', random_state=None,
                                    solver='warn', tol=0.0001, verbose=0,
                                    warm_start=False)

In [133]:
logReg.fit(train_fea, train["target"])



LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [174]:
df_features_test=get_token_level_features(test['data'])
df_features_test[['subject']] = df_features_test[['subject']].fillna('NoSubject')

In [5]:
test_core = t1.transform(test["data"])
test_subject_fea = t2.transform(df_features_test["subject"])
add_fea_test = get_token_level_features(test['data'], False)


In [191]:
test_features=hstack((test_core, test_subject_fea, add_fea_test))


In [193]:
predictions = logReg.predict(test_features)
accuracy_score(test["target"], predictions)

0.8355018587360595

In [8]:
import numpy as np
import pandas as pd
from hyperopt import hp, tpe, Trials
from hyperopt.fmin import fmin
import lightgbm as lgb
import hyperopt
from sklearn.model_selection import cross_val_score

In [9]:
from sklearn.model_selection import train_test_split

In [6]:
params = {
    'boosting_type': 'gbdt',
    'objective': 'multiclass',
    'learning_rate': 0.01,
    'num_class':20,
    'max_depth': 4,
    'num_leaves': 20,
    'verbose': -1}

In [383]:
lgb_model = lgbm.LGBMClassifier(
        **params
    )

In [10]:
X_train, X_test, y_train, y_test = train_test_split(core_fea, train["target"], test_size=0.33, random_state=42)

In [371]:
lgb_model.fit(X_train, y_train, eval_set=[(X_test, y_test)],verbose=10)

[100]	valid_0's multi_logloss: 0.589869


LGBMClassifier(boosting_type='gbdt', class_weight=None, colsample_bytree=1.0,
               importance_type='split', learning_rate=0.2, max_depth=5,
               min_child_samples=20, min_child_weight=0.001, min_split_gain=0.0,
               n_estimators=100, n_jobs=-1, num_leaves=10,
               objective='multiclass', random_state=None, reg_alpha=0.0,
               reg_lambda=0.0, silent=True, subsample=1.0,
               subsample_for_bin=200000, subsample_freq=0)

In [372]:
predictions =lgb_model.predict(test_core)
accuracy_score(test["target"], predictions)

0.7615507169410515

In [373]:
predictions =lgb_model.predict(X_test)
accuracy_score(y_test, predictions)

0.8355650776647028

In [8]:
d_train=lgb.Dataset(core_fea, label=train["target"], params={'verbose': -1})

In [9]:
cv_results = lgb.cv(params, d_train, num_boost_round=1000, nfold=5, 
                     early_stopping_rounds=40)

In [379]:
max(test["target"])

19

In [12]:
import xgboost as xgb

In [15]:
def objective(params):
    params = {
        'objective': 'mlogloss', 
        'max_depth': int(params['max_depth']),
        'gamma': "{:.3f}".format(params['gamma']),
        'colsample_bytree': '{:.3f}'.format(params['colsample_bytree']),
    }
    
    clf = xgb.XGBClassifier(
        n_estimators=250,
        learning_rate=0.05,
        n_jobs=4,
        **params
    )
    
    score = cross_val_score(clf, core_fea, train["target"], scoring='accuracy', cv=StratifiedKFold()).mean()
    print("Accuracy {:.3f} params {}".format(score, params))
    return score

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'colsample_bytree': hp.uniform('colsample_bytree', 0.3, 1.0),
    'gamma': hp.uniform('gamma', 0.0, 0.5),
}


In [16]:
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

  0%|                                                                                                                           | 0/10 [00:00<?, ?trial/s, best loss=?]





Accuracy 0.822 params {'max_depth': 7, 'gamma': '0.245', 'colsample_bytree': '0.671'}                                                                                  
 10%|█████████                                                                                  | 1/10 [2:59:06<26:51:54, 10746.09s/trial, best loss: 0.82190210037961]





Accuracy 0.839 params {'max_depth': 7, 'gamma': '0.183', 'colsample_bytree': '0.357'}                                                                                  
 20%|██████████████████▍                                                                         | 2/10 [4:40:47<20:47:02, 9352.82s/trial, best loss: 0.82190210037961]





Accuracy 0.826 params {'max_depth': 5, 'gamma': '0.328', 'colsample_bytree': '0.535'}                                                                                  
 30%|███████████████████████████▌                                                                | 3/10 [6:26:05<16:24:55, 8442.23s/trial, best loss: 0.82190210037961]





Accuracy 0.827 params {'max_depth': 8, 'gamma': '0.046', 'colsample_bytree': '0.531'}                                                                                  
 40%|████████████████████████████████████▊                                                       | 4/10 [8:46:29<14:03:40, 8436.73s/trial, best loss: 0.82190210037961]





Accuracy 0.823 params {'max_depth': 5, 'gamma': '0.456', 'colsample_bytree': '0.583'}                                                                                  
 50%|█████████████████████████████████████████████▌                                             | 5/10 [10:38:11<10:59:41, 7916.25s/trial, best loss: 0.82190210037961]





Accuracy 0.816 params {'max_depth': 6, 'gamma': '0.325', 'colsample_bytree': '0.972'}                                                                                  
 60%|█████████████████████████████████████████████████████▍                                   | 6/10 [13:56:25<10:07:18, 9109.72s/trial, best loss: 0.8156265032564001]





Accuracy 0.819 params {'max_depth': 4, 'gamma': '0.113', 'colsample_bytree': '0.791'}                                                                                  
 70%|██████████████████████████████████████████████████████████████▉                           | 7/10 [16:04:35<7:14:10, 8683.64s/trial, best loss: 0.8156265032564001]





Accuracy 0.800 params {'max_depth': 2, 'gamma': '0.120', 'colsample_bytree': '0.488'}                                                                                  
 80%|████████████████████████████████████████████████████████████████████████                  | 8/10 [17:14:25<4:04:31, 7335.55s/trial, best loss: 0.7998946732654342]





Accuracy 0.821 params {'max_depth': 3, 'gamma': '0.257', 'colsample_bytree': '0.312'}                                                                                  
 90%|█████████████████████████████████████████████████████████████████████████████████         | 9/10 [18:20:58<1:45:32, 6332.83s/trial, best loss: 0.7998946732654342]





Accuracy 0.813 params {'max_depth': 3, 'gamma': '0.032', 'colsample_bytree': '0.819'}                                                                                  
100%|███████████████████████████████████████████████████████████████████████████████████████████| 10/10 [20:16:28<00:00, 7298.83s/trial, best loss: 0.7998946732654342]


In [17]:
print("Hyperopt estimated optimum {}".format(best))

Hyperopt estimated optimum {'colsample_bytree': 0.4876012032299183, 'gamma': 0.11966423607931742, 'max_depth': 2.0}


In [37]:
params_best={
        'objective': 'multi:softmax', 'num_class': 20, 
        'max_depth': 7, 'gamma': 0.183, 'colsample_bytree': 0.357}

In [38]:
data_xgb=xgb.DMatrix(X_train, label=y_train)
valid_xgb=xgb.DMatrix(X_test, label=y_test)


In [39]:
xgb_model = xgb.train(params_best, data_xgb, num_boost_round=100, evals=[(data_xgb, 'train'), (valid_xgb, 'valid')])

[0]	train-mlogloss:1.95572	valid-mlogloss:2.03817
[1]	train-mlogloss:1.63448	valid-mlogloss:1.79574
[2]	train-mlogloss:1.39261	valid-mlogloss:1.61236
[3]	train-mlogloss:1.20784	valid-mlogloss:1.46787
[4]	train-mlogloss:1.05931	valid-mlogloss:1.35493
[5]	train-mlogloss:0.94170	valid-mlogloss:1.26359
[6]	train-mlogloss:0.84989	valid-mlogloss:1.18817
[7]	train-mlogloss:0.76977	valid-mlogloss:1.12874
[8]	train-mlogloss:0.70081	valid-mlogloss:1.08433
[9]	train-mlogloss:0.64087	valid-mlogloss:1.03942
[10]	train-mlogloss:0.58532	valid-mlogloss:1.00153
[11]	train-mlogloss:0.53813	valid-mlogloss:0.96786
[12]	train-mlogloss:0.49616	valid-mlogloss:0.94235
[13]	train-mlogloss:0.45837	valid-mlogloss:0.91686
[14]	train-mlogloss:0.42087	valid-mlogloss:0.88829
[15]	train-mlogloss:0.38652	valid-mlogloss:0.86445
[16]	train-mlogloss:0.35796	valid-mlogloss:0.84325
[17]	train-mlogloss:0.33237	valid-mlogloss:0.82477
[18]	train-mlogloss:0.31125	valid-mlogloss:0.80840
[19]	train-mlogloss:0.28955	valid-mloglos

In [41]:
test_xgb=xgb.DMatrix(test_core)

In [43]:
predictions =xgb_model.predict(test_xgb)
accuracy_score(test["target"], predictions)

0.7537174721189591

In [3]:
from catboost import Pool, CatBoostClassifier

In [10]:
def objective(params):
    params = {'iterations':1000,
    'learning_rate' :0.01,
    'random_strength' : 0.1,
    'depth':3,
    'loss_function' :'MultiClass',
    'eval_metric':'Accuracy',
    'leaf_estimation_method': 'Newton'
    }
    
    clf = CatBoostClassifier(
        **params
    )
    
    score = cross_val_score(clf, core_fea, train["target"], scoring='accuracy', cv=StratifiedKFold()).mean()
    print("Accuracy {:.3f} params {}".format(score, params))
    return score

space = {
    'max_depth': hp.quniform('max_depth', 2, 8, 1),
    'learning_rate': hp.uniform('learning_rate', 0.05, 1.0),
    'random_strength': hp.uniform('random_strength', 0.1, 0.5),
}

In [None]:
best = fmin(fn=objective,
            space=space,
            algo=tpe.suggest,
            max_evals=10)

  0%|                                                                                                                            | 0/5 [00:00<?, ?trial/s, best loss=?]




In [4]:
params = {'iterations':1000,
    'learning_rate' :0.01,
    'random_strength' : 0.1,
    'depth':3,
    'loss_function' :'MultiClass',
    'eval_metric':'Accuracy',
    'leaf_estimation_method': 'Newton'
    }
    
clf = CatBoostClassifier(
        **params
    )

In [None]:
train_pool = Pool(X_train, label=y_train)
valid_pool = Pool(X_test, label=y_test) 
test_pool = Pool(test_core) 

In [5]:
clf.fit(train_pool,eval_set=valid_pool)

NameError: name 'train_pool' is not defined

In [None]:
predictions =clf.predict(test_pool)
accuracy_score(test["target"], predictions)