In [19]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import train_test_split
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords

# чтение данных 

In [2]:
train = pd.read_csv('data/products_sentiment_train.tsv', sep='\t', header=None)
train.columns = ['text', 'y']
train, hold = train_test_split(train, test_size=0.3, stratify=train['y'])
train.head()

Unnamed: 0,text,y
1340,i treat the battery well and it has lasted .,1
108,"at first i had a problem with some smell , the...",1
747,"it 's a compact , attractive machine and was e...",1
597,the depth adjustment and bit changing is much ...,1
642,i simply love this feature .,1


In [3]:
test = pd.read_csv('data/products_sentiment_test.tsv', sep='\t')
test.head()

Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [4]:
print(train.shape)
print(hold.shape)
print(test.shape)

(1400, 2)
(600, 2)
(500, 2)


# Чистка

In [21]:
# подключение модулей
lemm_enabled = True
stem_enabled = True

## Удаляем пунктуацию

In [6]:
import string
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

train['text'] = train['text'].apply(remove_punctuation)
hold['text'] = hold['text'].apply(remove_punctuation)
test['text'] = test['text'].apply(remove_punctuation)

## Токенизируем

In [9]:

tokenizer = RegexpTokenizer(r'\w+')

def tokenize(text):
    return tokenizer.tokenize(text.lower())

train['text_tokenized'] = train['text'].apply(tokenize)
hold['text_tokenized'] = hold['text'].apply(tokenize)
test['text_tokenized'] = test['text'].apply(tokenize)

# Стоп слова

In [20]:
stopwords = stopwords.words('english')
    
def remove_stopwords(text_arr):
    return [w for w in text_arr if w not in stopwords]
    
train['text_tokenized'] = train['text_tokenized'].apply(remove_stopwords)
hold['text_tokenized'] = hold['text_tokenized'].apply(remove_stopwords)
test['text_tokenized'] = test['text_tokenized'].apply(remove_stopwords)

## лематизация

In [24]:

if lemm_enabled:
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    def word_lemmatizer(text):
        lem_text = [lemmatizer.lemmatize(i) for i in text]
        return lem_text
    
    train['text_tokenized'] = train['text_tokenized'].apply(word_lemmatizer)
    hold['text_tokenized'] = hold['text_tokenized'].apply(word_lemmatizer)
    test['text_tokenized'] = test['text_tokenized'].apply(word_lemmatizer)


## Stemming

In [25]:
from nltk.stem import PorterStemmer

In [26]:


if stem_enabled:
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()

    def word_stemmer(text):
        lem_text = [stemmer.stem(i) for i in text]
        return lem_text
    
    train['text_tokenized'] = train['text_tokenized'].apply(word_stemmer)
    hold['text_tokenized'] = hold['text_tokenized'].apply(word_stemmer)
    test['text_tokenized'] = test['text_tokenized'].apply(word_stemmer)


## Джоин массивов в предложения

In [27]:
train['text'] = train['text_tokenized'].apply(lambda x: " ".join(x))
hold['text'] = hold['text_tokenized'].apply(lambda x: " ".join(x))
test['text'] = test['text_tokenized'].apply(lambda x: " ".join(x))

In [28]:
print(train.shape)
print(hold.shape)
print(test.shape)

(1400, 3)
(600, 3)
(500, 3)


In [29]:
from ipywidgets import interact, interactive, interact_manual
import ipywidgets as ip

In [30]:
@interact
def showText(x=100):
    print(train.loc[x, 'text'])
    print(train.loc[x, 'y'] )

interactive(children=(IntSlider(value=100, description='x', max=300, min=-100), Output()), _dom_classes=('widg…

# Векторизация

## WordCount

In [31]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True, ngram_range=(1,2))

cv.fit(train['text'])
X_train_wc = cv.transform(train['text'])
X_hold_wc = cv.transform(hold['text'])
X_test_wc = cv.transform(test['text'])

## TF-IDF

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer()
tfv.fit(train['text'])
X_train_tfidf = tfv.transform(train['text'])
X_hold_tfidf = tfv.transform(hold['text'])
X_test_tfidf = tfv.transform(test['text'])

# Моделирование логрегом

In [33]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score

In [34]:
X_train = train['text']
y_train = train['y']

X_hold = hold['text']
y_hold = hold['y']

X_test = test['text']

### Логрег на WC

In [35]:
X_train_wc = cv.transform(train['text'])
X_hold_wc = cv.transform(hold['text'])
X_test_wc = cv.transform(test['text'])

In [36]:
clf = LogisticRegression()
parameters = {'C': [c/100 for c in range(1, 100, 1)],
             'penalty':['l1'],
             'solver':['liblinear']}
gs = GridSearchCV(clf, parameters, n_jobs=-1, cv=4,  scoring = 'neg_log_loss')
gs.fit(X_train_wc, train['y'])

GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08,
                               0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16,
                               0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24,
                               0.25, 0.26, 0.27, 0.28, 0.29, 0.3, ...],
                         'penalty': ['

In [37]:
print(gs.best_score_)
gs.best_params_

-0.5352762906069509


{'C': 0.84, 'penalty': 'l1', 'solver': 'liblinear'}

## Предсказание по WC

In [38]:
final_lr_wc = LogisticRegression(C=gs.best_params_['C'], penalty=gs.best_params_['penalty'])
final_lr_wc.fit(X_train_wc, train['y'])
test_prediction_wc = final_lr_wc.predict_proba(X_test_wc)[:,1]
train_prediction_wc = final_lr_wc.predict_proba(X_train_wc)[:,1]
hold_prediction_wc = final_lr_wc.predict_proba(X_hold_wc)[:,1]



In [39]:
log_loss(hold['y'], hold_prediction_wc)

0.4961322862063864

In [40]:
# Анализ фичей
feature_coef = {word: coef for word, coef  in zip(cv.get_feature_names(), final_lr_wc.coef_[0])}
sorted(feature_coef.items(), key=lambda x:x[1], reverse=True)[:90]

[('love', 2.861466914784636),
 ('excel', 2.529611948415443),
 ('great', 2.1376802927386818),
 ('happi', 2.0261708120526842),
 ('15', 1.925663637190705),
 ('pleas', 1.777954880454184),
 ('amaz', 1.6981584680454953),
 ('cool', 1.6880767486011776),
 ('worri', 1.6349992422838004),
 ('featur', 1.5172869764448542),
 ('price', 1.4631062183331827),
 ('easi', 1.4021624560817392),
 ('ever', 1.2750865591891725),
 ('awesom', 1.2263805831069923),
 ('light', 1.1947440161448473),
 ('plu', 1.1834323000393243),
 ('quick', 1.1127065390037054),
 ('big', 1.1125626241843485),
 ('good', 1.1079496268129763),
 ('pocket', 1.0980625211140698),
 ('odor', 1.0579188333700482),
 ('everyth', 1.0566742389121404),
 ('except', 1.0301220160225206),
 ('perfect', 1.0065911011757207),
 ('champ', 1.0033390530401105),
 ('power', 0.9915765510985565),
 ('nice', 0.9693304850564436),
 ('smooth', 0.9541431277157971),
 ('navig', 0.943685710498859),
 ('best', 0.8895926663754414),
 ('absolut', 0.8809162273366739),
 ('small', 0.87777

## CV на TF-IDF

In [41]:
X_train_tfv = tfv.transform(train['text'])
X_hold_tfv = tfv.transform(hold['text'])
X_test_tfv = tfv.transform(test['text'])

In [42]:
clf_tf = LogisticRegression()
parameters_tf = {'C': [c/10 for c in range(1, 50, 1)],
             'penalty':['l1']}
gs_tf = GridSearchCV(clf_tf, parameters_tf, n_jobs=-1, cv=4,  scoring = 'neg_log_loss')
gs_tf.fit(X_train_tfv, train['y'])



GridSearchCV(cv=4, error_score='raise-deprecating',
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='warn',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='warn',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='warn', n_jobs=-1,
             param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0,
                               1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0,
                               2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, ...],
                         'penalty': ['l1']},
             pre_dispatch='2*n_jobs', refit=True, r

In [43]:
print(gs_tf.best_score_)
gs_tf.best_params_

-0.5334555408089913


{'C': 2.5, 'penalty': 'l1'}

## Предсказание по TF

In [44]:
final_lr_tfv = LogisticRegression(C=gs_tf.best_params_['C'], penalty=gs_tf.best_params_['penalty'])
final_lr_tfv.fit(X_train_tfv, train['y'])
test_prediction_tfv = final_lr_tfv.predict_proba(X_test_tfv)[:,1]
train_prediction_tfv = final_lr_tfv.predict_proba(X_train_tfv)[:,1]
hold_prediction_tfv = final_lr_tfv.predict_proba(X_hold_tfv)[:,1]

In [45]:
log_loss(hold['y'], hold_prediction_tfv)

0.48199969347084104

In [46]:
# Анализ фичей
feature_coef = {word: coef for word, coef  in zip(tfv.get_feature_names(), final_lr_tfv.coef_[0])}
sorted(feature_coef.items(), key=lambda x:x[1], reverse=True)[:30]

[('great', 8.224531819337624),
 ('excel', 8.054846575530759),
 ('love', 7.963463555882417),
 ('featur', 6.106920571169916),
 ('easi', 5.752925967373227),
 ('happi', 5.244146737381315),
 ('amaz', 4.9021065524496485),
 ('cool', 4.8546754463199315),
 ('worri', 4.567034044291068),
 ('quick', 4.232875535405707),
 ('pleas', 4.070671552673615),
 ('price', 4.058028264862966),
 ('absolut', 3.9592555986279736),
 ('ever', 3.907349936255673),
 ('power', 3.4076698433702064),
 ('pocket', 3.3951170523900323),
 ('except', 3.2857474643376188),
 ('perfectli', 3.231493898347019),
 ('good', 3.1993385261859717),
 ('plu', 3.194301390243961),
 ('light', 3.145949482065606),
 ('15', 3.1371327453655087),
 ('decent', 3.1316852406013944),
 ('awesom', 3.1140011659138582),
 ('everyth', 3.1015020824739064),
 ('odor', 2.9812399442676822),
 ('high', 2.9190273109549483),
 ('champ', 2.858727061760521),
 ('navig', 2.8302278041836857),
 ('best', 2.800602769969563)]

# Моделирование ансамблем

In [47]:
import math
def get_odd(p):
    return math.log(p/(1-p))

In [48]:
ensamble_train = pd.DataFrame()
ensamble_train['wc_logit'] = train_prediction_wc
ensamble_train['wc_logit'] = ensamble_train['wc_logit'].apply(get_odd)
ensamble_train['tfv_logit'] = train_prediction_tfv
ensamble_train['tfv_logit'] = ensamble_train['tfv_logit'].apply(get_odd)
ensamble_train['y'] = train['y'].values

In [49]:
ensamble_hold = pd.DataFrame()
ensamble_hold['wc_logit'] = hold_prediction_wc
ensamble_hold['wc_logit'] = ensamble_hold['wc_logit'].apply(get_odd)
ensamble_hold['tfv_logit'] = hold_prediction_tfv
ensamble_hold['tfv_logit'] = ensamble_hold['tfv_logit'].apply(get_odd)
ensamble_hold['y'] = hold['y'].values

In [50]:
ps = PredefinedSplit(ensamble_hold.index)

In [51]:
ensamble_full = pd.concat([ensamble_hold, ensamble_train], ignore_index=True)

In [52]:
best_p = None
best_c = None
best_logloss=10000
best_est = None
for p in ['l1', 'l2']:
    for c in [c/10000 for c in range(1, 10000, 1)]:
        clf_ens = LogisticRegression(penalty=p, C=c)
        clf_ens.fit(X=ensamble_train[['wc_logit', 'tfv_logit']], y = ensamble_train['y'])
        ensemble_hold_prediction = clf_ens.predict_proba(ensamble_hold[['wc_logit', 'tfv_logit']])[:,1]
        logloss = log_loss(ensamble_hold['y'], ensemble_hold_prediction)
        if logloss<best_logloss:
            best_logloss=logloss
            best_p = p
            best_c = c
            best_est = clf_ens











































































































































































































































































































































































































































































































































































































































































































































































































































































In [53]:
print(best_logloss)
print(best_p)
print(best_c)

0.48147984963712154
l1
0.0047


In [54]:
ensemble = LogisticRegression(penalty=best_p, C=best_c)
ensemble.fit(X=ensamble_full[['wc_logit', 'tfv_logit']], y = ensamble_full['y'])



LogisticRegression(C=0.0047, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l1',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [55]:
ensemble_train_prediction = ensemble.predict_proba(ensamble_train[['wc_logit', 'tfv_logit']])[:,1]
ensemble_hold_prediction = ensemble.predict_proba(ensamble_hold[['wc_logit', 'tfv_logit']])[:,1]

In [56]:
log_loss(ensamble_hold['y'], ensemble_hold_prediction)

0.48132780432413935

In [57]:
log_loss(ensamble_train['y'], ensemble_train_prediction)

0.3522523018001542

# Сохранение  в файл

In [58]:
submission = pd.DataFrame()

submission['wc_y'] = final_lr_wc.predict_proba(X_test_wc)[:,1]
submission['tf_y'] = final_lr_tfv.predict_proba(X_test_tfv)[:,1]

submission['wc_y_logit'] = submission['wc_y'].apply(get_odd)
submission['tf_y_logit'] = submission['tf_y'].apply(get_odd)

submission['ens_y'] = ensemble.predict_proba(submission[['wc_y_logit', 'tf_y_logit']])[:,1]
submission['avg_y'] = (submission['wc_y'] + submission['tf_y'])/2
submission.head()

Unnamed: 0,wc_y,tf_y,wc_y_logit,tf_y_logit,ens_y,avg_y
0,0.984168,0.983277,4.129792,4.074099,0.978411,0.983723
1,0.292345,0.355353,-0.884022,-0.595593,0.362759,0.323849
2,0.687323,0.787104,0.787632,1.307559,0.770781,0.737214
3,0.603718,0.661605,0.420982,0.670455,0.650716,0.632662
4,0.147815,0.227741,-1.751839,-1.221109,0.239785,0.187778


In [59]:
submission[['wc_y']].reset_index().rename({'wc_y':'y', 'index':'Id'}, axis=1).to_csv('submission/wc.csv', index=False)
submission[['tf_y']].reset_index().rename({'tf_y':'y', 'index':'Id'}, axis=1).to_csv('submission/tfv.csv', index=False)
submission[['ens_y']].reset_index().rename({'ens_y':'y', 'index':'Id'}, axis=1).to_csv('submission/ens.csv', index=False)
submission[['avg_y']].reset_index().rename({'avg_y':'y', 'index':'Id'}, axis=1).to_csv('submission/avg.csv', index=False)