In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import GridSearchCV, PredefinedSplit
from sklearn.model_selection import train_test_split



# чтение данных 

In [2]:
train = pd.read_csv('data/products_sentiment_train.tsv', sep='\t', header=None)
train.columns = ['text', 'y']
train, hold = train_test_split(train, test_size=0.3, stratify=train['y'])
train.head()

Unnamed: 0,text,y
1722,"even the "" shorter "" battery life , though , h...",0
1217,video output stopped workin .,0
519,flawless .,1
1132,my experience with installation was quite good .,1
1665,"the box included a little polyester , pull-str...",0


In [3]:
test = pd.read_csv('data/products_sentiment_test.tsv', sep='\t')
test.head()

Unnamed: 0,Id,text
0,0,"so , why the small digital elph , rather than ..."
1,1,3/4 way through the first disk we played on it...
2,2,better for the zen micro is outlook compatibil...
3,3,6 . play gameboy color games on it with goboy .
4,4,"likewise , i 've heard norton 2004 professiona..."


In [4]:
print(train.shape)
print(hold.shape)
print(test.shape)

(1400, 2)
(600, 2)
(500, 2)


# Чистка

In [5]:
# подключение модулей
lemm_enabled = False
stem_enabled = False

## Удаляем пунктуацию

In [6]:
import string
def remove_punctuation(text):
    no_punct = "".join([c for c in text if c not in string.punctuation])
    return no_punct

train['text'] = train['text'].apply(remove_punctuation)
hold['text'] = hold['text'].apply(remove_punctuation)
test['text'] = test['text'].apply(remove_punctuation)

## Токенизируем

In [7]:
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')

def tokenize(text):
    return tokenizer.tokenize(text.lower())

train['text_tokenized'] = train['text'].apply(tokenize)
hold['text_tokenized'] = hold['text'].apply(tokenize)
test['text_tokenized'] = test['text'].apply(tokenize)

# Стоп слова

In [8]:
with open('data/stopwords.txt', 'r') as f:
    stopwords = f.read().split('\n')
    
def remove_stopwords(text_arr):
    return [w for w in text_arr if w not in stopwords]
    
train['text_tokenized'] = train['text_tokenized'].apply(remove_stopwords)
hold['text_tokenized'] = hold['text_tokenized'].apply(remove_stopwords)
test['text_tokenized'] = test['text_tokenized'].apply(remove_stopwords)

## лематизация

In [9]:

if lemm_enabled:
    from nltk.stem import WordNetLemmatizer
    lemmatizer = WordNetLemmatizer()

    def word_lemmatizer(text):
        lem_text = [lemmatizer.lemmatize(i) for i in text]
        return lem_text
    
    train['text_tokenized'] = train['text_tokenized'].apply(word_lemmatizer)
    hold['text_tokenized'] = hold['text_tokenized'].apply(word_lemmatizer)
    test['text_tokenized'] = test['text_tokenized'].apply(word_lemmatizer)


## Stemming

In [10]:
from nltk.stem import PorterStemmer

In [11]:


if stem_enabled:
    from nltk.stem import PorterStemmer
    stemmer = PorterStemmer()

    def word_stemmer(text):
        lem_text = [stemmer.stem(i) for i in text]
        return lem_text
    
    train['text_tokenized'] = train['text_tokenized'].apply(word_stemmer)
    hold['text_tokenized'] = hold['text_tokenized'].apply(word_stemmer)
    test['text_tokenized'] = test['text_tokenized'].apply(word_stemmer)


## Джоин массивов в предложения

In [12]:
train['text'] = train['text_tokenized'].apply(lambda x: " ".join(x))
hold['text'] = hold['text_tokenized'].apply(lambda x: " ".join(x))
test['text'] = test['text_tokenized'].apply(lambda x: " ".join(x))

In [13]:
print(train.shape)
print(hold.shape)
print(test.shape)

(1400, 3)
(600, 3)
(500, 3)


In [14]:
from ipywidgets import interact, interactive, interact_manual
import ipywidgets as ip

In [15]:
@interact
def showText(x=100):
    print(train.loc[x, 'text'])
    print(train.loc[x, 'y'] )

multilple tries one disks finally recognized video poor features available
0


# Векторизация

## WordCount

In [16]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer(binary=True, ngram_range=(1,2))

cv.fit(train['text'])
X_train_wc = cv.transform(train['text'])
X_hold_wc = cv.transform(hold['text'])
X_test_wc = cv.transform(test['text'])

## TF-IDF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

tfv = TfidfVectorizer()
tfv.fit(train['text'])
X_train_tfidf = tfv.transform(train['text'])
X_hold_tfidf = tfv.transform(hold['text'])
X_test_tfidf = tfv.transform(test['text'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


# Моделирование логрегом

In [18]:
from sklearn.linear_model import LogisticRegression, LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import log_loss, roc_auc_score

In [19]:
X_train = train['text']
y_train = train['y']

X_hold = hold['text']
y_hold = hold['y']

X_test = test['text']

### Логрег на WC

In [20]:
X_train_wc = cv.transform(train['text'])
X_hold_wc = cv.transform(hold['text'])
X_test_wc = cv.transform(test['text'])

In [21]:
clf = LogisticRegression()
parameters = {'C': [c/100 for c in range(1, 100, 1)],
             'penalty':['l1'],
             'solver':['liblinear']}
gs = GridSearchCV(clf, parameters, n_jobs=-1, cv=4,  scoring = 'neg_log_loss')
gs.fit(X_train_wc, train['y'])

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.01, 0.02, 0.03, 0.04, 0.05, 0.06, 0.07, 0.08, 0.09, 0.1, 0.11, 0.12, 0.13, 0.14, 0.15, 0.16, 0.17, 0.18, 0.19, 0.2, 0.21, 0.22, 0.23, 0.24, 0.25, 0.26, 0.27, 0.28, 0.29, 0.3, 0.31, 0.32, 0.33, 0.34, 0.35, 0.36, 0.37, 0.38, 0.39, 0.4, 0.41, 0.42, 0.43, 0.44, 0.45, 0.46, 0.47, 0.48..., 0.91, 0.92, 0.93, 0.94, 0.95, 0.96, 0.97, 0.98, 0.99], 'penalty': ['l1'], 'solver': ['liblinear']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [22]:
print(gs.best_score_)
gs.best_params_

-0.5301488182061226


{'C': 0.91, 'penalty': 'l1', 'solver': 'liblinear'}

## Предсказание по WC

In [23]:
final_lr_wc = LogisticRegression(C=gs.best_params_['C'], penalty=gs.best_params_['penalty'])
final_lr_wc.fit(X_train_wc, train['y'])
test_prediction_wc = final_lr_wc.predict_proba(X_test_wc)[:,1]
train_prediction_wc = final_lr_wc.predict_proba(X_train_wc)[:,1]
hold_prediction_wc = final_lr_wc.predict_proba(X_hold_wc)[:,1]

In [24]:
log_loss(hold['y'], hold_prediction_wc)

0.5152753874119351

In [25]:
# Анализ фичей
feature_coef = {word: coef for word, coef  in zip(cv.get_feature_names(), final_lr_wc.coef_[0])}
sorted(feature_coef.items(), key=lambda x:x[1], reverse=True)[:90]

[('great', 2.509769991892592),
 ('love', 2.3343573206580106),
 ('excellent', 2.1884013132977933),
 ('holds', 1.8516901526003327),
 ('happy', 1.8196696373842212),
 ('plus', 1.818419584454648),
 ('good', 1.815576405634136),
 ('pleased', 1.457741497690634),
 ('wonderful', 1.4483765473674675),
 ('perfect', 1.398836705459644),
 ('light', 1.3740674093907697),
 ('easy', 1.365403219502211),
 ('cool', 1.3351774597019752),
 ('ever', 1.2476293233787592),
 ('click', 1.1583908642807756),
 ('installed', 1.1249777854434664),
 ('pocket', 1.119218587699565),
 ('absolutely', 1.0998183411387643),
 ('awesome', 1.0994678488377412),
 ('fast', 1.0729280859655608),
 ('decent', 1.0686364599046725),
 ('price', 1.0539149196185196),
 ('amazing', 1.0386033396945016),
 ('included software', 0.9804732276969423),
 ('champ', 0.9539378056583239),
 ('well', 0.9415081763553295),
 ('nice', 0.9408181645415982),
 ('simple', 0.9109934989433419),
 ('best', 0.8932010548962418),
 ('touch', 0.7816153852497152),
 ('design', 0.772

## CV на TF-IDF

In [26]:
X_train_tfv = tfv.transform(train['text'])
X_hold_tfv = tfv.transform(hold['text'])
X_test_tfv = tfv.transform(test['text'])

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [27]:
clf_tf = LogisticRegression()
parameters_tf = {'C': [c/10 for c in range(1, 50, 1)],
             'penalty':['l1']}
gs_tf = GridSearchCV(clf_tf, parameters_tf, n_jobs=-1, cv=4,  scoring = 'neg_log_loss')
gs_tf.fit(X_train_tfv, train['y'])

GridSearchCV(cv=4, error_score='raise',
       estimator=LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'C': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0, 1.1, 1.2, 1.3, 1.4, 1.5, 1.6, 1.7, 1.8, 1.9, 2.0, 2.1, 2.2, 2.3, 2.4, 2.5, 2.6, 2.7, 2.8, 2.9, 3.0, 3.1, 3.2, 3.3, 3.4, 3.5, 3.6, 3.7, 3.8, 3.9, 4.0, 4.1, 4.2, 4.3, 4.4, 4.5, 4.6, 4.7, 4.8, 4.9], 'penalty': ['l1']},
       pre_dispatch='2*n_jobs', refit=True, return_train_score=True,
       scoring='neg_log_loss', verbose=0)

In [28]:
print(gs_tf.best_score_)
gs_tf.best_params_

-0.5220529473596344


{'C': 2.7, 'penalty': 'l1'}

## Предсказание по TF

In [29]:
final_lr_tfv = LogisticRegression(C=gs_tf.best_params_['C'], penalty=gs_tf.best_params_['penalty'])
final_lr_tfv.fit(X_train_tfv, train['y'])
test_prediction_tfv = final_lr_tfv.predict_proba(X_test_tfv)[:,1]
train_prediction_tfv = final_lr_tfv.predict_proba(X_train_tfv)[:,1]
hold_prediction_tfv = final_lr_tfv.predict_proba(X_hold_tfv)[:,1]

In [30]:
log_loss(hold['y'], hold_prediction_tfv)

0.49988259732795665

In [31]:
# Анализ фичей
feature_coef = {word: coef for word, coef  in zip(tfv.get_feature_names(), final_lr_tfv.coef_[0])}
sorted(feature_coef.items(), key=lambda x:x[1], reverse=True)[:30]

[('great', 9.77806776383256),
 ('excellent', 7.83732401055698),
 ('good', 6.598255061366012),
 ('love', 6.564718829315826),
 ('easy', 6.063305195998226),
 ('plus', 5.259378485782258),
 ('happy', 5.006061675650512),
 ('holds', 4.5101821679451275),
 ('cool', 4.172258444382701),
 ('absolutely', 3.853795188152375),
 ('decent', 3.718900594842279),
 ('well', 3.7017559942992873),
 ('wonderful', 3.563164344554508),
 ('ever', 3.5334159959964393),
 ('pocket', 3.4681413957790244),
 ('fast', 3.3879890720583403),
 ('best', 3.344392183061145),
 ('camera', 3.340254809255229),
 ('perfect', 3.2960348907108328),
 ('price', 3.2397361248218224),
 ('amazing', 3.212673798444519),
 ('pleased', 3.2083421596844732),
 ('simple', 3.137710691522902),
 ('light', 3.065817837508457),
 ('champ', 3.0211807575020546),
 ('awesome', 2.9518022404366904),
 ('gb', 2.8616226982036213),
 ('ok', 2.637601502671946),
 ('phones', 2.623325802416347),
 ('router', 2.6159195107810267)]

# Моделирование ансамблем

In [32]:
import math
def get_odd(p):
    return math.log(p/(1-p))

In [33]:
ensamble_train = pd.DataFrame()
ensamble_train['wc_logit'] = train_prediction_wc
ensamble_train['wc_logit'] = ensamble_train['wc_logit'].apply(get_odd)
ensamble_train['tfv_logit'] = train_prediction_tfv
ensamble_train['tfv_logit'] = ensamble_train['tfv_logit'].apply(get_odd)
ensamble_train['y'] = train['y'].values

In [34]:
ensamble_hold = pd.DataFrame()
ensamble_hold['wc_logit'] = hold_prediction_wc
ensamble_hold['wc_logit'] = ensamble_hold['wc_logit'].apply(get_odd)
ensamble_hold['tfv_logit'] = hold_prediction_tfv
ensamble_hold['tfv_logit'] = ensamble_hold['tfv_logit'].apply(get_odd)
ensamble_hold['y'] = hold['y'].values

In [35]:
ps = PredefinedSplit(ensamble_hold.index)

In [36]:
ensamble_full = pd.concat([ensamble_hold, ensamble_train], ignore_index=True)

In [37]:
best_p = None
best_c = None
best_logloss=10000
best_est = None
for p in ['l1', 'l2']:
    for c in [c/10000 for c in range(1, 10000, 1)]:
        clf_ens = LogisticRegression(penalty=p, C=c)
        clf_ens.fit(X=ensamble_train[['wc_logit', 'tfv_logit']], y = ensamble_train['y'])
        ensemble_hold_prediction = clf_ens.predict_proba(ensamble_hold[['wc_logit', 'tfv_logit']])[:,1]
        logloss = log_loss(ensamble_hold['y'], ensemble_hold_prediction)
        if logloss<best_logloss:
            best_logloss=logloss
            best_p = p
            best_c = c
            best_est = clf_ens

In [38]:
print(best_logloss)
print(best_p)
print(best_c)

0.49810925688699553
l1
0.0041


In [39]:
ensemble = LogisticRegression(penalty=best_p, C=best_c)
ensemble.fit(X=ensamble_full[['wc_logit', 'tfv_logit']], y = ensamble_full['y'])

LogisticRegression(C=0.0041, class_weight=None, dual=False,
          fit_intercept=True, intercept_scaling=1, max_iter=100,
          multi_class='ovr', n_jobs=1, penalty='l1', random_state=None,
          solver='liblinear', tol=0.0001, verbose=0, warm_start=False)

In [40]:
ensemble_train_prediction = ensemble.predict_proba(ensamble_train[['wc_logit', 'tfv_logit']])[:,1]
ensemble_hold_prediction = ensemble.predict_proba(ensamble_hold[['wc_logit', 'tfv_logit']])[:,1]

In [41]:
log_loss(ensamble_hold['y'], ensemble_hold_prediction)

0.4980944483893822

In [42]:
log_loss(ensamble_train['y'], ensemble_train_prediction)

0.3486000929145205

# Сохранение  в файл

In [None]:
submission = pd.DataFrame()

submission['wc_y'] = final_lr_wc.predict_proba(X_test_wc)[:,1]
submission['tf_y'] = final_lr_tfv.predict_proba(X_test_tfv)[:,1]

submission['wc_y_logit'] = submission['wc_y'].apply(get_odd)
submission['tf_y_logit'] = submission['tf_y'].apply(get_odd)

submission['ens_y'] = ensemble.predict_proba(submission[['wc_y_logit', 'tf_y_logit']])[:,1]
submission['avg_y'] = (submission['wc_y'] + submission['tf_y'])/2
submission.head()

In [None]:
submission[['wc_y']].reset_index().rename({'wc_y':'y', 'index':'Id'}, axis=1).to_csv('submission/wc.csv', index=False)
submission[['tf_y']].reset_index().rename({'tf_y':'y', 'index':'Id'}, axis=1).to_csv('submission/tfv.csv', index=False)
submission[['ens_y']].reset_index().rename({'ens_y':'y', 'index':'Id'}, axis=1).to_csv('submission/ens.csv', index=False)
submission[['avg_y']].reset_index().rename({'avg_y':'y', 'index':'Id'}, axis=1).to_csv('submission/avg.csv', index=False)