# Неделя 3. Соревнование. Logistic Regression

Продолжение решения. Начало в ноутбуке week3_peer.

В этом задании вам нужно воспользоваться опытом предыдущих недель, чтобы побить бейзлайн в [соревновании по сентимент-анализу отзывов](https://www.kaggle.com/c/product-reviews-sentiment-analysis-light) на товары на Kaggle Inclass.

In [1]:
import pandas as pd
import numpy as np
from matplotlib import pyplot as plt
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer, HashingVectorizer
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.metrics import roc_curve, accuracy_score, roc_auc_score
from nltk.corpus import stopwords
from random import shuffle
import random

In [2]:
# Fixed seed used for the models' random_state and for the oversampling
# helper, so that repeated runs give the same numbers. The commented-out line
# suggests the value was originally drawn at random and then pinned.
# seed = random.randint(0, 10000)
seed = 8269
# Bare expression: echoes the value in the notebook output.
seed

8269

In [3]:
# Load the labelled training reviews. Column names are supplied explicitly via
# `names`, so the first line of the file is read as data.
# Forward slashes work on both Windows and POSIX and avoid the invalid escape
# sequences ("\.", "\D", "\p") that the backslash form relied on.
# `sep` is passed by keyword: recent pandas versions accept only the path
# positionally.
train_data = pd.read_csv(
    "../../Data/products_sentiment_train.tsv",
    sep="\t",
    names=["text", "class"],
    dtype={"text": "str", "class": "int"})
train_data.head()

Unnamed: 0,text,class
0,"2 . take around 10,000 640x480 pictures .",1
1,i downloaded a trial version of computer assoc...,1
2,the wrt54g plus the hga7t is a perfect solutio...,1
3,i dont especially like how music files are uns...,0
4,i was using the cheapie pail ... and it worked...,1


In [4]:
# Load the unlabelled test reviews, indexed by their "Id" column.
# Forward slashes work on both Windows and POSIX and avoid the invalid escape
# sequences ("\.", "\D", "\p") that the backslash form relied on.
# `sep` is passed by keyword: recent pandas versions accept only the path
# positionally.
test_data = pd.read_csv(
    "../../Data/products_sentiment_test.tsv",
    sep="\t",
    index_col="Id",
    dtype={"text": "str", "Id": "int"})
test_data.head()

Unnamed: 0_level_0,text
Id,Unnamed: 1_level_1
0,"so , why the small digital elph , rather than ..."
1,3/4 way through the first disk we played on it...
2,better for the zen micro is outlook compatibil...
3,6 . play gameboy color games on it with goboy .
4,"likewise , i 've heard norton 2004 professiona..."


In [5]:
# Plain Python lists of the training texts and their labels; the scoring and
# oversampling helpers below work on these lists.
texts, classes = (train_data[column].tolist() for column in ("text", "class"))

In [6]:
# Metric columns shared by every results table in this notebook.
result_columns = [
    "Accuracy",
    "Accuracy_std",
    "ROC_AUC",
    "ROC_AUC_std",
]
# Empty accumulator: each scoring call returns a new frame with one more row.
results = pd.DataFrame(columns=result_columns)

In [7]:
def score_model_template(
    model_name,
    model_pipe,
    frame,
    frame_columns,
    score_texts,
    score_classes,
    print_results=True
):
    """Cross-validate a model and record its scores as a new results row.

    The model is evaluated twice with ``cross_val_score``: once with
    ``scoring="accuracy"`` and once with ``scoring="roc_auc"``.

    Args:
        model_name: Label used as the index of the new results row.
        model_pipe: Estimator (pipeline) passed to ``cross_val_score``.
        frame: Results frame to extend. It is not modified in place.
        frame_columns: Column names for the new row, in the order
            accuracy mean, accuracy std, ROC AUC mean, ROC AUC std.
        score_texts: Samples to cross-validate on.
        score_classes: Labels matching ``score_texts``.
        print_results: When true, print the mean and standard deviation of
            both metrics.

    Returns:
        A new frame holding the rows of ``frame`` followed by the new row.
    """
    accuracy_scores = cross_val_score(
        model_pipe,
        score_texts,
        score_classes,
        scoring="accuracy")
    roc_auc_scores = cross_val_score(
        model_pipe,
        score_texts,
        score_classes,
        scoring="roc_auc")
    average_accuracy = np.average(accuracy_scores)
    std_accuracy = np.std(accuracy_scores)
    average_roc_auc = np.average(roc_auc_scores)
    std_roc_auc = np.std(roc_auc_scores)

    new_row = pd.DataFrame(
        [[average_accuracy, std_accuracy, average_roc_auc, std_roc_auc]],
        index=[model_name],
        columns=frame_columns)
    # DataFrame.append was removed in pandas 2.0; pd.concat is the documented
    # replacement and likewise returns a new frame.
    frame = pd.concat([frame, new_row])

    if print_results:
        print("Accuracy:\n\tAverage: {0:.3f}\n\tStandard Deviation: {1:.3f}".format(
            average_accuracy,
            std_accuracy))
        print("ROC AUC:\n\tAverage: {0:.3f}\n\tStandard Deviation: {1:.3f}".format(
            average_roc_auc,
            std_roc_auc))
    return frame

# Baseline
В этом ноутбуке исследуем Logistic Regression.
В качестве baseline используем Logistic Regression с настройками по умолчанию и CountVectorizer.

In [8]:
# Baseline: raw token counts fed into a logistic regression with default
# settings, scored on the original (unbalanced) training data.
count_logistic_pipe = Pipeline(steps=[
    ("vectorize", CountVectorizer()),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_template(
    model_name="Count Logistic",
    model_pipe=count_logistic_pipe,
    frame=results,
    frame_columns=result_columns,
    score_texts=texts,
    score_classes=classes,
)

Accuracy:
	Average: 0.774
	Standard Deviation: 0.023
ROC AUC:
	Average: 0.832
	Standard Deviation: 0.020


# Data balancing
Попробуем сбалансировать данные и посмотрим, насколько изменение баланса в данных влияет на качество предсказаний. Исследуем два метода: Oversampling и Class Weights. Undersampling пробовать не будем, т.к. данных довольно мало.

## Oversampling
Разница в количестве примеров в классах (diff_size) — 548 примеров.
Посмотрим, насколько на качество влияет oversampling меньшего класса на число примеров, близкое к 548.

In [9]:
def balance_data(input_texts, input_classes, diff_size, random_seed=None):
    """Oversample the negative class (label 0) with random duplicates.

    ``diff_size`` texts are drawn with replacement from the samples whose
    class is 0 and added to the data with class 0; the combined data is then
    shuffled.

    Args:
        input_texts: Sequence of texts.
        input_classes: Sequence of labels aligned with ``input_texts``.
        diff_size: Number of negative duplicates to add.
        random_seed: Seed for the ``random`` module. When ``None`` the
            notebook-level ``seed`` is used, as before.

    Returns:
        A ``zip`` iterator that unpacks into ``(texts, classes)``, each a
        tuple over the shuffled, oversampled data.

    Raises:
        ValueError: If duplicates are requested but the input contains no
            samples of class 0.

    Note:
        Re-seeds the module-level ``random`` generator as a side effect.
    """
    # Copy into lists so tuples and other sequences are accepted and the
    # caller's containers are never mutated.
    input_texts = list(input_texts)
    input_classes = list(input_classes)

    negative_texts = [
        text
        for text, label in zip(input_texts, input_classes)
        if label == 0
    ]
    if diff_size > 0 and not negative_texts:
        raise ValueError("cannot oversample: the input has no samples of class 0")

    random.seed(seed if random_seed is None else random_seed)
    # Same draw order as before (one randint per duplicate), but built in
    # linear time instead of by repeated list concatenation.
    extra_texts = [
        negative_texts[random.randint(0, len(negative_texts) - 1)]
        for _ in range(diff_size)
    ]

    balanced_data = list(zip(
        input_texts + extra_texts,
        input_classes + [0] * diff_size))
    random.shuffle(balanced_data)
    return zip(*balanced_data)

def score_model_oversampling(
    model_name,
    model_pipe,
    frame,
    frame_columns,
    print_results=True,
    diff_size=548
):
    """Score a model on the training data after oversampling the negative class.

    The notebook-level ``texts`` and ``classes`` are balanced with
    ``balance_data`` and the result is passed to ``score_model_template``.

    Args:
        model_name: Label used as the index of the new results row.
        model_pipe: Estimator (pipeline) to cross-validate.
        frame: Results frame to extend.
        frame_columns: Column names for the new row.
        print_results: When true, print the scores.
        diff_size: Number of negative duplicates to add. Defaults to 548, the
            class-size difference this notebook reports for its training set.

    Returns:
        The frame returned by ``score_model_template``.
    """
    # NOTE(review): duplicates are added before cross-validation splits the
    # data, so copies of the same review can land in both the training and the
    # validation part of a fold. The resulting scores are likely optimistic
    # compared with oversampling inside each training fold only.
    oversampled_texts, oversampled_classes = balance_data(texts, classes, diff_size)
    return score_model_template(
        model_name,
        model_pipe,
        frame,
        frame_columns,
        oversampled_texts,
        oversampled_classes,
        print_results)

In [10]:
# Score the baseline pipeline on oversampled data. The row goes into a
# separate frame, so the main `results` table keeps only its current rows.
oversampling_results = score_model_oversampling(
    model_name="Count Logistic oversampled",
    model_pipe=count_logistic_pipe,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.818
	Standard Deviation: 0.011
ROC AUC:
	Average: 0.897
	Standard Deviation: 0.009


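Видно, что Oversampling довольно существенно улучшает качество предсказаний на кросс-валидации. Однако эта оценка может быть завышена: дубликаты негативных отзывов добавляются до разбиения на фолды и могут одновременно попасть и в обучающую, и в проверочную часть.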

## Class Weights
Негативных отзывов в 1.7548 раза меньше, чем позитивных. Попробуем задать негативному классу больший вес.

In [11]:
# Sweep the weight of the negative class (label 0) from 1.1548 to 2.9548 in
# steps of 0.1, scoring each setting on the original (unbalanced) data.
weight_results = results
for weight_part in range(1, 20):
    # Round away binary floating-point noise so row labels read "1.6548"
    # rather than "1.6547999999999998".
    weight = round(1.0548 + weight_part / 10.0, 4)
    logistic_count_weights_pipe = Pipeline([
        ("vectorize", CountVectorizer()),
        ("model", LogisticRegression(
            # Same seed as every other model in the notebook, for consistency.
            random_state=seed,
            class_weight={
                0: weight,
                1: 1
            }
        ))])
    weight_results = score_model_template(
        "Count Logistic weighted {0}".format(weight),
        logistic_count_weights_pipe,
        weight_results,
        result_columns,
        texts,
        classes,
        False
    )
weight_results.sort_values("Accuracy", ascending=False)

Unnamed: 0,Accuracy,Accuracy_std,ROC_AUC,ROC_AUC_std
Count Logistic weighted 1.1548,0.775506,0.021571,0.831115,0.020682
Count Logistic,0.774007,0.022582,0.831514,0.020428
Count Logistic weighted 1.2548,0.772004,0.019382,0.830872,0.020799
Count Logistic weighted 1.4548,0.769001,0.013607,0.830447,0.020816
Count Logistic weighted 1.3548,0.768503,0.01711,0.830625,0.020742
Count Logistic weighted 1.5548,0.766499,0.013558,0.830155,0.020939
Count Logistic weighted 1.6547999999999998,0.761,0.017751,0.829902,0.021028
Count Logistic weighted 1.7548,0.758998,0.016736,0.82976,0.021066
Count Logistic weighted 2.0548,0.757497,0.014864,0.829153,0.021231
Count Logistic weighted 2.1548,0.757496,0.016948,0.828968,0.021239


Здесь видно, что существенно увеличить качество модели с помощью class weights не удалось

# Logistic regression

## Count Vectorizer

In [12]:
# Count-vectorizer reference model, scored on oversampled data.
count_logistic_pipe = Pipeline([
    ("vectorize", CountVectorizer()),
    ("model", LogisticRegression(random_state=seed))])
# The unbalanced baseline already occupies the "Count Logistic" row of
# `results`; a distinct label avoids a duplicate index entry and keeps the two
# rows distinguishable.
results = score_model_oversampling(
    "Count Logistic oversampled",
    count_logistic_pipe,
    results,
    result_columns
)

Accuracy:
	Average: 0.818
	Standard Deviation: 0.011
ROC AUC:
	Average: 0.897
	Standard Deviation: 0.009


### English stopwords

In [13]:
# Token counts with NLTK's English stop-word list removed.
english_stopwords = stopwords.words("english")
nltk_stopwords_count_logistic = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words=english_stopwords)),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Count NLTK English stopwords logistic",
    model_pipe=nltk_stopwords_count_logistic,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.812
	Standard Deviation: 0.009
ROC AUC:
	Average: 0.891
	Standard Deviation: 0.006


In [14]:
# Token counts with scikit-learn's built-in "english" stop-word list removed.
sklearn_stopwords_count_logistic = Pipeline(steps=[
    ("vectorize", CountVectorizer(stop_words="english")),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Count Sklearn English stopwords logistic",
    model_pipe=sklearn_stopwords_count_logistic,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.804
	Standard Deviation: 0.010
ROC AUC:
	Average: 0.888
	Standard Deviation: 0.005


### N-grams

In [15]:
# Sweep the upper word n-gram bound from 2 to 29 with a count vectorizer.
# Rows accumulate in a scratch frame so `results` itself is left unchanged.
results_ngram = results
for max_ngram in range(2, 30):
    sklearn_count_ngram = Pipeline(steps=[
        ("vectorize", CountVectorizer(analyzer="word", ngram_range=(1, max_ngram))),
        ("model", LogisticRegression(random_state=seed)),
    ])
    results_ngram = score_model_oversampling(
        model_name="Count N-grams (1,{0}) logistic".format(max_ngram),
        model_pipe=sklearn_count_ngram,
        frame=results_ngram,
        frame_columns=result_columns,
        print_results=False,
    )
results_ngram

Unnamed: 0,Accuracy,Accuracy_std,ROC_AUC,ROC_AUC_std
Count Logistic,0.774007,0.022582,0.831514,0.020428
Count Logistic,0.817895,0.010648,0.897232,0.00924
Count NLTK English stopwords logistic,0.812407,0.008775,0.891127,0.005639
Count Sklearn English stopwords logistic,0.804168,0.009856,0.887977,0.005197
"Count N-grams (1,2) logistic",0.83085,0.014552,0.917302,0.006241
"Count N-grams (1,3) logistic",0.834383,0.013282,0.921745,0.005949
"Count N-grams (1,4) logistic",0.835168,0.008305,0.92253,0.006164
"Count N-grams (1,5) logistic",0.836348,0.00907,0.922453,0.006407
"Count N-grams (1,6) logistic",0.835562,0.006268,0.922177,0.006631
"Count N-grams (1,7) logistic",0.834383,0.006013,0.921943,0.006697


In [16]:
# Record the count-vectorizer model with word n-grams of length 1 to 5 in the
# main results table.
sklearn_count_ngram_1_5 = Pipeline(steps=[
    ("vectorize", CountVectorizer(analyzer="word", ngram_range=(1, 5))),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Count N-grams (1,5) logistic",
    model_pipe=sklearn_count_ngram_1_5,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.836
	Standard Deviation: 0.009
ROC AUC:
	Average: 0.922
	Standard Deviation: 0.006


## TF-IDF Vectorizer

In [17]:
# TF-IDF features with default vectorizer settings, as the reference point for
# the TF-IDF experiments that follow.
tfidf_logistic_pipe = Pipeline(steps=[
    ("vectorize", TfidfVectorizer()),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="TF-IDF Logistic baseline",
    model_pipe=tfidf_logistic_pipe,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.805
	Standard Deviation: 0.009
ROC AUC:
	Average: 0.894
	Standard Deviation: 0.005


### English stopwords

In [18]:
# TF-IDF features with NLTK's English stop-word list removed.
english_stopwords = stopwords.words("english")
nltk_stopwords_tfidf_logistic = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words=english_stopwords)),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Tfidf NLTK English stopwords logistic",
    model_pipe=nltk_stopwords_tfidf_logistic,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.801
	Standard Deviation: 0.009
ROC AUC:
	Average: 0.885
	Standard Deviation: 0.006


In [19]:
# TF-IDF features with scikit-learn's built-in "english" stop-word list removed.
sklearn_stopwords_tfidf_logistic = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(stop_words="english")),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Tfidf Sklearn English stopwords logistic",
    model_pipe=sklearn_stopwords_tfidf_logistic,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.792
	Standard Deviation: 0.003
ROC AUC:
	Average: 0.879
	Standard Deviation: 0.004


### N-grams

In [20]:
# Sweep the upper word n-gram bound from 2 to 29 with a TF-IDF vectorizer.
# Rows accumulate in a scratch frame so `results` itself is left unchanged.
results_ngram = results
for max_ngram in range(2, 30):
    sklearn_tfidf_ngram = Pipeline(steps=[
        ("vectorize", TfidfVectorizer(analyzer="word", ngram_range=(1, max_ngram))),
        ("model", LogisticRegression(random_state=seed)),
    ])
    results_ngram = score_model_oversampling(
        model_name="Tfidf N-grams (1,{0}) logistic".format(max_ngram),
        model_pipe=sklearn_tfidf_ngram,
        frame=results_ngram,
        frame_columns=result_columns,
        print_results=False,
    )
results_ngram

Unnamed: 0,Accuracy,Accuracy_std,ROC_AUC,ROC_AUC_std
Count Logistic,0.774007,0.022582,0.831514,0.020428
Count Logistic,0.817895,0.010648,0.897232,0.00924
Count NLTK English stopwords logistic,0.812407,0.008775,0.891127,0.005639
Count Sklearn English stopwords logistic,0.804168,0.009856,0.887977,0.005197
"Count N-grams (1,5) logistic",0.836348,0.00907,0.922453,0.006407
TF-IDF Logistic baseline,0.805341,0.008842,0.894245,0.005339
Tfidf NLTK English stopwords logistic,0.801015,0.008513,0.884795,0.005838
Tfidf Sklearn English stopwords logistic,0.791995,0.002666,0.878878,0.00408
"Tfidf N-grams (1,2) logistic",0.824172,0.009687,0.912338,0.006352
"Tfidf N-grams (1,3) logistic",0.829276,0.005149,0.918082,0.007182


In [21]:
# Keep the upper n-gram bound at 6 and raise the lower bound from 2 to 5 to
# see how the scores change when shorter n-grams are dropped.
results_ngram = results
for min_ngram in range(2, 6):
    sklearn_tfidf_ngram = Pipeline(steps=[
        ("vectorize", TfidfVectorizer(analyzer="word", ngram_range=(min_ngram, 6))),
        ("model", LogisticRegression(random_state=seed)),
    ])
    results_ngram = score_model_oversampling(
        model_name="Tfidf N-grams ({0}, 6) logistic".format(min_ngram),
        model_pipe=sklearn_tfidf_ngram,
        frame=results_ngram,
        frame_columns=result_columns,
        print_results=False,
    )
results_ngram

Unnamed: 0,Accuracy,Accuracy_std,ROC_AUC,ROC_AUC_std
Count Logistic,0.774007,0.022582,0.831514,0.020428
Count Logistic,0.817895,0.010648,0.897232,0.00924
Count NLTK English stopwords logistic,0.812407,0.008775,0.891127,0.005639
Count Sklearn English stopwords logistic,0.804168,0.009856,0.887977,0.005197
"Count N-grams (1,5) logistic",0.836348,0.00907,0.922453,0.006407
TF-IDF Logistic baseline,0.805341,0.008842,0.894245,0.005339
Tfidf NLTK English stopwords logistic,0.801015,0.008513,0.884795,0.005838
Tfidf Sklearn English stopwords logistic,0.791995,0.002666,0.878878,0.00408
"Tfidf N-grams (2, 6) logistic",0.823398,0.006749,0.911101,0.007006
"Tfidf N-grams (3, 6) logistic",0.799462,0.010065,0.869869,0.012304


In [22]:
# Record the TF-IDF model with word n-grams of length 1 to 6 in the main
# results table.
sklearn_tfidf_ngram_1_6 = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(analyzer="word", ngram_range=(1, 6))),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Tfidf N-grams (1,6) logistic",
    model_pipe=sklearn_tfidf_ngram_1_6,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.838
	Standard Deviation: 0.000
ROC AUC:
	Average: 0.924
	Standard Deviation: 0.008


### Regularization coefficient

In [24]:
# Sweep the logistic regression parameter C from 0.1 to 1.9 in steps of 0.1 on
# top of the TF-IDF (1, 6) n-gram features. Rows accumulate in a scratch frame.
coef_results = results
for coef_part in range(1, 20):
    coef = float(coef_part)/10
    regularized_tfidf = Pipeline(steps=[
        ("vectorize", TfidfVectorizer(analyzer="word", ngram_range=(1, 6))),
        ("model", LogisticRegression(random_state=seed, C=coef)),
    ])
    coef_results = score_model_oversampling(
        model_name="Regularized TF-IDF logistic. C={0}".format(coef),
        model_pipe=regularized_tfidf,
        frame=coef_results,
        frame_columns=result_columns,
        print_results=False,
    )
coef_results

Unnamed: 0,Accuracy,Accuracy_std,ROC_AUC,ROC_AUC_std
Count Logistic,0.774007,0.022582,0.831514,0.020428
Count Logistic,0.817895,0.010648,0.897232,0.00924
Count NLTK English stopwords logistic,0.812407,0.008775,0.891127,0.005639
Count Sklearn English stopwords logistic,0.804168,0.009856,0.887977,0.005197
"Count N-grams (1,5) logistic",0.836348,0.00907,0.922453,0.006407
TF-IDF Logistic baseline,0.805341,0.008842,0.894245,0.005339
Tfidf NLTK English stopwords logistic,0.801015,0.008513,0.884795,0.005838
Tfidf Sklearn English stopwords logistic,0.791995,0.002666,0.878878,0.00408
"Tfidf N-grams (1,6) logistic",0.837519,0.000181,0.923966,0.00816
Regularized TF-IDF logistic. C=0.1,0.830064,0.003026,0.913138,0.008781


In [30]:
# TF-IDF (1, 6) n-gram model with C=1.1; this pipeline is the one fitted for
# the final submission at the end of the notebook.
regularized_tfidf_c_1_1 = Pipeline(steps=[
    ("vectorize", TfidfVectorizer(analyzer="word", ngram_range=(1, 6))),
    ("model", LogisticRegression(random_state=seed, C=1.1)),
])
results = score_model_oversampling(
    model_name="Regularized TF-IDF logistic. C=1.1",
    model_pipe=regularized_tfidf_c_1_1,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.838
	Standard Deviation: 0.001
ROC AUC:
	Average: 0.924
	Standard Deviation: 0.008


## Hashing Vectorizer

In [26]:
# Hashing-vectorizer features with default settings, as the reference point
# for the hashing experiments that follow.
hashing_logistic_pipe = Pipeline(steps=[
    ("vectorize", HashingVectorizer()),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Hashing Logistic baseline",
    model_pipe=hashing_logistic_pipe,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.772
	Standard Deviation: 0.007
ROC AUC:
	Average: 0.866
	Standard Deviation: 0.007


### English stopwords

In [27]:
# Hashing-vectorizer features with NLTK's English stop-word list removed.
english_stopwords = stopwords.words("english")
nltk_stopwords_hashing_logistic = Pipeline(steps=[
    ("vectorize", HashingVectorizer(stop_words=english_stopwords)),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Hashing NLTK English stopwords logistic",
    model_pipe=nltk_stopwords_hashing_logistic,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.783
	Standard Deviation: 0.008
ROC AUC:
	Average: 0.871
	Standard Deviation: 0.006


In [28]:
# Hashing-vectorizer features with scikit-learn's built-in "english"
# stop-word list removed.
sklearn_stopwords_hashing_logistic = Pipeline(steps=[
    ("vectorize", HashingVectorizer(stop_words="english")),
    ("model", LogisticRegression(random_state=seed)),
])
results = score_model_oversampling(
    model_name="Hashing Sklearn English stopwords logistic",
    model_pipe=sklearn_stopwords_hashing_logistic,
    frame=results,
    frame_columns=result_columns,
)

Accuracy:
	Average: 0.775
	Standard Deviation: 0.009
ROC AUC:
	Average: 0.866
	Standard Deviation: 0.005


# Result

In [29]:
# Rank all recorded models by mean cross-validated accuracy and save the table.
results = results.sort_values("Accuracy", ascending=False)
# Forward slashes work on both Windows and POSIX and avoid the invalid escape
# sequences ("\.", "\R", "\s") that the backslash form relied on.
results.to_csv("../../Results/sentiment_kaggle_results_table.csv", index=True, index_label="Id")
results

Unnamed: 0,Accuracy,Accuracy_std,ROC_AUC,ROC_AUC_std
"Tfidf N-grams (1,6) logistic",0.837519,0.000181,0.923966,0.00816
Regularized TF-IDF logistic. C=0.8,0.836734,0.000664,0.922593,0.008359
"Count N-grams (1,5) logistic",0.836348,0.00907,0.922453,0.006407
Count Logistic,0.817895,0.010648,0.897232,0.00924
Count NLTK English stopwords logistic,0.812407,0.008775,0.891127,0.005639
TF-IDF Logistic baseline,0.805341,0.008842,0.894245,0.005339
Count Sklearn English stopwords logistic,0.804168,0.009856,0.887977,0.005197
Tfidf NLTK English stopwords logistic,0.801015,0.008513,0.884795,0.005838
Tfidf Sklearn English stopwords logistic,0.791995,0.002666,0.878878,0.00408
Hashing NLTK English stopwords logistic,0.782572,0.008452,0.870553,0.006265


В итоге после нескольких сабмитов на kaggle была выбрана следующая модель:

1. Оверсемплинг негативных отзывов (548 случайно выбранных примеров)
2. Использование N-грамм размером от 1 до 6 слов
3. Коэффициент регуляризации 1.1
4. Логистическая регрессия.

Результат на kaggle: 0.83250, что лучше, чем базовое решение.

In [35]:
# Fit the chosen pipeline on the oversampled training data and write the
# Kaggle submission file.
test_texts = test_data["text"].tolist()
balanced_texts, balanced_classes = balance_data(texts, classes, 548)
model = regularized_tfidf_c_1_1.fit(balanced_texts, balanced_classes)
# Index predictions by the test set's own Id values rather than a fresh
# 0..n-1 range, so each prediction stays attached to its review even if the
# Ids are not consecutive.
result_frame = pd.DataFrame(
    model.predict(test_texts),
    index=test_data.index,
    columns=["y"])
# Forward slashes keep the path usable on both Windows and POSIX.
result_frame.to_csv("../../Results/balanced_tfidf_ng1_6_c_1_1_logistic.csv", index=True, index_label="Id")
result_frame.head()

Unnamed: 0,y
0,1
1,0
2,1
3,1
4,0
