In [None]:
import os
import re
from string import punctuation

import fasttext
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (
    accuracy_score,
    roc_auc_score,
    roc_curve
)
from sklearn.model_selection import StratifiedKFold

In [None]:
# Turn the imported punctuation string into a set of single characters for
# fast membership tests during token filtering.
punctuation = {symbol for symbol in punctuation}

## Sentiment analysis

### 1 Предобработка данных

Загрузим данные.

In [None]:
# Read the polarity dataset: the first row of the file is skipped and the two
# columns are named explicitly.
polariry_dataset = pd.read_csv(
    "data/polarity-dataset.csv",
    skiprows=1,
    names=["text", "label"],
)

polariry_dataset.head()

Уберем отдельно стоящие знаки пунктуации и числа.

In [None]:
def is_digit(string: str) -> bool:
    """Tell whether a token is made of digits with at most one ``,`` or ``.``.

    The first comma or dot found (if any) is removed and the remainder is
    checked with ``str.isdigit``. So ``"15"``, ``"1.5"`` and ``"1,5"`` count
    as numbers, while ``"1.2.3"``, ``"-5"`` and the empty string do not.
    """
    without_separator = re.sub(r"[,.]", "", string, count=1)
    return without_separator.isdigit()

In [None]:
def _strip_punctuation_and_numbers(text: str) -> str:
    """Drop whitespace-separated tokens that are in `punctuation` or pass `is_digit`.

    The surviving tokens are joined back with single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in punctuation or is_digit(token):
            continue
        kept_tokens.append(token)
    return " ".join(kept_tokens)


polariry_dataset["text_clean"] = polariry_dataset["text"].apply(_strip_punctuation_and_numbers)

Напишем функцию для записи данных в файл.

In [None]:
def to_file(path: str, texts: list) -> None:
    """Write texts to ``path``, one per line, encoded as UTF-8.

    Args:
        path: Destination file; it is created or overwritten.
        texts: Iterable of texts; each item is formatted with ``str`` and
            followed by a newline.

    The encoding is pinned to UTF-8 instead of the platform default, so
    non-ASCII (e.g. Cyrillic) text is written the same way on every OS.
    """
    with open(path, mode="w", encoding="utf-8") as file:
        file.writelines(f"{text}\n" for text in texts)

### 2 Тестирование модели

#### 2.1 Custom embeddings

In [None]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# StratifiedKFold.split yields positional indices, so they are applied to plain
# NumPy arrays rather than to the label-based ``.loc`` (which only matches
# positions while the frame keeps its default RangeIndex).
texts = polariry_dataset["text_clean"].to_numpy()
labels = polariry_dataset["label"].to_numpy()

roc_auc_scores = []
for i, (train_index, test_index) in enumerate(stratified_kfold.split(texts, labels)):
    X_train, y_train = texts[train_index], labels[train_index]
    X_test, y_test = texts[test_index], labels[test_index]

    # The embedding model is trained on the training fold only; the test fold
    # is used solely for scoring.
    to_file("train.txt", X_train)

    # NOTE(review): wordNgrams is normally a supervised-model option; confirm
    # it has any effect for skipgram. Multi-threaded training may also make the
    # per-fold scores vary between runs; verify if exact reproducibility matters.
    model = fasttext.train_unsupervised(
        input="train.txt",
        model="skipgram",
        lr=0.1,
        dim=300,
        ws=5,
        epoch=25,
        minCount=5,
        minn=3,
        maxn=6,
        wordNgrams=3,
        loss="softmax",
        thread=10,
        verbose=False
    )

    X_train_embeddings = np.array([model.get_sentence_vector(text) for text in X_train])

    clf = LogisticRegression(C=5)
    clf.fit(X_train_embeddings, y_train)

    X_test_embeddings = np.array([model.get_sentence_vector(text) for text in X_test])

    y_pred_proba = clf.predict_proba(X_test_embeddings)

    # Column 1 holds the probability of the second entry in clf.classes_.
    roc_auc_scores.append(roc_auc_score(y_test, y_pred_proba[:, 1]))
    print(f"split {i} | roc_auc_score: {roc_auc_scores[-1]}")

print(f"roc_auc_score mean: {np.mean(roc_auc_scores)}")

#### 2.2 Pre-trained embeddings

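Скачать эмбеддинги (модель `cc.ru.300.bin`): https://fasttext.cc/docs/en/crawl-vectors.html. Документация Python-модуля: https://fasttext.cc/docs/en/python-module.html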

In [None]:
# Location of the pre-trained fastText model (cc.ru.300.bin). The
# FASTTEXT_MODEL_PATH environment variable overrides the default, so the
# notebook can run on machines where the original absolute path does not exist.
PRETRAINED_MODEL_PATH = os.environ.get(
    "FASTTEXT_MODEL_PATH",
    "/Users/samarin.ia/study/notebooks/hw1/model/cc.ru.300.bin",
)

model = fasttext.load_model(PRETRAINED_MODEL_PATH)

In [None]:
stratified_kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

# StratifiedKFold.split yields positional indices, so they are applied to plain
# NumPy arrays rather than to the label-based ``.loc`` (which only matches
# positions while the frame keeps its default RangeIndex).
texts = polariry_dataset["text_clean"].to_numpy()
labels = polariry_dataset["label"].to_numpy()

# The pre-trained model is not refitted per fold, so each text is embedded
# once here instead of once in every fold.
embeddings = np.array([model.get_sentence_vector(text) for text in texts])

roc_auc_scores = []
for i, (train_index, test_index) in enumerate(stratified_kfold.split(embeddings, labels)):
    y_train, y_test = labels[train_index], labels[test_index]
    X_train_embeddings = embeddings[train_index]
    X_test_embeddings = embeddings[test_index]

    clf = LogisticRegression(C=5)
    clf.fit(X_train_embeddings, y_train)

    y_pred_proba = clf.predict_proba(X_test_embeddings)

    # Column 1 holds the probability of the second entry in clf.classes_.
    roc_auc_scores.append(roc_auc_score(y_test, y_pred_proba[:, 1]))
    print(f"split {i} | roc_auc_score: {roc_auc_scores[-1]}")

print(f"roc_auc_score mean: {np.mean(roc_auc_scores)}")