In [None]:
# --- Standard library ---
import importlib
import io

# --- Third-party ---
import nltk
import numpy as np
import pandas as pd
import spacy
from nltk.corpus import stopwords
# `csr_matrix` comes from the public `scipy.sparse` namespace; the private
# `scipy.sparse.csr` module is deprecated.
from scipy.sparse import coo_matrix, csr_matrix, hstack
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report, f1_score
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import SVC

# The spaCy model is a separately installed package, so it has to be present
# *before* it is imported. Download it only when the import fails, which keeps
# "Restart Kernel -> Run All" working on a fresh environment and avoids a
# repeated download on every run.
try:
    import en_core_web_md
except ImportError:
    from spacy.cli import download as spacy_download

    spacy_download('en_core_web_md')
    # Make the freshly installed package visible to the import system.
    # NOTE(review): some environments may still need a kernel restart here.
    importlib.invalidate_caches()
    import en_core_web_md

nlp = en_core_web_md.load()

# NLTK corpora; `nltk.download` skips resources that are already up to date.
nltk.download('stopwords')
nltk.download('wordnet')

# Semicolon-separated tweet data set with (at least) `text` and `target`
# columns, as used by the cells below.
DATA_URL = ('https://raw.githubusercontent.com/esolovev/ling2019/master/'
            'module2/twi_data.csv')
df = pd.read_csv(DATA_URL, sep=';')


In [2]:
# One seed drives both NumPy's global RNG and the split, for reproducibility.
SEED = 227
np.random.seed(SEED)

# Stratified subsample of the data: 20% of the rows for training and 10% for
# testing; the remaining rows are not used.
df_train, df_test = train_test_split(
    df,
    train_size=0.2,
    test_size=0.1,
    stratify=df.target,
    random_state=SEED,
)
y_train, y_test = df_train.target, df_test.target


def score(model, X=None, y=None):
    """Print the accuracy of ``model`` and return its classification report.

    Parameters
    ----------
    model
        A fitted estimator exposing ``predict``.
    X : optional
        Features to predict on. Defaults to the notebook-level ``X_test``
        as it is defined at call time.
    y : optional
        True labels matching ``X``. Defaults to the notebook-level
        ``y_test``.

    Returns
    -------
    The value returned by ``classification_report(y_true, y_pred)``.
    """
    features = X_test if X is None else X
    y_true = y_test if y is None else y
    y_pred = model.predict(features)
    # scikit-learn metrics take (y_true, y_pred); keep that order for both
    # calls so they stay consistent with each other.
    print(f'Accuracy: {accuracy_score(y_true, y_pred)}')
    return classification_report(y_true, y_pred)


Baseline 1

In [3]:
# Multinomial Naive Bayes on two bag-of-words representations: raw counts
# first, then TF-IDF weights. The same estimator object is refitted each time.
count_vectorizer = CountVectorizer()
tfidf_vectorizer = TfidfVectorizer()
model = MultinomialNB()

for vectorizer in (count_vectorizer, tfidf_vectorizer):
    # `score` reads the notebook-level X_test, so it must be rebound here.
    X_train = vectorizer.fit_transform(df_train.text)
    X_test = vectorizer.transform(df_test.text)
    model.fit(X_train, y_train)

    print(score(model))


Accuracy: 0.73875
              precision    recall  f1-score   support

           0       0.71      0.82      0.76      2000
           4       0.78      0.66      0.72      2000

    accuracy                           0.74      4000
   macro avg       0.74      0.74      0.74      4000
weighted avg       0.74      0.74      0.74      4000

Accuracy: 0.7405
              precision    recall  f1-score   support

           0       0.70      0.83      0.76      2000
           4       0.79      0.65      0.71      2000

    accuracy                           0.74      4000
   macro avg       0.75      0.74      0.74      4000
weighted avg       0.75      0.74      0.74      4000



Baseline 2


In [4]:
# --- Model 1: default word-level TF-IDF + logistic regression ---
tfidf_vectorizer = TfidfVectorizer()
X_train = tfidf_vectorizer.fit_transform(df_train.text)
X_test = tfidf_vectorizer.transform(df_test.text)
model = LogisticRegression(random_state=SEED, solver='liblinear')
model.fit(X_train, y_train)
print(score(model))

# Disabled hyper-parameter search, kept for reference:
# param_grid = { 'C': [5,10,15,20],
#                'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga']}
# grid = GridSearchCV(estimator=LogisticRegression(random_state=SEED,
#                        max_iter = 1000),
#                       param_grid=param_grid,
#                      cv = 10)
# model = grid.fit(X_train, y_train)
# print(score(model))

# Stop-word list shared with the word-level vectorizers in the Baseline 3 and
# Baseline 4 cells.
# NOTE(review): this is the Russian list while the spaCy model loaded above is
# English - confirm which language the tweets are in.
stop_words = stopwords.words('russian')

# --- Model 2: character n-gram TF-IDF + SVC ---
# `stop_words` is only applied by the 'word' analyzer; with a character
# analyzer scikit-learn ignores it and emits a warning, so it is not passed.
tfidf_vectorizer = TfidfVectorizer(analyzer='char_wb',
                                   lowercase=True,
                                   ngram_range=(2, 5),
                                   max_df=0.95,
                                   min_df=4,
                                   norm='l2',
                                   use_idf=True)
X_train = tfidf_vectorizer.fit_transform(df_train.text)
X_test = tfidf_vectorizer.transform(df_test.text)
model = SVC(random_state=SEED)
model.fit(X_train, y_train)
print(score(model))


Accuracy: 0.75325
              precision    recall  f1-score   support

           0       0.75      0.76      0.76      2000
           4       0.76      0.74      0.75      2000

    accuracy                           0.75      4000
   macro avg       0.75      0.75      0.75      4000
weighted avg       0.75      0.75      0.75      4000

Accuracy: 0.7625
              precision    recall  f1-score   support

           0       0.76      0.76      0.76      2000
           4       0.76      0.76      0.76      2000

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000



Baseline 3

In [14]:
# --- Model 1: word (1-4)-gram + char (3-4)-gram TF-IDF, logistic regression ---
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 4))
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train.text)
X_test_tfidf = tfidf_vectorizer.transform(df_test.text)

tfidf_vectorizer_char = TfidfVectorizer(ngram_range=(3, 4), analyzer='char')
X_train_tfidf_char = tfidf_vectorizer_char.fit_transform(df_train.text)
X_test_tfidf_char = tfidf_vectorizer_char.transform(df_test.text)

# Concatenate the word-level and character-level feature columns.
X_train = hstack((X_train_tfidf, X_train_tfidf_char))
X_test = hstack((X_test_tfidf, X_test_tfidf_char))

model = LogisticRegression(random_state=SEED, solver='liblinear')
model.fit(X_train, y_train)

print(score(model))

# --- Model 2: filtered word (1-2)-gram + char (3-5)-gram TF-IDF, SVC ---
# `stop_words` is defined in the Baseline 2 cell, which must run first.
# NOTE(review): it holds the Russian NLTK list - confirm that this matches the
# language of the tweets.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                   analyzer='word',
                                   lowercase=True,
                                   ngram_range=(1, 2),
                                   max_df=0.95,
                                   min_df=4,
                                   norm='l2',
                                   use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train.text)
X_test_tfidf = tfidf_vectorizer.transform(df_test.text)

# Stop words are only applied by the 'word' analyzer; with analyzer='char'
# scikit-learn ignores them and emits a warning, so they are not passed here.
tfidf_vectorizer_char = TfidfVectorizer(analyzer='char',
                                        lowercase=True,
                                        ngram_range=(3, 5),
                                        max_df=0.95,
                                        min_df=4,
                                        norm='l2',
                                        use_idf=True)
X_train_tfidf_char = tfidf_vectorizer_char.fit_transform(df_train.text)
X_test_tfidf_char = tfidf_vectorizer_char.transform(df_test.text)

X_train = hstack((X_train_tfidf, X_train_tfidf_char))
X_test = hstack((X_test_tfidf, X_test_tfidf_char))

model = SVC(random_state=SEED)
model.fit(X_train, y_train)

print(score(model))


Accuracy: 0.7635
              precision    recall  f1-score   support

           0       0.76      0.77      0.76      2000
           4       0.77      0.76      0.76      2000

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000

Accuracy: 0.76425
              precision    recall  f1-score   support

           0       0.76      0.77      0.77      2000
           4       0.77      0.75      0.76      2000

    accuracy                           0.76      4000
   macro avg       0.76      0.76      0.76      4000
weighted avg       0.76      0.76      0.76      4000



Baseline 4

In [6]:
# spaCy document vectors depend only on the raw text, not on any vectorizer
# setting, so they are computed once here and reused by both models below
# (running `nlp` over every tweet is the slow part of this cell).
X_train_vectors = csr_matrix([nlp(twi_text).vector
                              for twi_text in df_train.text])
X_test_vectors = csr_matrix([nlp(twi_text).vector
                             for twi_text in df_test.text])

# --- Model 1: word (1-4)-gram + char (3-4)-gram TF-IDF + doc vectors ---
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 4))
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train.text)
X_test_tfidf = tfidf_vectorizer.transform(df_test.text)

tfidf_vectorizer_char = TfidfVectorizer(ngram_range=(3, 4), analyzer='char')
X_train_tfidf_char = tfidf_vectorizer_char.fit_transform(df_train.text)
X_test_tfidf_char = tfidf_vectorizer_char.transform(df_test.text)

X_train = hstack((X_train_tfidf, X_train_tfidf_char, X_train_vectors))
X_test = hstack((X_test_tfidf, X_test_tfidf_char, X_test_vectors))

model = LogisticRegression(random_state=SEED, solver='liblinear')
model.fit(X_train, y_train)

print(score(model))

# --- Model 2: filtered word (1-2)-gram + char (3-5)-gram TF-IDF + doc vectors
# `stop_words` is defined in the Baseline 2 cell, which must run first.
# NOTE(review): it holds the Russian NLTK list - confirm that this matches the
# language of the tweets.
tfidf_vectorizer = TfidfVectorizer(stop_words=stop_words,
                                   analyzer='word',
                                   lowercase=True,
                                   ngram_range=(1, 2),
                                   max_df=0.95,
                                   min_df=4,
                                   norm='l2',
                                   use_idf=True)
X_train_tfidf = tfidf_vectorizer.fit_transform(df_train.text)
X_test_tfidf = tfidf_vectorizer.transform(df_test.text)

# Stop words are only applied by the 'word' analyzer; with analyzer='char'
# scikit-learn ignores them and emits a warning, so they are not passed here.
tfidf_vectorizer_char = TfidfVectorizer(analyzer='char',
                                        lowercase=True,
                                        ngram_range=(3, 5),
                                        max_df=0.95,
                                        min_df=4,
                                        norm='l2',
                                        use_idf=True)
X_train_tfidf_char = tfidf_vectorizer_char.fit_transform(df_train.text)
X_test_tfidf_char = tfidf_vectorizer_char.transform(df_test.text)

X_train = hstack((X_train_tfidf, X_train_tfidf_char, X_train_vectors))
X_test = hstack((X_test_tfidf, X_test_tfidf_char, X_test_vectors))

model = LogisticRegression(random_state=SEED, solver='liblinear')
model.fit(X_train, y_train)

print(score(model))


Accuracy: 0.777
              precision    recall  f1-score   support

           0       0.77      0.79      0.78      2000
           4       0.78      0.76      0.77      2000

    accuracy                           0.78      4000
   macro avg       0.78      0.78      0.78      4000
weighted avg       0.78      0.78      0.78      4000

Accuracy: 0.78125
              precision    recall  f1-score   support

           0       0.78      0.79      0.78      2000
           4       0.79      0.77      0.78      2000

    accuracy                           0.78      4000
   macro avg       0.78      0.78      0.78      4000
weighted avg       0.78      0.78      0.78      4000



In [None]:
# Attempted text preprocessing for the various baselines; accuracy did not
# noticeably improve anywhere. (This was developed in PyCharm because it runs
# very slowly in the notebook.)

import regex as re
import nltk
from nltk.corpus import stopwords
from pymystem3 import Mystem
from string import punctuation

# Shared Russian-language resources used by the helper functions below.
russian_stopwords = stopwords.words('russian')
lemmatizer = Mystem()
russian_stemmer = nltk.stem.snowball.SnowballStemmer("russian")


def remove_stowords(text):
    """Drop whitespace-separated tokens that are Russian stop words.

    A token is removed when its lower-cased form appears in the module-level
    ``russian_stopwords`` list; the surviving tokens keep their original
    casing and are re-joined with single spaces.
    """
    kept = []
    for token in text.split():
        if token.lower() in russian_stopwords:
            continue
        kept.append(token)
    return ' '.join(kept)


def remove_punctuation_and_make_lowercase(text):
    """Strip punctuation from every token of ``text`` and lower-case it.

    The text is split on whitespace, each token has all characters from
    ``string.punctuation`` (ASCII punctuation) deleted and is lower-cased,
    and the non-empty results are joined with single spaces.

    Tokens that consist solely of punctuation are dropped, so the result
    never contains doubled or trailing spaces. An empty or punctuation-only
    input yields ``''``.
    """
    # Build the deletion table once instead of once per token.
    strip_table = str.maketrans('', '', punctuation)
    cleaned = (token.translate(strip_table).lower() for token in text.split())
    # A token such as '...' collapses to '' after stripping; skip those.
    return ' '.join(token for token in cleaned if token)


def lemmatization(text):
    """Lemmatize ``text`` with the shared Mystem instance.

    Returns the items produced by ``lemmatizer.lemmatize`` joined with single
    spaces.
    """
    lemmas = lemmatizer.lemmatize(text)
    return ' '.join(lemmas)


def stemmatize(text):
    """Stem each whitespace-separated token of ``text``.

    Every token is passed through the module-level Russian Snowball stemmer
    and the stems are re-joined with single spaces.
    """
    stems = map(russian_stemmer.stem, text.split())
    return ' '.join(stems)


def corpora_with_stemmed_column(data):
    """Add a ``stemmed_text`` column holding the preprocessed ``text`` column.

    Each text goes through, in order: stop-word removal, punctuation removal
    plus lower-casing, lemmatization and stemming.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain a ``text`` column.

    Returns
    -------
    pandas.DataFrame
        The same ``data`` object; the new column is added in place.
    """
    # Collect the results in a list and assign once. Growing a DataFrame row
    # by row with ``DataFrame.append`` is quadratic, and that method was
    # removed in pandas 2.0.
    data['stemmed_text'] = [
        stemmatize(
            lemmatization(
                remove_punctuation_and_make_lowercase(
                    remove_stowords(text))))
        for text in data['text']
    ]
    return data

# Note: corpora_with_stemmed_column adds the column to `df` in place and
# returns that same object, so `stemmed_data` and `df` are one DataFrame.
stemmed_data = corpora_with_stemmed_column(df)
