In [1]:
from lxml import etree

In [2]:
from typing import List, Tuple

In [3]:
def load_sentirueval_2016(file_name: str) -> Tuple[List[str], List[str]]:
    """Read texts and sentiment labels from a SentiRuEval-2016-style XML file.

    The parsed root is scanned for its first ``database`` child. Every ``table``
    child of that element describes one example through its child elements,
    each identified by a ``name`` attribute:

    * the element named ``text`` holds the example text;
    * the first other element (not named ``id``, ``twitid`` or ``date``) whose
      stripped value is ``-1``, ``0`` or ``1`` gives the label, mapped to
      ``'negative'``, ``'neutral'`` or ``'positive'`` respectively.

    Args:
        file_name: Path to the XML file.

    Returns:
        A ``(texts, labels)`` pair of equally long lists, in document order.

    Raises:
        ValueError: If some ``table`` has no ``text`` element or no element
            with a recognised label value.
    """
    label_names = {'-1': 'negative', '0': 'neutral', '1': 'positive'}
    non_label_columns = {'id', 'twitid', 'date'}
    texts = []
    labels = []
    # Read raw bytes so the parser can honour the encoding declared in the XML.
    with open(file_name, mode='rb') as fp:
        xml_data = fp.read()
    root = etree.fromstring(xml_data)
    # Elements are iterated directly: getchildren() is deprecated.
    for database in root:
        if database.tag != 'database':
            continue
        for table in database:
            if table.tag != 'table':
                continue
            new_text = None
            new_label = None
            for column in table:
                column_name = column.get('name')
                # An empty element has text None; use '' rather than the
                # literal string 'None' that str(None) would produce.
                value = (column.text or '').strip()
                if column_name == 'text':
                    new_text = value
                elif column_name not in non_label_columns and new_label is None:
                    # Unrecognised values leave the label unset (None).
                    new_label = label_names.get(value)
                if new_text is not None and new_label is not None:
                    # Both parts found; the remaining columns are irrelevant.
                    break
            if (new_text is None) or (new_label is None):
                raise ValueError('File `{0}` contains some error!'.format(file_name))
            texts.append(new_text)
            labels.append(new_label)
        # Only the first `database` element is processed.
        break
    return texts, labels

In [4]:
# Load the texts and their sentiment labels from the XML file (path is relative to the working directory).
texts, labels = load_sentirueval_2016('bank_train_2016.xml')

In [5]:
# Sanity check: report how many texts and how many labels were loaded.
print('Number of texts is {0}, number of labels is {1}.'.format(len(texts), len(labels)))

Number of texts is 9392, number of labels is 9392.


In [6]:
# 1) токенизацию с учётом возможных смайлов (базовая токенизация некорректно
# работает со знаками препинания и прочими неалфавитными и нецифровыми
# символами, из которых как раз и могут состоять тонально значимые смайлы)

In [7]:
# 2) лемматизацию с учётом контекста, чтобы успешно разрешать морфоомонимию вида
# “мы стали лучше программировать” - “мы выплавляем больше стали” (для такой
# лемматизации можно использовать, например, библиотеку UDPipe или её адаптацию
# под SpaCy)

# 3) удаление стоп-слов по словарям и/или правилам (например, описанным в виде
# регулярных выражений).

In [8]:
from nltk.tokenize import TweetTokenizer
from rnnmorph.predictor import RNNMorphPredictor

# Module-level instances used by drop_junk() and lemmatization() in this cell.
predictor = RNNMorphPredictor(language='ru')  # morphological predictor created with language='ru'
tokenizer = TweetTokenizer()  # NLTK tweet tokenizer with default settings

def drop_junk(s):
    """Keep only alphabetic tokens and tokens containing '!', '?', '(' or ')'.

    The string is split with the module-level ``tokenizer``; the surviving
    tokens are joined back together with single spaces.
    """
    kept_marks = ('!', '?', '(', ')')
    kept_tokens = []
    for token in tokenizer.tokenize(s):
        if token.isalpha() or any(mark in token for mark in kept_marks):
            kept_tokens.append(token)
    return ' '.join(kept_tokens)

def lemmatization(s):
    """Replace every token of ``s`` with the normal form predicted for it.

    Tokens come from the module-level ``tokenizer``; the module-level
    ``predictor`` analyses the whole token list at once, and the ``normal_form``
    of each returned item is joined with single spaces.
    """
    tokens = tokenizer.tokenize(s)
    lemmas = [analysis.normal_form for analysis in predictor.predict(tokens)]
    return ' '.join(lemmas)

# Reload the corpus from disk, then strip unwanted tokens from every text
# and lemmatize the result (two passes, in that order).
texts, labels = load_sentirueval_2016('bank_train_2016.xml')

texts = [drop_junk(raw_text) for raw_text in texts]
texts = [lemmatization(clean_text) for clean_text in texts]

ModuleNotFoundError: No module named 'rnnmorph'

In [9]:
from stop_words import get_stop_words
# Stop-word list requested for 'ru'; no other cell visible in this notebook references it.
stop_words = get_stop_words('ru')

ModuleNotFoundError: No module named 'stop_words'

In [None]:
import random

In [None]:
# Show 20 examples drawn at random (with replacement) as "label => text".
for idx in random.choices(range(len(texts)), k=20):
    print('{0} => {1}'.format(labels[idx], texts[idx]))

In [None]:
# Split the texts into three lists according to their label.
positive_tweets = [text for text, label in zip(texts, labels) if label == 'positive']
negative_tweets = [text for text, label in zip(texts, labels) if label == 'negative']
neutral_tweets = [text for text, label in zip(texts, labels) if label == 'neutral']

In [None]:
# Preview the first five positive texts, one per line.
for cur in positive_tweets[:5]:
    print(cur)

In [None]:
# Preview the first five negative texts, one per line.
for cur in negative_tweets[:5]:
    print(cur)

In [None]:
from nltk import word_tokenize
#import nltk
#nltk.download()

In [None]:
from sklearn.feature_extraction.text import CountVectorizer

In [None]:
# Bag-of-words vectorizer: lowercases the input and splits it with NLTK's word_tokenize.
vectorizer = CountVectorizer(lowercase=True, tokenizer=word_tokenize)

In [None]:
# Learn the vocabulary from the texts.
vectorizer.fit(texts)

In [None]:
# Show the first 20 vocabulary entries.
# get_feature_names() was removed in scikit-learn 1.2; get_feature_names_out()
# is its replacement. list(...) keeps the printed form a plain Python list.
print(list(vectorizer.get_feature_names_out()[0:20]))

In [None]:
# Vocabulary size. get_feature_names() was removed in scikit-learn 1.2;
# get_feature_names_out() is its replacement.
print(len(vectorizer.get_feature_names_out()))

In [None]:
# Encode the texts as a document-term count matrix using the fitted vocabulary.
X = vectorizer.transform(texts)

In [None]:
# Check which container type the vectorizer returned.
print(type(X))

In [None]:
# Show the first preprocessed text for comparison with its vector below.
print(texts[0])

In [None]:
# Show the count-vector row that corresponds to the first text.
print(X[0])

In [None]:
# Look up the token stored at vocabulary index 6321.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(vectorizer.get_feature_names_out()[6321])

In [None]:
# Look up the token stored at vocabulary index 9866.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(vectorizer.get_feature_names_out()[9866])

In [None]:
# Look up the token stored at vocabulary index 19056.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(vectorizer.get_feature_names_out()[19056])

In [None]:
from sklearn.feature_extraction.text import TfidfTransformer

In [None]:
# Fit a TF-IDF transformer on the count matrix; the fitted IDF weights are read from transformer.idf_ later.
transformer = TfidfTransformer().fit(X)

In [None]:
# Apply the fitted TF-IDF weighting to the count matrix. Later cells read
# X_transformed, so this assignment must actually run.
X_transformed = transformer.transform(X)

In [None]:
# Show the first row of X_transformed (expected to be the TF-IDF-weighted matrix).
print(X_transformed[0])

In [None]:
# Look up the token stored at vocabulary index 6321.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(vectorizer.get_feature_names_out()[6321])

In [None]:
# Look up the token stored at vocabulary index 5979.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
print(vectorizer.get_feature_names_out()[5979])

In [None]:
#print(vectorizer.get_feature_names()[7199])

In [None]:
# Pair every vocabulary token with its fitted IDF weight, in feature-index order.
# get_feature_names() was removed in scikit-learn 1.2; use get_feature_names_out().
tokens_with_IDF = list(zip(vectorizer.get_feature_names_out(), transformer.idf_))

In [None]:
# Print the first 20 (token, IDF) pairs as "idf => token".
for feature, idf in tokens_with_IDF[:20]:
    print('{0:.6f} => {1}'.format(idf, feature))

In [None]:
# Order the pairs by descending IDF; equal IDF values are ordered by token in ascending string order.
sorted_tokens_with_IDF = sorted(tokens_with_IDF, key=lambda it: (-it[1], it[0]))

In [None]:
# Print the 20 pairs with the highest IDF as "idf => token".
for feature, idf in sorted_tokens_with_IDF[:20]:
    print('{0:.6f} => {1}'.format(idf, feature))

In [None]:
from sklearn.feature_selection import SelectPercentile, chi2

In [None]:
# Feature selector scored with the chi-squared statistic, configured with percentile=20.
selector = SelectPercentile(chi2, percentile=20)

In [None]:
# Score every feature of X_transformed against the labels.
selector.fit(X_transformed, labels)

In [None]:
# Keep the (token, IDF) pairs whose feature index the selector reports as selected.
selected_tokens_with_IDF = [tokens_with_IDF[idx] for idx in selector.get_support(indices=True)]

In [None]:
# Number of features that survived the selection.
print(len(selected_tokens_with_IDF))

In [None]:
# Print the first 20 selected (token, IDF) pairs as "idf => token".
for feature, idf in selected_tokens_with_IDF[:20]:
    print('{0:.6f} => {1}'.format(idf, feature))

In [None]:
# Order the selected pairs by descending IDF; equal IDF values are ordered by token in ascending string order.
selected_and_sorted_tokens_with_IDF = sorted(selected_tokens_with_IDF, key=lambda it: (-it[1], it[0]))

In [None]:
# Print the 20 selected pairs with the highest IDF as "idf => token".
for feature, idf in selected_and_sorted_tokens_with_IDF[:20]:
    print('{0:.6f} => {1}'.format(idf, feature))