In [None]:
import csv
from collections import Counter

import pandas as pd
import spacy
from matplotlib import pyplot as plt
from sklearn import metrics
from sklearn.base import TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

In [None]:
# Loaded spaCy pipelines keyed by model path. Loading a pipeline from disk is
# slow, and predict_sentiment is called once per input row, so each pipeline
# is loaded at most once per definition of this cell.
_SENTIMENT_MODELS = {}


def predict_sentiment(text, model_path="model_artifacts"):
    """Score ``text`` with a saved spaCy pipeline and return its category scores.

    Parameters
    ----------
    text : str
        The text to classify.
    model_path : str, optional
        Path passed to ``spacy.load``. Defaults to ``"model_artifacts"``.
        The loaded pipeline is cached per path and reused on later calls.

    Returns
    -------
    tuple
        ``(cats["+"], cats["−"], cats["?"])`` read from the ``cats`` mapping
        of the processed document, in that order.

    Raises
    ------
    KeyError
        If the processed document's ``cats`` lacks one of the three labels.
    """
    model = _SENTIMENT_MODELS.get(model_path)
    if model is None:
        model = spacy.load(model_path)
        _SENTIMENT_MODELS[model_path] = model

    cats = model(text).cats
    # The middle label is the Unicode minus sign (U+2212), not an ASCII hyphen.
    return cats["+"], cats["−"], cats["?"]

In [None]:
# Read the labelled data and discard the 'Unnamed: 0' column.
train = pd.read_csv('train.csv').drop(columns=['Unnamed: 0'])

# Keep the rows whose '1category' value is not '?', without the
# '2category' and 'sentiment' columns.
X = train.loc[train['1category'] != '?'].drop(columns=['2category', 'sentiment'])

# 70/30 split with a fixed seed: 'sentence' is the input, '1category' the target.
X_train, X_test, y_train, y_test = train_test_split(
    X['sentence'],
    X['1category'],
    test_size=0.3,
    random_state=69,
)

# spaCy pipeline used by `cleaner` below.
nlp = spacy.load('ru_core_news_sm')

# Lower-cased lemmas that `cleaner` removes from its output: punctuation,
# non-breaking-space strings and the words 'банк' and 'карта'.
_CLEANER_SKIP = frozenset((
    ',', '(', ')', '%', 'банк', 'карта', '\xa0', '\xa0 ', '-', '!', '?', '.',
    ':', '"', '«', '»', '...', '/',
))


def cleaner(sentence):
    """Return the filtered, lower-cased lemmas of ``sentence``.

    The sentence is processed with the module-level ``nlp`` pipeline. Tokens
    flagged as stop words (``is_stop``) or as number-like (``like_num``) are
    skipped; the lemma of every other token is lower-cased and kept unless it
    appears in ``_CLEANER_SKIP``.

    Parameters
    ----------
    sentence : str
        Text to tokenise.

    Returns
    -------
    list of str
        The surviving lemmas, in token order.
    """
    kept = []
    for token in nlp(sentence):
        if token.is_stop or token.like_num:
            continue
        lemma = token.lemma_.lower()
        if lemma not in _CLEANER_SKIP:
            kept.append(lemma)
    return kept

class predictors(TransformerMixin):
    """Stateless pipeline step that strips and lower-cases every input text."""

    def fit(self, X, y=None, **fit_params):
        """Do nothing and return ``self``; the arguments are ignored."""
        return self

    def transform(self, X, **transform_params):
        """Return a list with each text of ``X`` stripped and lower-cased.

        ``transform_params`` is accepted but not used.
        """
        normalised = []
        for text in X:
            normalised.append(text.strip().lower())
        return normalised

    def get_params(self, deep=True):
        """Return an empty dict; ``deep`` is ignored."""
        return dict()
    
# Bag-of-words (unigram) counts over the lemmas produced by `cleaner`.
# token_pattern=None: the default regex is never used once a custom tokenizer
# is supplied, and leaving it set makes scikit-learn warn about the unused
# parameter when the vectorizer is fitted.
bow_vector = CountVectorizer(tokenizer=cleaner, ngram_range=(1, 1), token_pattern=None)

# LogisticRegression is imported in the notebook's import cell.
clf = LogisticRegression()

# Text normalisation -> vectorisation -> classification.
pipe = Pipeline([
    ("cleaner", predictors()),
    ('vectorizer', bow_vector),
    ('classifier', clf),
])

pipe.fit(X_train, y_train)

In [None]:
# Predict on the held-out split and print per-class precision/recall/F1.
# `metrics` is imported in the notebook's import cell.
predicted = pipe.predict(X_test)

print(metrics.classification_report(y_test, predicted))

In [None]:
# Preview the class probabilities for the first few data rows of train.csv.
PREVIEW_ROWS = 9  # number of data rows (header excluded) to score

texts = []
# The `with` block closes the file. csv.reader parses quoted fields, so a text
# containing commas is passed to the model whole instead of being cut at the
# first comma.
with open('train.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    next(reader, None)  # skip the header row
    for row in reader:
        if len(texts) == PREVIEW_ROWS:
            break
        # Rows without a second column (e.g. blank lines) have no text.
        if len(row) > 1:
            texts.append(row[1])

print(pipe.predict_proba(texts))

In [None]:
# Score every row of train.csv with both models and write the rows, extended
# with seven probability columns, to test.csv.
#
# The file is parsed with the csv module rather than str.split(','), so a
# quoted text field containing commas or line breaks reaches the models
# intact. Both files are closed by their `with` blocks.
with open('train.csv', 'r', encoding='utf-8', newline='') as f:
    reader = csv.reader(f)
    header = next(reader, None)
    # Rows without a second column (e.g. blank lines) carry no text to score.
    rows = [row for row in reader if len(row) > 1]

# The text to score is the second column of each row.
texts = [row[1] for row in rows]

# `ans` holds the output rows as lists of fields, header first.
ans = []
if header is not None:
    ans.append(header + ['+', '-', '?', 'Communication', 'Quality', 'Price', 'Safety'])

# Nothing to score for an empty or header-only file.
pr_proba = pipe.predict_proba(texts) if texts else []

for row, text, proba in zip(rows, texts, pr_proba):
    p_pos, p_neg, p_nei = predict_sentiment(text)

    # Normalise all three scores by the same total. Computing the total once
    # matters: re-summing after p_pos has been rescaled would divide p_neg
    # and p_nei by a different denominator. A zero total is left untouched.
    total = p_pos + p_neg + p_nei
    if total:
        p_pos, p_neg, p_nei = p_pos / total, p_neg / total, p_nei / total

    # NOTE(review): predict_proba columns 1 and 2 are swapped on output to
    # match the header order Communication, Quality, Price, Safety. This
    # presumably relies on pipe.classes_ being ordered Communication, Price,
    # Quality, Safety; confirm against the labels in '1category'.
    p_com = proba[0]
    p_qua = proba[2]
    p_pri = proba[1]
    p_saf = proba[3]

    ans.append(row + [
        f'{p_pos}', f'{p_neg}', f'{p_nei}',
        f'{p_com}', f'{p_qua}', f'{p_pri}', f'{p_saf}',
    ])

with open('test.csv', 'w', encoding='utf-8', newline='') as f:
    csv.writer(f, lineterminator='\n').writerows(ans)