In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import SnowballStemmer
from collections import defaultdict
import string
import random 
import re
import csv

In [3]:
from sklearn.svm import LinearSVC,SVC
from sklearn.linear_model import LogisticRegression, LinearRegression, Ridge, Lasso, SGDClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfVectorizer,TfidfTransformer
from sklearn.naive_bayes import MultinomialNB 
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt

In [4]:
import keras
from keras.preprocessing.text import Tokenizer #one_hot
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Flatten, Embedding, LSTM, SpatialDropout1D, Input, Bidirectional, Dropout
from keras.callbacks import EarlyStopping, ReduceLROnPlateau

In [5]:
import pymorphy2
morph = pymorphy2.MorphAnalyzer()

In [24]:
def typos(word,count):
    repl_symb={'а':['в','п','к','м','с'],
               'б':['ь','ю','л','д'],            
               'в':['у','с','ч','ы','а'],
               'г':['н','ш','о'],
               'д':['щ','ж','л','б','ю'],
               'е':['к','н','п'],
               'ж':['д','э','з','ю'],
               'з':['щ','ж','х'],
               'и':['м','п','р','т'],
               'й':['ц','ф'],
               'к':['у','е','а'],
               'л':['о','д','ш','ь','б'],
               'м':['с','и','а','п'],
               'н':['е','г','р'],
               'о':['г','р','т','ь','л'],
               'п':['а','м','и','р','е'],
               'р':['н','п','и','т','о'],
               'с':['ч','в','а','м'],
               'т':['и','р','о','ь'],
               'у':['ц','ы','в','в','к'],
               'ф':['й','ы','я'],
               'х':['з','э','ъ'],
               'ц':['й','ф'],
               'ч':['я','ы','в','с'],
               'ш':['г','л','щ'],
               'щ':['ш','д','з'],
               'ъ':['х'],
               'ы':['ц','ч','ф','в'],
               'ь':['т','о','л','б'],
               'э':['ж','х'],
               'ю':['б','д','ж'],
               'я':['ф','ы','ч']}
    result=[]
    l=len(word)
    if count>=l:
        for i in range(l):
            for c in repl_symb[word[i]]:
                result.append(word[:i]+c+word[i+1:])
    else:
        for cur in range(count):   
            pos=np.random.randint(l)
            for c in repl_symb[word[pos]]:                
                result.append(word[:pos]+c+word[pos+1:])    
       # result=random.choices(result, k = count)
    return result

In [25]:
def del_char(word,count):
    result=[]
    l=len(word)
    if count>=l:
        for i in range(l):
            result.append(word[:i]+word[i+1:])
    else:
        for cur in range(count):            
            pos=np.random.randint(l)
            result.append(word[:pos]+word[pos+1:])
    return result

In [26]:
def add_char(word,count):
    vocab='абвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    l_v=len(vocab)
    l=len(word)
    result=[]
    if count>=l:
        for i in range(l):
            pos=np.random.randint(l_v)
            result.append(word[:i]+vocab[pos]+word[i:])
    else:
        for cur in range(count):            
            pos=np.random.randint(l)  
            c_pos=np.random.randint(l_v)
            result.append(word[:pos]+vocab[c_pos]+word[pos:])
    return result

In [27]:
def swap_char(word,count):
    result=[]
    l=len(word)
    if count>=l:
        for i in range(1,l-1):
            left=word[:i]
            right=word[i:]
            if len(left)>0 and len(right)>0:
                result.append(left[:-1]+right[0]+left[-1]+right[1:])
    else:
        for cur in range(count): 
            pos=np.random.randint(l-2)+1
            left=word[:pos]
            right=word[pos:]
            if len(left)>0 and len(right)>0:
                result.append(left[:-1]+right[0]+left[-1]+right[1:])
    return result

In [28]:
def make_mistake(word,config,count):
    all_mistakes=[]
    if config['typos']:
        all_mistakes+=typos(word,count)
    if config['delchar']:
        all_mistakes+=del_char(word,count)    
    if config['addchar']:
        all_mistakes+=add_char(word,count)
    if config['swapchar']:
        all_mistakes+=swap_char(word,count)
    return all_mistakes
#random.choices(all_mistakes, k = count)

In [29]:
config={'typos':True,'delchar':True,'addchar':True,'swapchar':True}
make_mistake('исследователей',config,1)

['исседователей']

In [49]:
def sub_replace(fstr, tstr, text):
    reg = re.compile(r'('+fstr+')')
    output = re.sub(reg, tstr, text)
    return output

In [51]:
reg = 'dog'
rep = 'cat'
text = 'dog fat dog cat dog cat'

In [52]:
sub_replace(reg,rep,text)

'cat fat cat cat cat cat'

In [166]:
def augment_sentence(sentence,config,new_examples,mistake_words):
    reg=re.compile('^[а-я]+$')
    words=re.findall(r"[\w']+", sentence)    
    words=[word for word in words if len(word)>3 and reg.match(word)]  
    size=len(words)
    ans=[]
    for i in range(new_examples):
        bag=[]
        miswords=[]
        n=size//3
        num_mistakes=min(mistake_words,n)        
        newsentence=sentence
        for mistakes in range(num_mistakes):
            pos=0
            if size>0:
                pos=np.random.randint(size)                   
            else:
                break
            word=words.pop(pos)
            size=len(words)            
            if word not in bag and word in aug_words:
                bag.append(word)
                miswords=miswords+random.choices(aug_words[word], k = 1)
                #print(miswords)
                #make_mistake(word,config,1)                    
            else: 
                continue 
        for word,misword in zip(bag,miswords):
            newsentence= newsentence.replace(word, misword, 1)
            #sub_replace(word,misword,newsentence)
        if newsentence!=sentence:
            ans.append(newsentence)
    return ans

In [167]:
config={'typos':True,'delchar':True,'addchar':True,'swapchar':True}
prepare_words('Это отличное решение для студентов, специалистов по обработке данных и исследователей в области искусственного интеллекта.')
augment_sentence('Это отличное решение для студентов, специалистов по обработке данных и исследователей в области искусственного интеллекта.',config,2,3)

['Это отличное решение для студентов, специалисотв по обработке днных и исследователей в области искусственного интеллекта.',
 'Это отличное решение для студентов, специалистов по обработке данных и исследователей в оласти чискусственного интеллекта.']

In [84]:
aug_words={}
def prepare_words(sentences):
    reg=re.compile('^[а-я]+$')
    for sentence in sentences:
        words=re.findall(r"[\w']+", sentence)    
        ru_words=[word for word in words if len(word)>3 and reg.match(word)] 
        for word in ru_words:
            if word not in aug_words:
                aug_words[word]=make_mistake(word.lower(),config,1)       
        

In [14]:
def preprocess_text_tags(text):
    text = text.lower().replace("ё", "е")
    text = re.sub('((www\.[^\s]+)|(http[s]?://[^\s]+))','url', text)     
    text = re.sub('@[^\s]+','USER', text)
    text = re.sub('\w+@[a-zA-Z_]+?\.[a-zA-Z]{2,4}','email', text)   
    text = re.sub('(?:\#+[\w_]+[\w\'_\-]*[\w_]+)','hashtag', text)
    text = re.sub('(?:(?:\d+,?)+(?:\.?\d+)?)','num', text)
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return ' '.join(text.split()) 

In [15]:
def preprocess_text(text):
    text = re.sub('((www\.[^\s]+)|(https?://[^\s]+))','URL', text)
    text = re.sub('@[^\s]+','USER', text)
    text = text.lower().replace("ё", "е")
    text = re.sub('[^a-zA-Zа-яА-Я1-9]+', ' ', text)
    text = re.sub(' +',' ', text)
    return ' '.join(text.split()) 

In [16]:
import emo_unicode
def convert_emoticons(text):
    for emot in emo_unicode.EMOTICONS:
        text = re.sub(u'('+emot+')', "_".join(emo_unicode.EMOTICONS[emot].replace(",","").split()), text)
    return text

In [17]:
def hard_filter_text(data):
    data = data.lower()
    data = re.sub(r'ё', r'е', data) # ё ----> е
    data = re.sub(r'Ё', r'Е', data)
    data = re.sub(r'[^а-яА-Я ]',r' ',data) #оставляем только русские буквы и пробелы (все ост символы заменяются на пробел)
    data = ' '.join(data.split()) #убираем лишние пробелы    
    return data

In [18]:

stops = set(stopwords.words("russian"))
snowball = SnowballStemmer(language="russian")

def clean_text(data):
    delstops = True
    simple_filter = True
    emoji2word = True
    del12gram = True  # True = убираем обрывки слов в 1-2 символа
    stem = False
    lemma = False    
    
    if lemma:
        data = " ".join([morph.parse(w)[0].normal_form for w in data.split()])
    if stem:
        data = " ".join([snowball.stem(w) for w in data.split()])
        
    if delstops:
        data = " ".join([w for w in data.split() if w not in stops])
    
    if simple_filter:
        data = preprocess_text(data)
    
    if del12gram:
        data = " ".join([w for w in data.split() if len(w) > 2])
        
    if emoji2word:
        data = convert_emoticons(data)
        
    return data

In [19]:
def ru_sent2num(tag):
    """
    document convert to vector (sum of token)
    """
    ans=-1000.0
    if 'Ужасно' in tag: 
        ans=-1.0
    elif 'Плохо' in tag: 
        ans=-0.8
    elif 'Хорошо' in tag: 
        ans=0.8
    elif 'Отлично' in tag: 
        ans=1.0
    elif 'Нормально' in tag:
        ans=0.0
    return ans

In [20]:
def ru2_sent2num(tag):
    """
    document convert to vector (sum of token)
    """
    ans=-1000.0
    if 'negative' in tag: 
        ans=-1.0
    elif 'positive' in tag: 
        ans=1.0
    elif 'speech' in tag: 
        ans=0.0
    elif 'skip' in tag: 
        ans=0.0
    elif 'neutral' in tag:
        ans=0.0
    return ans

In [21]:
def mix2num(tag):
    """
    document convert to vector (sum of token)
    """
    ans=-1000.0
    if ('-1' in tag) or ('--' in tag): 
        ans=-1.0
    elif ('1' in tag) or ('++' in tag): 
        ans=1.0
    elif ('0' in tag) or ('-+' in tag) or ('+-' in tag):
        ans=0.0
    return ans

In [22]:
def sent2num(tag):
    """
    document convert to vector (sum of token)
    """
    ans=-1000.0
    if 'negative' in tag: 
        ans=-1.0
    elif 'positive' in tag: 
        ans=1.0
    elif 'neutral' in tag:
        ans=0.0
    return ans

In [23]:
def sent2num2(tag):
    """
    document convert to vector (sum of token)
    """
    ans=-1000.0
    if 'negative' in tag: 
        ans=-1.0
    elif 'positive' in tag: 
        ans=1.0
    elif 'neautral' in tag:
        ans=0.0
    return ans

In [24]:
def shuffle_sentence(newlist,size):
    res_list=[]
    ml=max([len(sent) for sent in newlist])
    for i in range(0, ml):
        for sentence in newlist:
            tmp_list=sentence.split()
            l=len(tmp_list)            
            if l<i or l<3:
                continue
            res_list.append(" ".join(tmp_list[:i]+tmp_list[i+1:]))
            if len(res_list)>=size:
                return res_list
    return res_list          

In [25]:
def shuffle_word(newlist,size):
    res_list=[]
    l=len(newlist)
    while len(res_list)<size:        
        shuff_list=res_list or newlist
        for sentence in shuff_list:
            rn=np.random.randint(l)
            new_word=newlist[rn]
            res_list.append(" ".join([sentence,new_word]))
            if len(res_list)>=size:
                return res_list
    return res_list     

In [26]:
def tag2num(tag):
    """
    document convert to vector (sum of token)
    """
    ans=0
    if tag=='PSTV':
        ans=1
    elif tag=='NGTV':
        ans=-1
    return ans

In [27]:
def bank_test_2015_dataset(balance=True):
    df = pd.read_csv('./dataset/bank_test.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#0
#5296
#0
    return df

In [28]:
def bank_train_2015_dataset(balance=True):
    df = pd.read_csv('./dataset/bank_train.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
    df['tone'] = df['tone'].apply(mix2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#354
#3475
#1171
    return df

In [29]:
def bank_train_2016_dataset(balance=True):
    df = pd.read_csv('./dataset/bank_train_2016.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['tone'].apply(mix2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#701
#6961
#1730
    return df

In [30]:
def bank_test_2016_dataset(balance=True):
    df = pd.read_csv('./dataset/banks_test_2016.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#0
#19586
#0
    return df

In [31]:
def bank_test_etalon_dataset(balance=True):
    df = pd.read_csv('./dataset/banks_test_etalon.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#306
#2240
#767
    return df

In [32]:
def tel_train_2015_dataset(balance=True):
    df = pd.read_csv('./dataset/ttk_train.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
    df['tone'] = df['tone'].apply(mix2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])
#931
#2358
#1711
    return df

In [33]:
def tel_test_2015_dataset(balance=True):
    df = pd.read_csv('./dataset/ttk_test.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#0
#5322
#0
    return df

In [34]:
def tel_test_2016_dataset(balance=True):
    df = pd.read_csv('./dataset/tkk_test_2016.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#0
#19673
#0
    return df

In [35]:
def tel_train_2016_dataset(balance=True):
    df = pd.read_csv('./dataset/tkk_train_2016.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])

#1299
#4849
#2495
    return df

In [36]:
def tel_test_etalon_dataset(balance=True):
    df = pd.read_csv('./dataset/tkk_test_etalon.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
#    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])
#210
#1011
#1026
    return df

In [37]:
def news_dataset(balance=True):
    df = pd.read_csv('./dataset/news.csv', sep=',')
    df['clear_text'] = df['text'].map(clean_text)
    df['tone'] = df['sentiment'].apply(sent2num)
    df=df[['clear_text','tone']]
#    if balance:
#        newlist=df.clear_text[df.tone>0.1].tolist()
#        nlist=shuffle_sentence(newlist,13981)
#        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df['tone']=1.0
#        newlist=df.clear_text[df.tone<-0.1].tolist()
#        nlist=shuffle_sentence(newlist,16715)
#        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
#        tmp_df2['tone']=-1.0
#        df=pd.concat([df,tmp_df,tmp_df2])
#2795
#4034
#1434
    return df

In [38]:
def rusentiment_dataset(balance=True):
    df1 = pd.read_csv('./dataset/rusentiment_random_posts.csv',',')
    df2 = pd.read_csv('./dataset/rusentiment_preselected_posts.csv',',') 
    df3 = pd.read_csv('./dataset/rusentiment_test.csv',',') 
    df=pd.concat([df1,df2,df3])
    df['clear_text'] = df['text'].map(clean_text)
    df['tone'] = df['label'].apply(ru2_sent2num)
    df=df[['clear_text','tone']]
    if balance:
        newlist=df.clear_text[df.tone>0.1].tolist()
        nlist=shuffle_sentence(newlist,13981)
        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df['tone']=1.0
        newlist=df.clear_text[df.tone<-0.1].tolist()
        nlist=shuffle_sentence(newlist,16715)
        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df2['tone']=-1.0
        df=pd.concat([df,tmp_df,tmp_df2])
    return df

In [39]:
def medical_dataset(balance=True):
    df = pd.read_csv('./dataset/medical_comments.csv',sep=',')
    df['clear_text'] = df['text'].map(clean_text)
    df['tone'] = df['sentiment'].apply(ru_sent2num)
    df=df[['clear_text','tone']]
    if balance:
        newlist=df.clear_text[df.tone==0.0].tolist()
        nlist=shuffle_sentence(newlist,85806)
        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df['tone']=0.0
        newlist=df.clear_text[df.tone==-1.0].tolist()
        nlist=shuffle_sentence(newlist,28239)
        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df2['tone']=-1.0
        df=pd.concat([df,tmp_df,tmp_df2])
    return df

In [40]:
def clothes_dataset(balance=True):
    df = pd.read_csv('./dataset/women-clothing-accessories.3-class.balanced.txt',sep='\t')
    df['clear_text'] = df['review'].map(clean_text)
    df['tone'] = df['sentiment'].apply(sent2num2)
    df=df[['clear_text','tone']]
    return df

In [41]:
def emo_dict_old_dataset(balance=True):
    df = pd.read_csv('./dataset/emo_dict_old.csv',sep=';')
    df['clear_text'] = df['term'].map(clean_text)
    df['tone'] = df['tag'].apply(tag2num)
    df=df[['clear_text','tone']]
    if balance:
        newlist=df.clear_text[df.tone>0.1].tolist()
        nlist=shuffle_word(newlist,24747)
        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df['tone']=1.0
        newlist=df.clear_text[df.tone<-0.1].tolist()
        nlist=shuffle_word(newlist,24234)
        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df2['tone']=-1.0
        df=pd.concat([df,tmp_df,tmp_df2])
    return df

In [42]:
def emo_dict_dataset(balance=True):
    df = pd.read_csv('./dataset/emo_dict.csv',sep=';')
    df['clear_text'] = df['term'].map(clean_text)
    df['tone'] = df['tag'].apply(tag2num)
    df=df[['clear_text','tone']]
    if balance:
        newlist=df.clear_text[df.tone>0.1].tolist()
        nlist=shuffle_word(newlist,12323)
        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df['tone']=1.0
        newlist=df.clear_text[df.tone<-0.1].tolist()
        nlist=shuffle_word(newlist,10099)
        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df2['tone']=-1.0
        df=pd.concat([df,tmp_df,tmp_df2])
    return df

In [43]:
def tweet_dataset(balance=True):
    dtype = {'id': np.uint16, 'date': str, 'name': str, 'text': str,  'type': np.uint8}
    ppath = './dataset/positive.csv' #home
    npath = './dataset/negative.csv' #home
    pdf = pd.read_csv (ppath, sep=';', names=['id', 'date', 'name', 'text', 'type'], usecols=dtype.keys(), dtype=dtype)
    ndf = pd.read_csv (npath, sep=';', names=['id', 'date', 'name', 'text', 'type'], usecols=dtype.keys(), dtype=dtype)
    pdf.drop(['id', 'date', 'name'], axis=1, inplace=True)
    ndf.drop(['id', 'date', 'name'], axis=1, inplace=True)
    pdf['clear_text'] = pdf['text'].map(clean_text)
    ndf['clear_text'] = ndf['text'].map(clean_text)
    pdf.drop('text', axis=1, inplace=True)
    ndf.drop('text', axis=1, inplace=True)
    ndf['type'] = -1
    df = pd.concat([pdf, ndf], ignore_index=True)
    df = df.rename(columns={'type': 'tone'})
    return df
#114911
#111923

In [44]:
def social_dataset(balance=True):
    df = pd.read_csv('./dataset/marked.csv',sep=',')
    df['clear_text'] = df['FullText'].map(clean_text)
    df = df.rename(columns={'Tone': 'tone'})
    df=df[['clear_text','tone']]   
    if balance:
        newlist=df.clear_text[df.tone>0.1].tolist()
        nlist=shuffle_sentence(newlist,412)
        tmp_df =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df['tone']=1.0
        newlist=df.clear_text[df.tone==0.0].tolist()
        nlist=shuffle_sentence(newlist,292)
        tmp_df2 =pd.DataFrame(nlist,columns=['clear_text'])
        tmp_df2['tone']=0.0
        df=pd.concat([df,tmp_df,tmp_df2])
    return df

In [45]:
#rand_ru['vectr'] = rand_ru['clear_text'].apply(vectorize_sum)

In [46]:
def join_frames(df1,df2):
    if df1.shape[0]<1:
        return df2
    else:
        return pd.concat([df1,df2])
    
def load_dataset(datasets):
    df=pd.DataFrame(columns = ['clear_text','tone'])
    if datasets['rusentiment'][0]:
        df=join_frames(df,rusentiment_dataset(balance=datasets['rusentiment'][1]))
    if datasets['medical'][0]:
        df=join_frames(df,medical_dataset(balance=datasets['medical'][1]))
    if datasets['clothes'][0]:
        df=join_frames(df,clothes_dataset(balance=datasets['clothes'][1]))
    if datasets['emo_dict_old'][0]:
        df=join_frames(df,emo_dict_old_dataset(balance=datasets['emo_dict_old'][1]))
    if datasets['emo_dict'][0]:
        df=join_frames(df,emo_dict_dataset(balance=datasets['emo_dict'][1]))
    if datasets['tweet'][0]:
        df=join_frames(df,tweet_dataset(balance=datasets['tweet'][1]))
    if datasets['social'][0]:
        df=join_frames(df,social_dataset(balance=datasets['social'][1]))
    if datasets['bank_test_2015'][0]:
        df=join_frames(df,bank_test_2015_dataset(balance=datasets['bank_test_2015'][1]))        
    if datasets['bank_train_2015'][0]:
        df=join_frames(df,bank_train_2015_dataset(balance=datasets['bank_train_2015'][1]))      
    if datasets['bank_train_2016'][0]:
        df=join_frames(df,bank_train_2016_dataset(balance=datasets['bank_train_2016'][1]))      
    if datasets['bank_test_2016'][0]:
        df=join_frames(df,bank_test_2016_dataset(balance=datasets['bank_test_2016'][1]))      
    if datasets['bank_test_etalon'][0]:
        df=join_frames(df,bank_test_etalon_dataset(balance=datasets['bank_test_etalon'][1]))              
    if datasets['tel_train_2015'][0]:
        df=join_frames(df,tel_train_2015_dataset(balance=datasets['tel_train_2015'][1]))   
    if datasets['tel_test_2015'][0]:
        df=join_frames(df,tel_test_2015_dataset(balance=datasets['tel_test_2015'][1]))                   
    if datasets['tel_test_2016'][0]:
        df=join_frames(df,tel_test_2016_dataset(balance=datasets['tel_test_2016'][1]))                   
    if datasets['tel_train_2016'][0]:
        df=join_frames(df,tel_train_2016_dataset(balance=datasets['tel_train_2016'][1]))                   
    if datasets['tel_test_etalon'][0]:
        df=join_frames(df,tel_test_etalon_dataset(balance=datasets['tel_test_etalon'][1]))           
    if datasets['news'][0]:
        df=join_frames(df,news_dataset(balance=datasets['news'][1]))                   
    return df 

In [47]:
def word2vec_model(df_train,df_test):
    df_train['vectr'] = df_train['clear_text'].apply(vectorize_sum)
    df_test['vectr'] = df_test['clear_text'].apply(vectorize_sum)
    X=df_train['vectr'].tolist()
    Y=df_train['tone'].tolist()
    model = LinearRegression()
    model.fit(X, Y)
    x_hat=df_test['vectr'].tolist()
    y_hat=model.predict(x_hat)
    return y_hat    

In [48]:
def naive_bayes(df_train,df_test):   #emb. TF-IDF 0.7303
    y_train = np.asarray(df_train['tone'],dtype=np.int8)
    train_data = np.array([twitt for twitt in df_train['clear_text']])
    test_data = np.array([twitt for twitt in df_test['clear_text']])
    
    tfidf = TfidfVectorizer(
           ngram_range=(1, 3),
           use_idf=1,
           smooth_idf=1)

    data_train_count = tfidf.fit_transform(train_data)
    data_test_count  = tfidf.transform(test_data)
    clf = MultinomialNB()
    clf.fit(data_train_count, y_train)
    pred = clf.predict(data_test_count)
    return pred

In [49]:
def naive_bayes2(df_train,df_test):
    y_train = np.asarray(df_train['tone'],dtype=np.int8)
    train_data = np.array([twitt for twitt in df_train['clear_text']])
    test_data = np.array([twitt for twitt in df_test['clear_text']])
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
    text_clf.fit(train_data, y_train)
    predicted = text_clf.predict(test_data)
    return predicted

In [50]:
def naive_bayes_gs(df_train,df_test):
    y_train = np.asarray(df_train['tone'],dtype=np.int8)
    train_data = np.array([twitt for twitt in df_train['clear_text']])
    test_data = np.array([twitt for twitt in df_test['clear_text']])
    text_clf = Pipeline([
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
        ('clf', SGDClassifier(loss='hinge', penalty='l2', alpha=1e-3, random_state=42, max_iter=5, tol=None)),])
    parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                  'tfidf__use_idf': (True, False),
                  'clf__alpha': (1e-2, 1e-3),}
    gs_clf = GridSearchCV(text_clf, parameters, cv=5, n_jobs=16,verbose=1)
    gs_clf.fit(train_data, y_train)
    for param_name in sorted(parameters.keys()):
        print("%s: %r" % (param_name, gs_clf.best_params_[param_name]))
    predicted = gs_clf.predict(test_data)
    return predicted

In [51]:
def svm_model(df_train,df_test):
    y_train = np.asarray(df_train['tone'],dtype=np.int8)
    train_data = np.array([twitt for twitt in df_train['clear_text']])
    test_data = np.array([twitt for twitt in df_test['clear_text']])
    
    tfidf = TfidfVectorizer(
           ngram_range=(1, 3),
           use_idf=1,
           smooth_idf=1)

    data_train_count = tfidf.fit_transform(train_data)
    data_test_count  = tfidf.transform(test_data)
    
    classifier_linear = LinearSVC()
    classifier_linear.fit(data_train_count, y_train)
    pred = classifier_linear.predict(data_test_count)
    return pred

In [52]:
def word_class_count(df_train,df_test):
    ngd=defaultdict(float)
    psd=defaultdict(float)
    ntd=defaultdict(float)    
    y_train = df_train['tone']
    train_data = df_train['clear_text'].tolist()
    for x,y in zip(train_data, y_train):        
        if y>0.1:
            for w in x.split():
                psd[w] += 1.0
        elif y<-0.1:
            for w in x.split():
                ngd[w] += 1.0  
        else:
            for w in x.split():
                ntd[w] += 1.0
    words=set(list(ngd.keys())+list(psd.keys())+list(ntd.keys()))
    words=list(words)
    wl=sum(ngd.values())+sum(psd.values())+sum(ntd.values())
    for w in words:
        s=psd[w] + ntd[w] + ngd[w]+1
        psd[w] = (psd[w]+1)/s
        ngd[w] = (ngd[w]+1)/s
        ntd[w] = (ntd[w]+1)/s
    
    res=[]
    for sent in df_test['clear_text']:   
        pos=ngt=neut=0
        for w in sent.split():
            pos+=psd[w]
            ngt+=ngd[w]
            neut+=ntd[w]
        if pos>=ngt and pos>=neut:
            res.append(1)
        elif ngt>=pos and ngt>=neut:
            res.append(-1)
        else:
            res.append(0)
    return res

In [53]:
whole_dataset={'rusentiment':(True,False),\
               'medical':(False,False),\
               'clothes':(False,False),\
               'emo_dict_old':(False,False),\
               'emo_dict':(False,False),\
               'tweet':(False,False),\
               'social':(False,False),\
               'news':(False,False),\
               'bank_test_2015':(False,False),\
               'bank_train_2015':(False,False),\
               'bank_train_2016':(False,False),\
               'bank_test_2016':(False,False),\
               'bank_test_etalon':(False,False),\
               'tel_train_2015':(True,False),\
               'tel_test_2015':(True,False),\
               'tel_test_2016':(True,False),\
               'tel_train_2016':(True,False),\
               'tel_test_etalon':(True,False)}

In [75]:
df = load_dataset(whole_dataset)

In [76]:
print(df.clear_text[df.tone  > 0.1].count())
print(df.clear_text[df.tone  == 0.0].count())
print(df.clear_text[df.tone  < -0.1].count())

9086
53840
9144


In [168]:
df_spam = pd.read_csv('./dataset/spam.csv', sep=',')

In [169]:
len(df_spam)

24384

In [170]:
df_spam=df_spam[['clear_text']]

In [171]:
df_spam['clear_text'].replace('', np.nan, inplace=True)

In [172]:
df_spam.dropna(subset=['clear_text'], inplace=True)

In [173]:
len(df_spam)

24384

In [174]:
df_spam=df_spam.rename(columns={"clear_text": "text"})

In [175]:
df_norm = pd.read_csv('./dataset/norm.csv', sep=',')

In [176]:
df_norm['text'].replace('', np.nan, inplace=True)

In [177]:
df_norm.dropna(subset=['text'], inplace=True)

In [178]:
len(df_norm)

31185

In [179]:
df_norm=df_norm[['text']]

In [180]:
posetive=df_norm.text.tolist()

In [181]:
negative=df_spam.text.tolist()

In [182]:
prepare_words(posetive+negative)

In [183]:
aug_text=[]
config={'typos':True,'delchar':True,'addchar':True,'swapchar':True}
for sentence in posetive:
    new_sentence=augment_sentence(sentence,config,2,3)
    aug_text+=new_sentence
aug_text=aug_text+posetive
df_norm=pd.DataFrame(aug_text) 
df_norm["spam"]='0'

In [184]:
len(df_norm)

65160

In [185]:
aug_text=[]
config={'typos':True,'delchar':True,'addchar':True,'swapchar':True}
for sentence in negative:
    new_sentence=augment_sentence(sentence,config,3,3)
    aug_text+=new_sentence
aug_text=aug_text+negative
df_spam=pd.DataFrame(aug_text) 
df_spam["spam"]='1'

In [186]:
print(len(df_norm))
print(len(df_spam))

65160
59053


In [187]:
df=pd.concat([df_norm,df_spam])

In [188]:
len(df)

124213

In [189]:
display(df)

Unnamed: 0,0,spam
0,"А попа подозревала двано,что ты с кавказса..пе...",0
1,"А попа подозревала давно,что ты с кавказа..пер...",0
2,"Блин, почему эта жизнь стоь не справедлива ((((",0
3,"Блин, почему эта жизнь столь не спарведлива ((((",0
4,где еще встречатйь свой день рождения как не н...,0
...,...,...
59048,sms предупрежден мангистауск дчс направл абоне...,1
59049,sms предупрежден мангистауск дчс направл абоне...,1
59050,номер восстанавлива,1
59051,beautiful прекрасн,1


In [192]:
df[0].replace('', np.nan, inplace=True)

In [193]:
df.dropna(subset=[0], inplace=True)

In [194]:
len(df)

124213

In [195]:
num_instances=len(df)

In [196]:
msk = np.random.rand(num_instances)<0.9

In [197]:
df_train,df_test=df[msk],df[~msk]

In [198]:
len(df_train)

111759

In [199]:
len(df_test)

12454

In [201]:
import csv
df_test.spam = df_test.spam.astype(int)
df_train.spam = df_train.spam.astype(int)

df_train.to_csv('spam_train.csv', index=False, header=None)
df_test.to_csv('spam_test.csv', index=False, header=None)

In [100]:
num_instances=len(df_spam)

In [101]:
msk = np.random.rand(num_instances)<0.9

In [103]:
df_train,df_test=df_spam[msk],df_spam[~msk]

In [104]:
len(df_train)

130712

In [105]:
len(df_test)

14510

In [106]:
print(df_train.clear_text[df_train.spam  == 1].count())
print(df_train.clear_text[df_train.spam  == 0].count())

65790
64850


In [107]:
print(df_test.clear_text[df_test.spam  == 1].count())
print(df_test.clear_text[df_test.spam  == 0].count())

7323
7176


In [11]:
dtype = {'clear_text': str,'tone': np.int8} 
df = pd.read_csv('all_test.tsv', sep='\t', dtype=dtype, names=['clear_text', 'tone'])

In [14]:
display(df)

Unnamed: 0,clear_text,tone
0,семизубая девчонка это,1.0
1,днем варенья фанат всего самого наилутшего,1.0
2,рождение это великое чудо,2.0
3,вот тебе няшку,2.0
4,едут автобусе панк монашка панк говорит типа т...,1.0
...,...,...
18501,мтс связываюсь доп соглашений amp raquo дикими...,0.0
18502,user ума чтоль сошли мне августа связи url,0.0
18503,user билайн это оператор состояние души предло...,0.0
18504,user здравствуйте мобильным интернетом караган...,0.0


In [13]:
df['tone'] = df.tone + 1.0

In [15]:
df.to_csv('all_test_new.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)

In [110]:
df_train.to_csv('spam_train.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)
df_test.to_csv('spam_test.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)

In [82]:
display(df)

Unnamed: 0,clear_text,spam
0,попа подозревала давно что кавказа перестану о...,0
1,прошедшим днем ангела,0
2,два дня отлета острова,0
3,блин почему эта жизнь столь справедлива,0
4,встречать свой день рождения кладбище,0
...,...,...
2242,мтс заботится моем бюджете просто позволяя вто...,0
2243,караоке ростелекома,0
2244,user включили подключении дублирующую опцию ка...,0
2245,агенты приватизации шереметьево ростелекома по...,0


In [91]:
pdf=pd.concat([df,df_spam])

In [96]:
pdf

Unnamed: 0,clear_text,spam
0,попа подозревала давно что кавказа перестану о...,0
1,прошедшим днем ангела,0
2,два дня отлета острова,0
3,блин почему эта жизнь столь справедлива,0
4,встречать свой день рождения кладбище,0
...,...,...
145217,sms предупрежден мангистауск дчс направл абоне...,1
145218,sms предупрежден мангистауск дчс направл абоне...,1
145219,номер восстанавлива,1
145220,beautiful прекрасн,1


In [97]:
pdf.to_csv('all_test.tsv', index=False)

In [93]:
pdf=pdf.reset_index()

In [95]:
pdf=pdf.drop(['index'], axis=1)

In [60]:
df_spam = pd.read_csv('./dataset/spam_dataset.csv', sep=',')

In [67]:
df_spam['clear_text'] = df_spam['clear_text'].map(clean_text)

In [68]:
spam=df_spam.clear_text.tolist()

In [70]:
len(spam)

24384

In [71]:
aug_text=[]
config={'typos':True,'delchar':True,'addchar':True,'swapchar':True}
for sentence in spam:
    new_sentence=augment_sentence(sentence,config,2,3)
    aug_text+=new_sentence
aug_text=aug_text+spam
df_spam=pd.DataFrame(aug_text) 

In [72]:
df_spam["spam"]=1

In [89]:
df_spam=df_spam.rename(columns={0: "clear_text"})

In [90]:
display(df_spam)

Unnamed: 0,clear_text,spam
0,распоряжеини cnews оказалась обновленная аерси...,1
1,распоряжении cnews оказалась обновленная верси...,1
2,скидка оплату премичум дотсупа играм билайн пр...,1
3,сёкидка оплату премиум доступа играм блайн про...,1
4,всем привет хочешь мной поиграть пиши вконтакт...,1
...,...,...
73147,sms предупрежден мангистауск дчс направл абоне...,1
73148,sms предупрежден мангистауск дчс направл абоне...,1
73149,номер восстанавлива,1
73150,beautiful прекрасн,1


In [88]:
len(df)

72070

In [113]:
print(df.clear_text[df.tone  > 0.1].count())
print(df.clear_text[df.tone  == 0.0].count())
print(df.clear_text[df.tone  < -0.1].count())

131841
108708
129747


In [114]:
num_instances=len(df)

In [120]:
#msk = [False]*num_instances
msk = np.random.rand(num_instances)<0.95

In [121]:
df_train,df_test=df[msk],df[~msk]

In [122]:
print(df_train.clear_text[df_train.tone  > 0.1].count())
print(df_train.clear_text[df_train.tone  == 0.0].count())
print(df_train.clear_text[df_train.tone  < -0.1].count())

125132
103266
123274


In [123]:
print(df_test.clear_text[df_test.tone  > 0.1].count())
print(df_test.clear_text[df_test.tone  == 0.0].count())
print(df_test.clear_text[df_test.tone  < -0.1].count())

6709
5442
6473


In [None]:
df_aug=df_train.copy()
df_aug['clear_text'] = df_aug['clear_text'].apply(lambda x: str(augment_sentence(x,config,1,10)[0]))

In [None]:
print(df_aug.clear_text[df_aug.tone  > 0.1].count())
print(df_aug.clear_text[df_aug.tone  == 0.0].count())
print(df_aug.clear_text[df_aug.tone  < -0.1].count())

In [None]:
df=pd.concat([df_train,df_aug])

In [None]:
display(df)

In [None]:
import csv
df_test['tone'] = df_test.tone + 1.0
df_test.tone = df_test.tone.astype(int)
df.tone = df.tone.astype(int)

df.to_csv('all_train.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)
df_test.to_csv('all_test.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)

In [124]:
posetive=df_train.clear_text[df_train.tone  > 0.1].tolist()
neutral=df_train.clear_text[df_train.tone  == 0.0].tolist()
negative=df_train.clear_text[df_train.tone  < -0.1].tolist()

In [125]:
#posetive

In [None]:
aug_text=[]
config={'typos':True,'delchar':True,'addchar':True,'swapchar':True}
reg=re.compile('^[а-я]+$')
for sentence in posetive:
    words=re.findall(r"[\w']+", sentence)
    words=[word for word in words if len(word)>3 and reg.match(word) ]  
    size=len(words)
    new_examples=size//7+1
    new_sentence=augment_sentence(sentence,config,new_examples,3)
    aug_text+=new_sentence
aug_text=aug_text+posetive
df_posetive=pd.DataFrame(aug_text) 
df_posetive["tone"]='2'

aug_text=[]
for sentence in neutral:
    words=re.findall(r"[\w']+", sentence)
    words=[word for word in words if len(word)>3 and reg.match(word) ]  
    size=len(words)
    new_examples=size//7+1
    new_sentence=augment_sentence(sentence,config,new_examples,3)
    aug_text+=new_sentence
aug_text=aug_text+neutral
df_neutral=pd.DataFrame(aug_text) 
df_neutral["tone"]='1'

aug_text=[]
for sentence in negative:
    words=re.findall(r"[\w']+", sentence)
    words=[word for word in words if len(word)>3 and reg.match(word) ]  
    size=len(words)
    new_examples=size//7+1
    new_sentence=augment_sentence(sentence,config,new_examples,3)
    aug_text+=new_sentence
aug_text=aug_text+negative
df_negative=pd.DataFrame(aug_text) 
df_negative["tone"]='0'

In [None]:
print(len(df_posetive))
print(len(df_neutral))
print(len(df_negative))

In [None]:
df_train=pd.concat([df_posetive,df_neutral,df_negative])

In [None]:
print(df_train.clear_text[df_train.tone  > 0.1].count())
print(df_train.clear_text[df_train.tone  == 0.0].count())
print(df_train.clear_text[df_train.tone  < -0.1].count())

In [None]:
display(df_train)

In [None]:
import csv
df_test['tone'] = df_test.tone + 1.0
df_test.tone = df_test.tone.astype(int)
df_train.tone = df_train.tone.astype(int)

df_train.to_csv('all_train.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)
df_test.to_csv('all_test.tsv', quoting=csv.QUOTE_NONE,index=False, sep="\t", header=None)