In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [36]:
texts = pd.read_csv('ml_data.csv', encoding = 'utf-8') #пока наилучший вариант был с третьей итерацией

In [37]:
texts = texts.drop(['Unnamed: 0'], axis=1)

In [35]:
texts['id'] = texts['id'].factorize()[0]

In [38]:
train, test = train_test_split(texts, test_size=0.3)

In [39]:
y_train = train['in_out'] 
y_test = test['in_out']

In [44]:
#train

In [45]:
#import ksslib.readability_metrics as kk

In [40]:
import re
from pyphen import Pyphen 
import string
exclude = list(string.punctuation)

def sentence_splitter(text):
    sent_list = re.split(' *[\.\?!][\'"\)\]]* ', text)
    return sent_list
 
def text_len_sent(text):
    TL_sent = len(sentence_splitter(text))
    return TL_sent
    
def text_len_words(text):
    TL_words = len(text.split())
    return TL_words
 
def avg_sentence_length(text):
    ASL = float(text_len_words(text)/text_len_sent(text))
    return round(ASL, 2)
    
    
def avg_sent_per_word(text):
    ASPW = float(text_len_sent(text)/text_len_words(text))
    return round(ASPW, 2)
    
    
def char_count(text, ignore_spaces=True):
    if ignore_spaces:
        text_chars = text.replace(" ", "")
    return len(text_chars) 

    
def avg_letter_per_word(text):
    ALPW = float(float(char_count(text))/float(len(text.split())))
    return round(ALPW, 2)
    

def avg_letter_per_sent(text):
    ALPS = float(float(char_count(text))/float(len(sentence_splitter(text))))
    return round(ALPS, 2)
    

def syllable_count(text): 
    text = text.lower()
    text = "".join(x for x in text if x not in exclude)
    dic = Pyphen(lang='ru_RU')
    count = 0
    for word in text.split(' '):
        word_hyphenated = dic.inserted(word)
        count += max(1, word_hyphenated.count("-") + 1)
    return count
    
    
def avg_syllab_per_word(text):
    ASYPW = float(float(syllable_count(text))/float(len(text.split())))
    return round(ASYPW, 2)
    

def avg_syllab_per_sent(text):
    ASYPS = float(float(syllable_count(text))/float(len(sentence_splitter(text))))
    return round(ASYPS, 2)    
    
def diffsyll(text):
    count = 0
    for word in text.split():
        wrds = syllable_count(word)
        #if wrds >= 3:
        if wrds >= 4:
            count += 1
    return count

def percent_syll(text):
    perc_diff = float(float(diffsyll(text)))/float(len(text.split()))*100
    return round(perc_diff, 2) 
    
      
def get_simple_metrics(text):
    SL = len(sentence_splitter(text))
    WC = len(text.split())
    ASL = avg_sentence_length(text)
    TC = char_count(text)
    ALPW = avg_letter_per_word(text)
    ALPS = avg_letter_per_sent(text)
    SYC = syllable_count(text)
    ASYPW = avg_syllab_per_word(text)
    ASYPS = avg_syllab_per_sent(text)
    DW = diffsyll(text)
    ADF = percent_syll(text)
    return [SL, WC, ASL, TC, ALPW, ALPS, SYC, ASYPW, ASYPS, DW, ADF] 
    

    
def print_simple_metrics(text):
    print('Количество предложений в тексте:', len(sentence_splitter(text)))
    print('Количество слов в тексте:', len(text.split()))
    print('Средняя длина предложений:', avg_sentence_length(text))
    print('Количество символов в тексте:', char_count(text))
    print('Средняя длина слова:', avg_letter_per_word(text))
    print('Средняя длина предложений в символах:', avg_letter_per_sent(text))
    print('Количество слогов в тексте:', syllable_count(text))
    print('Среднее количество слогов в слове:', avg_syllab_per_word(text))
    print('Среднее количеcтво слогов в предложении:', avg_syllab_per_sent(text))
    print('Количество сложных слов в тексте:', diffsyll(text))
    print('Процент сложных слов в тексте', percent_syll(text))
    

In [41]:
import re
import os
import string
exclude = list(string.punctuation)


def flesch_RE(text):
    ASL = avg_sentence_length(text)
    ASW = avg_syllab_per_word(text)
    FRE = 206.835 - float(1.3 * ASL) - float(60.6 * ASW)
    return round(FRE, 2)

def flesch_kincaid_grade(text):
    ASL = avg_sentence_length(text)
    ASW = avg_syllab_per_word(text)
    #английский язык!
    #FKRA = float(0.39 * ASL) + float(11.8 * ASW) - 15.59
    #русский
    #FKRA = float(0.49 * ASL) + float(7.3 * ASW) - 16.59
    #Оборнева
    FKRA = float(0.5 * ASL) + float(8.4 * ASW) - 15.59
    return round(FKRA, 2)
    
def smog_index(text): 
    if len(sentence_splitter(text)) >= 3:
        SMOG = (1.043 * (30*(diffsyll(text)/len(sentence_splitter(text))))**.5) + 3.1291
        return round(SMOG, 2)
    else:
        return 0
        
        
def coleman_liau_index(text):
    L = round(avg_letter_per_word(text)*100, 2)
    S = round(avg_sent_per_word(text)*100, 2)
    CLI = float((0.058 * L) - (0.296 * S) - 15.8)
    return round(CLI, 2)


def dale_chall_score(text): #т.к. делаем сложные слова как 4 слога, все ок 
    word_count = len(text.split())
    count = word_count - diffsyll(text)
    per = float(count)/float(word_count)*100
    difficult_words = 100-per
    if difficult_words > 5: #дальше идет адаптация: вместо 0,0496 0,062
        score = (0.1579 * difficult_words) + (0.062 * avg_sentence_length(text)) + 3.6365
    else:
        score = (0.1579 * difficult_words) + (0.062 * avg_sentence_length(text))
    return round(score, 2)
    
    
def gunning_fog(text):
    grade = 0.4*(avg_sentence_length(text) + percent_syll(text))
    return round(grade,2)
       
def print_statistics(text):
    print('Russian Flesh reading Ease =', flesch_RE(text))
    print('Russian Flesh-Kincaid Grade =', flesch_kincaid_grade(text))
    print('Russian SMOG =', smog_index(text))
    print('Russian CLI =', coleman_liau_index(text))
    print('Russian DCH =', dale_chall_score(text))
    print('Russian Gunning Fog =', gunning_fog(text))
    
def statist_vectors(text):
    FRE = flesch_RE(text)
    FKG = flesch_kincaid_grade(text)
    SMOG = smog_index(text)
    CLI = coleman_liau_index(text)
    DCH = dale_chall_score(text)
    GF = gunning_fog(text)
    return [FRE, FKG, SMOG, CLI, DCH, GF]   
    
    
def statist_sum(text):
    average = (flesch_kincaid_grade(text)+smog_index(text)+coleman_liau_index(text)+dale_chall_score(text)+gunning_fog(text))/5
    return round(average,2)

def simple_classifire(text):
    level = statist_sum(text)
    if level > 0 and level <13 :
        return 1
    if level >= 13 and level < 17:
        return 2
    return 3
    

In [42]:
def statist_vectors(text):
    FRE = flesch_RE(text)
    FKG = flesch_kincaid_grade(text)
    SMOG = smog_index(text)
    CLI = coleman_liau_index(text)
    DCH = dale_chall_score(text)
    GF = gunning_fog(text)
    return [FRE, FKG, SMOG, CLI, DCH, GF]

In [47]:
#try_me = statist_vectors(train['text'][1])

In [48]:
#try_me

In [43]:
def count_me_all(texts):
    return [statist_vectors(text) for text in texts]

In [44]:
x_train = count_me_all(train['text'])
x_test = count_me_all(test['text'])

In [45]:
train

Unnamed: 0,id,in_out,text
1230,1234,1,"Для меня программа ""Учитель для России"" может ..."
4704,4718,0,Я хотела быть учителем очень давно. Это желани...
1692,1698,0,"потомкам необходимо показать любовь к науке, а..."
4394,4407,2,Будущее страны напрямую зависит от последующих...
459,461,0,Проработав определённое количество времени в м...
1627,1633,0,"Вот и настало время, когда скоро мои дети пойд..."
600,602,0,"Я хочу стать участником программы ""Учитель для..."
821,823,0,Хочу начать строить карьеру в сфере образовани...
5385,5402,2,"В 2011 году, меня, менеджера по привлечению кл..."
2226,2233,2,Система образования в России вообще оставляет ...


In [46]:
from sklearn.preprocessing import StandardScaler

In [47]:
scaler = StandardScaler()
x_fit = scaler.fit(x_train)
x_transform = scaler.transform(x_train)
x_fit_test = scaler.fit(x_test)
x_transform_test = scaler.transform(x_test)

In [48]:
x_transform

array([[-0.04641497,  0.06490557,  0.82326205,  0.54147853,  0.29992065,
         0.21646784],
       [ 0.04484526,  0.04810163,  0.56748612, -0.12515301, -0.10482088,
        -0.08442974],
       [ 0.21380226, -0.15564609, -1.52195106, -0.99356764, -0.39319921,
        -0.42858135],
       ...,
       [ 0.02408146,  0.06700606,  0.92052895, -0.02949288,  0.36569115,
         0.35939419],
       [-0.14521044,  0.33481878,  1.02139833,  0.15285925,  0.38592822,
         0.54557457],
       [ 0.00716902,  0.10796565,  0.82326205, -0.23725473,  0.20379454,
         0.24279638]])

In [49]:
from sklearn import svm
from sklearn.naive_bayes import MultinomialNB

In [50]:
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn.neighbors import KNeighborsClassifier

In [51]:
from sklearn.svm import SVC

model_svc = SVC()
model_svc.fit(x_transform, y_train)
print(model_svc)

expected_svc = y_test
predicted_svc = model_svc.predict(x_transform_test)

print(metrics.classification_report(expected_svc, predicted_svc))
print(metrics.confusion_matrix(expected_svc, predicted_svc))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


TypeError: unorderable types: str() < float()

In [52]:
model = KNeighborsClassifier()
model.fit(x_transform, y_train)
print(model)

expected = y_test
predicted = model.predict(x_transform_test)

print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


TypeError: unorderable types: str() < float()

А что если попробовать стандартное преобразование текстов? Традиционно делаем предобработку

In [53]:
def set_clean(s):
    try:
        # print(type(s))
        clean_line = re.sub('[\W\d_-]+', ' ', s.lower().strip())
        return re.sub(' +', ' ', clean_line)
    except AttributeError:
        ''#print ("this was a series")

In [54]:
import re
from sklearn import svm
from sklearn.feature_extraction.text import CountVectorizer
from nltk.corpus import stopwords

In [55]:
train_clean = train.applymap(set_clean) #применяем еще раз нашу предобработку
test_clean = test.applymap(set_clean)

In [56]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
def tfidf_vec(voc=None):
    if(voc):
        vectorizer = TfidfVectorizer(ngram_range=(1, 1),
                                     stop_words='russian',
                                     min_df=5,   
                                     max_df=0.9)  
        tr = vectorizer.fit_transform(train_clean["text"]) 
        te = vectorizer.fit_transform(test_clean["text"]) 
        return (tr, te)
    else:
        vectorizer = TfidfVectorizer() 
        tr = vectorizer.fit_transform(train_clean["text"]) 
        voc = vectorizer.get_feature_names()
        vectorizer = CountVectorizer(vocabulary=voc) 
        te = vectorizer.fit_transform(test_clean["text"]) 
        return (tr, te)
train_counts, test_counts = tfidf_vec() 

tfidf_transformer = TfidfTransformer()

x_train = tfidf_transformer.fit_transform(train_counts)
x_test = tfidf_transformer.fit_transform(test_counts)

  if hasattr(X, 'dtype') and np.issubdtype(X.dtype, np.float):


In [57]:
clf = MultinomialNB().fit(x_train, y_train)
predicted = clf.predict(x_test)
len(predicted)

1982

In [58]:
print(classification_report(y_pred=predicted, y_true=y_test))

TypeError: unorderable types: str() < float()

In [59]:
model = KNeighborsClassifier()
model.fit(x_train, y_train)
print(model)

expected = y_test
predicted = model.predict(x_test)

print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=1, n_neighbors=5, p=2,
           weights='uniform')


TypeError: unorderable types: str() < float()

In [60]:
from sklearn.svm import SVC

model_svc = SVC()
model_svc.fit(x_train, y_train)
print(model_svc)

expected_svc = y_test
predicted_svc = model_svc.predict(x_test)

print(metrics.classification_report(expected_svc, predicted_svc))
print(metrics.confusion_matrix(expected_svc, predicted_svc))

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto', kernel='rbf',
  max_iter=-1, probability=False, random_state=None, shrinking=True,
  tol=0.001, verbose=False)


TypeError: unorderable types: str() < float()