# Ранжирование комментариев

## Задача: предсказать скор комментария.

In [12]:
import html
import re
import string
import time
from pathlib import Path

import numpy as np
import pandas as pd
import pymorphy2
import scipy
import stop_words
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.utils import shuffle
from tqdm import tqdm, tqdm_notebook

# Загрузка исходных данных

In [2]:
# Load the training set from a JSON Lines file (one JSON record per line).
train = pd.read_json('ranking_train.jsonl', lines=True)

In [3]:
# Load the test set from a JSON Lines file (one JSON record per line).
test = pd.read_json('ranking_test.jsonl', lines=True)

Посмотрим на данные

In [4]:
train

Unnamed: 0,text,comments
0,How many summer Y Combinator fundees decided n...,[{'text': 'Going back to school is not identic...
1,CBS acquires last.fm for $280m,[{'text': 'It will be curious to see where thi...
2,How Costco Became the Anti-Wal-Mart,[{'text': 'I really hate it when people falsel...
3,"Fortune Favors Big Turds | Screw The Money, Th...",[{'text': 'His real point is that something ca...
4,StartupWeekend: 70 Founders Create One Company...,[{'text': 'Looks like someone hasn't read The ...
...,...,...
88102,Don't upgrade to iOS 8.0.1 or you may experien...,[{'text': 'I had this issue and was able to fi...
88103,Ask HN: How do US HNers get their health insur...,[{'text': 'We use a HSA-qualified high-deducti...
88104,Justin Gordon Using React on Rails,[{'text': 'neat insight! A friend of mine conv...
88105,"iOS 8.0.1 released, broken on iPhone 6 models,...","[{'text': 'Ouch, I feel for whoever let this s..."


Выведем один объект признака comments для трейна

In [5]:
pd.DataFrame(train['comments'][0])

Unnamed: 0,text,score
0,Going back to school is not identical with giv...,0
1,There will invariably be those who don't see t...,1
2,For me school is a way to be connected to what...,2
3,I guess it really depends on how hungry you ar...,3
4,I know pollground decided to go back to school...,4


Для тестовой выборки

In [6]:
pd.DataFrame(test['comments'][0])

Unnamed: 0,text,score
0,I&#x27;m still waiting for them to stabilize w...,
1,"For those who upgraded, no need to do a restor...",
2,Upgraded shortly after it was released and suf...,
3,I think they were under a lot of pressure on t...,
4,Fix for those who already updated: http:&#x2F...,


**В одном объекте имеем список дата-фреймов.**

Пойдем самым простым путем: из тренировочных данных получим дата-сет вида **комментарий - скор**, перемешаем его и обучим на нем модель. Такой же дата-сет получим из тестовой выборки без перемешивания и предскажем на нем. Это будет baseline решение.  

# Преобразование данных

In [16]:
def Frame(data, path='df_train'):
    """Flatten per-post comment records into one DataFrame and dump it to CSV.

    Parameters
    ----------
    data : iterable
        Each element is passed to ``pd.DataFrame`` on its own (in this
        notebook: one list of ``{'text': ..., 'score': ...}`` dicts per post).
    path : str or path-like, default 'df_train'
        Destination of the CSV dump. The default keeps the original file name;
        pass another path when converting a different split so the training
        dump is not overwritten.

    Returns
    -------
    pandas.DataFrame
        All per-element frames stacked in input order with a fresh 0..n-1
        index. An empty frame is returned when ``data`` yields nothing.
    """
    print('Идет преобразование дата-сета. Подождите....')
    frames = [pd.DataFrame(records) for records in tqdm(data)]
    # pd.concat raises ValueError on an empty list, so fall back to an empty frame.
    if frames:
        total_frame = pd.concat(frames).reset_index(drop=True)
    else:
        total_frame = pd.DataFrame()
    # index_label=False omits the index column name from the header row.
    total_frame.to_csv(path, index_label=False)
    print('В директорию загружен файл')
    return total_frame

In [17]:
Frame(train['comments'])

Идет преобразование дата-сета. Подождите....


100%|██████████████████████████████████████████████████████████████████████████| 88107/88107 [00:43<00:00, 2041.50it/s]


В директорию загружен файл


Unnamed: 0,text,score
0,Going back to school is not identical with giv...,0
1,There will invariably be those who don't see t...,1
2,For me school is a way to be connected to what...,2
3,I guess it really depends on how hungry you ar...,3
4,I know pollground decided to go back to school...,4
...,...,...
440530,Most major banks offer a service called &#x27;...,0
440531,"It costs 3.25%, or $74.25 for the example of $...",1
440532,As many other comments have pointed out almost...,2
440533,My apartment building uses Yapstone&#x27;s Ren...,3


# Нормализация текста

## Тренировочный дата-сет

Для начала нужно перемешать весь получившийся дата-сет:

In [43]:
df_train = pd.read_csv('df_train')

In [44]:
df_train.head(3)

Unnamed: 0,text,score
0,Going back to school is not identical with giv...,0
1,There will invariably be those who don't see t...,1
2,For me school is a way to be connected to what...,2


In [45]:
# Shuffle the rows with a fixed seed so the order is reproducible across runs
# (12 is the seed used by the sampling, split and model cells of this notebook).
df_train = shuffle(df_train, random_state=12)

In [46]:
df_train.head(5)

Unnamed: 0,text,score
12815,"Once you learn the basics, try just reading th...",0
355622,I know that HN mods replace the title of posts...,2
18743,"I don't need it, but I think you should send i...",3
70166,I can't tell if this is a joke or not.,1
288961,* Windows desktop will still thrive on Enterpr...,1


### Нормализация текста

Приведем весь текст к одному регистру, удалим лишние пробелы и знаки препинания, а также стоп-слова.

In [47]:
stopwords = stop_words.get_stop_words('en')

In [48]:
stopwords.extend(['...', '«', '»', 'an','the','i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', 'too', 'very', 's', 't', 'can', 'will', 'just', 'don', "don't", 'should', "should've", 'now', 'd', 'll', 'm', 'o', 're', 've', 'y', 'ain', 'aren', "aren't", 'couldn', "couldn't", 'didn', "didn't", 'doesn', "doesn't", 'hadn', "hadn't", 'hasn', "hasn't", 'haven', "haven't", 'isn', "isn't", 'ma', 'mightn', "mightn't", 'mustn', "mustn't", 'needn', "needn't", 'shan', "shan't", 'shouldn', "shouldn't", 'wasn', "wasn't", 'weren', "weren't", 'won', "won't", 'wouldn', "wouldn't"])

In [49]:
def remove_punctuation(text):
    """Return ``text`` with every character of ``string.punctuation`` replaced by a space.

    Only the ASCII punctuation listed in ``string.punctuation`` is affected;
    all other characters are kept as they are.
    """
    blank_out = str.maketrans(dict.fromkeys(string.punctuation, ' '))
    return text.translate(blank_out)

In [50]:
def remove_numbers(text):
    """Return ``text`` with every digit character (per ``str.isdigit``) replaced by a space."""
    pieces = []
    for symbol in text:
        pieces.append(' ' if symbol.isdigit() else symbol)
    return ''.join(pieces)

In [51]:
def remove_multiple_spaces(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing whitespace is collapsed to one space, not stripped.
    """
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', text)

In [53]:
# Normalize every comment: decode HTML entities, lower-case, blank out punctuation
# and digits, then collapse whitespace runs.
# html.unescape has to run first: the raw texts contain entities such as "&#x27;"
# and "&#x2F;", which otherwise survive as junk tokens ("I&#x27;m" -> "i x m")
# once punctuation and digits are replaced by spaces.
# NOTE(review): the `stopwords` list built earlier is not applied in this step
# although the section text promises stop-word removal — confirm whether
# filtering is intended.
prep_text = [
    remove_multiple_spaces(remove_numbers(remove_punctuation(html.unescape(text).lower())))
    for text in df_train['text'].astype('str')
]

In [54]:
df_train['text_prep'] = prep_text

In [55]:
df_train.head()

Unnamed: 0,text,score,text_prep
12815,"Once you learn the basics, try just reading th...",0,once you learn the basics try just reading thr...
355622,I know that HN mods replace the title of posts...,2,i know that hn mods replace the title of posts...
18743,"I don't need it, but I think you should send i...",3,i don t need it but i think you should send it...
70166,I can't tell if this is a joke or not.,1,i can t tell if this is a joke or not
288961,* Windows desktop will still thrive on Enterpr...,1,windows desktop will still thrive on enterpri...


### Лемматизация 

In [56]:
df_train = df_train.dropna(subset=['text_prep'])

In [57]:
morph = pymorphy2.MorphAnalyzer()

In [67]:
lemm_texts_list = []

In [68]:
# Lemmatize every normalized comment. The loop used to be wrapped in a string
# literal, which left `lemm_texts_list` empty and made the following
# `df_train['text_lemm'] = lemm_texts_list` assignment fail on a fresh run.
# NOTE(review): in the output shown below `text_lemm` is identical to `text_prep`
# for these English comments — confirm pymorphy2 is the intended lemmatizer.
lemm_texts_list = []  # re-initialized here so re-running the cell does not append twice
lemma_cache = {}      # word -> normal form, so a repeated word is parsed only once
for text in tqdm(df_train['text_prep']):
    text_lem = []
    for word in text.split(' '):
        if word not in lemma_cache:
            lemma_cache[word] = morph.parse(word)[0].normal_form
        text_lem.append(lemma_cache[word])
    # Texts of two tokens or fewer are blanked so they can be filtered out afterwards.
    lemm_texts_list.append(' '.join(text_lem) if len(text_lem) > 2 else '')

100%|█████████████████████████████████████████████████████████████████████████| 440535/440535 [31:23<00:00, 233.93it/s]


In [70]:
df_train['text_lemm'] = lemm_texts_list

In [71]:
df_train = df_train[df_train['text_lemm'] != '']

In [72]:
df_train.head()

Unnamed: 0,text,score,text_prep,text_lemm
12815,"Once you learn the basics, try just reading th...",0,once you learn the basics try just reading thr...,once you learn the basics try just reading thr...
355622,I know that HN mods replace the title of posts...,2,i know that hn mods replace the title of posts...,i know that hn mods replace the title of posts...
18743,"I don't need it, but I think you should send i...",3,i don t need it but i think you should send it...,i don t need it but i think you should send it...
70166,I can't tell if this is a joke or not.,1,i can t tell if this is a joke or not,i can t tell if this is a joke or not
288961,* Windows desktop will still thrive on Enterpr...,1,windows desktop will still thrive on enterpri...,windows desktop will still thrive on enterpri...


Так как лемматизация занимает длительное время - выгрузим файл с лемматизированным текстом.

In [73]:
# Persist the lemmatized frame: the ML section reads 'df_train_lemm' back with
# pd.read_csv, so the file has to be written for a fresh run to succeed.
df_train.to_csv('df_train_lemm', index_label=False)

# ML

## Разбиение данных

In [2]:
df_train_lemm = pd.read_csv('df_train_lemm')

In [3]:
# Draw a 10% random subsample of the lemmatized data; random_state fixes the draw.
df_train_lemm_samp = df_train_lemm.sample(frac=0.1, random_state=12)

In [4]:
# Model input: the lemmatized comment text of the subsample.
# NOTE(review): this rebinds `train`, which earlier in the notebook held the raw
# training DataFrame loaded from 'ranking_train.jsonl'.
train = df_train_lemm_samp['text_lemm']

In [5]:
target = df_train_lemm_samp['score']

In [6]:
# Hold out 30% of the subsample for validation; random_state fixes the split.
features_train, features_valid, target_train, target_valid = train_test_split(train, target, random_state= 12, test_size=0.3)

## Обучение модели

In [19]:
LogisticRegression = Pipeline([
                ('vect', CountVectorizer(analyzer='char', ngram_range =(2,10))),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(n_jobs=3,C=1e5, solver='saga', 
                                           multi_class='multinomial',
                                           max_iter=1000,
                                           random_state=12)),
])

In [None]:
%%time
LogisticRegression.fit(features_train, target_train)

In [19]:
CountVectorizer = CountVectorizer(analyzer='char',
                        ngram_range =(2,10))

In [None]:
train_vect = CountVectorizer.fit_transform(features_train)

In [None]:
'''model = LogisticRegression()
model.fit(X_train, Y_train)
# save the model to disk
filename = 'finalized_model.sav'
pickle.dump(model, open(filename, 'wb'))

# some time later...

# load the model from disk
loaded_model = pickle.load(open(filename, 'rb'))
result = loaded_model.score(X_test, Y_test)
print(result)'''