In [2]:
import pandas as pd
import re
from nltk.corpus import stopwords
from pymystem3 import Mystem
from collections import defaultdict
from nltk.tokenize import RegexpTokenizer

In [3]:
# Shared Mystem instance used by lemm_part for lemmatization.
m = Mystem()
# Pattern matching either a non-greedy <...> tag or an '&...;' entity
# (named, decimal '#NNN', or hexadecimal '#xHHH' in lower case).
CLEANR = re.compile('<.*?>|&([a-z0-9]+|#[0-9]{1,6}|#x[0-9a-f]{1,6});')

In [4]:
def cleanhtml(raw_html):
    """Return ``raw_html`` with every match of the module-level ``CLEANR`` pattern removed."""
    return CLEANR.sub('', raw_html)


In [5]:
def lemm_part(corpus, all_corpus):
    """Lemmatize a batch of texts and append the resulting tokens to ``all_corpus``.

    The texts in ``corpus`` are joined with single spaces and passed to the
    module-level Mystem instance ``m`` in one call. Every returned item that is
    not empty after ``strip()`` (which also excludes a bare newline) is appended
    to ``all_corpus`` in place. Nothing is returned.
    """
    joined = ' '.join(corpus)
    all_corpus.extend(lemma for lemma in m.lemmatize(joined) if lemma.strip() != '')

In [6]:
def count_freq(data):
    """Count how many times each item occurs in ``data``.

    Returns a ``defaultdict(int)`` mapping item -> occurrence count, with keys
    in order of first appearance.
    """
    counts = defaultdict(int)
    for item in data:
        counts[item] = counts.get(item, 0) + 1
    return counts

In [7]:
def lemm_column(column_name):
    """Lemmatize every non-null text of ``df[column_name]`` and return the lemmas.

    Each non-null value is lower-cased, cleaned with ``cleanhtml``, split with
    the module-level ``tokenizer`` and stripped of pure-digit tokens and Russian
    stop words. The cleaned texts are buffered and handed to ``lemm_part`` in
    batches; a batch is flushed whenever a processed row's index label is a
    positive multiple of 100, and once more after the loop.

    Parameters
    ----------
    column_name : str
        Name of the column of the module-level ``df`` to process.

    Returns
    -------
    list
        All lemmas collected by ``lemm_part``, in processing order.
    """
    # Build the stop-word set once per call instead of once per token.
    russian_stopwords = set(stopwords.words('russian'))
    corpus = []
    all_corpus = []
    for index, words in df[column_name].items():
        if pd.isnull(words):
            continue
        tokens = tokenizer.tokenize(cleanhtml(words.lower()))
        corpus.append(' '.join(
            token for token in tokens
            if not token.isdigit() and token not in russian_stopwords))
        # Flush periodically so a single lemmatizer call never gets the whole column.
        if index > 0 and index % 100 == 0:
            lemm_part(corpus, all_corpus)
            corpus = []
    # Lemmatize whatever is left in the buffer.
    lemm_part(corpus, all_corpus)
    return all_corpus

In [8]:
# Tokenizer that keeps runs of word characters (\w+) and drops everything else.
tokenizer = RegexpTokenizer(r'\w+')
# Source table read by lemm_column and ngram_column.
df = pd.read_csv('data.csv', encoding='UTF-8')


In [45]:
# Lemmas of the 'cond' column, their per-lemma counts and the total number of lemmas.
cond_lemm = lemm_column('cond')
cond_result, cond_count = count_freq(cond_lemm), len(cond_lemm)

In [46]:
# Lemmas of the 'req' column, their per-lemma counts and the total number of lemmas.
req_lemm = lemm_column('req')
req_result, req_count = count_freq(req_lemm), len(req_lemm)

In [47]:
# Lemmas of the 'resp' column, their per-lemma counts and the total number of lemmas.
resp_lemm = lemm_column('resp')
resp_result, resp_count = count_freq(resp_lemm), len(resp_lemm)

In [30]:
# Total number of lemmas over the three sections.
all_count = cond_count + req_count + resp_count

# Row order: every word seen in 'cond', then words first seen in 'req',
# then words that occur only in 'resp'.
ordered_words = list(cond_result)
ordered_words += [w for w in req_result if w not in cond_result]
ordered_words += [w for w in resp_result
                  if w not in cond_result and w not in req_result]

# Collect plain dicts and build the frame once; appending with pd.concat
# inside the loop copies the whole frame for every word.
rows = []
for word in ordered_words:
    # .get() leaves the count dicts untouched, so this cell can be re-run
    # without first recomputing cond_result / req_result / resp_result.
    c_c = cond_result.get(word, 0)
    rq_c = req_result.get(word, 0)
    rs_c = resp_result.get(word, 0)
    # Every listed word occurs in at least one section, so c_all >= 1.
    c_all = c_c + rq_c + rs_c
    rows.append({'word': word,
                 'count_in_condition': c_c,
                 'count_in_requirements': rq_c,
                 'count_in_responsibilities': rs_c,
                 # Share of this word's occurrences that fall in each section;
                 # a word found only in one section therefore gets 1.0 there.
                 'frequency_in_condition': c_c / c_all,
                 'frequency_in_requirements': rq_c / c_all,
                 'frequency_in_responsibilities': rs_c / c_all})

cond_data = pd.DataFrame(
    rows,
    columns=['word', 'count_in_condition', 'count_in_requirements',
             'count_in_responsibilities', 'frequency_in_condition',
             'frequency_in_requirements', 'frequency_in_responsibilities'])

In [48]:
# For each section add a column equal to the word's count in that section
# multiplied by the word's frequency in that section, then save the table.
for target_col, count_col, freq_col in (
        ('condition_freq_count', 'count_in_condition', 'frequency_in_condition'),
        ('requirements_freq_count', 'count_in_requirements', 'frequency_in_requirements'),
        ('responsibilities_freq_count', 'count_in_responsibilities', 'frequency_in_responsibilities')):
    cond_data[target_col] = cond_data[count_col] * cond_data[freq_col]
cond_data.to_csv('word_frequencies.csv', encoding='utf8')

In [9]:
def ngram_column(column_name, n):
    """Collect word n-grams from one column of the module-level ``df``.

    Rows are de-duplicated on the pair (``ad_id``, ``column_name``). Each
    non-null value is lower-cased, cleaned with ``cleanhtml`` and parsed with
    ``ast.literal_eval``; every element of the parsed value is tokenized, the
    connective words listed below are dropped, and the ``n``-grams of the
    remaining tokens are joined with single spaces.

    Returns a flat list of n-gram strings in processing order.
    """
    skipped_words = {'и', 'или', 'также', 'так', 'же', 'а'}
    collected = []
    deduplicated = df[['ad_id', column_name]].drop_duplicates()
    for _, record in deduplicated.iterrows():
        raw_value = record[column_name]
        if pd.isnull(raw_value):
            continue
        # NOTE(review): the cell is presumably a string holding a list literal - confirm.
        phrases = ast.literal_eval(cleanhtml(raw_value.lower()))
        for phrase in phrases:
            kept = [tok for tok in tokenizer.tokenize(phrase) if tok not in skipped_words]
            collected.extend(' '.join(gram) for gram in ngrams(kept, n))
    return collected

In [10]:
from nltk.util import ngrams
import ast

In [15]:
def _ngram_frequency_rows(cond_result, req_result, resp_result):
    """Build one record per distinct n-gram from the three per-section count dicts."""
    # Row order: n-grams seen in 'cond', then those first seen in 'req',
    # then those that occur only in 'resp'.
    ordered = list(cond_result)
    ordered += [g for g in req_result if g not in cond_result]
    ordered += [g for g in resp_result
                if g not in cond_result and g not in req_result]

    rows = []
    for gram in ordered:
        c_c = cond_result.get(gram, 0)
        rq_c = req_result.get(gram, 0)
        rs_c = resp_result.get(gram, 0)
        # Every listed n-gram occurs in at least one section, so c_all >= 1.
        c_all = c_c + rq_c + rs_c
        rows.append({'word': gram,
                     'count_in_condition': c_c,
                     'count_in_requirements': rq_c,
                     'count_in_responsibilities': rs_c,
                     'frequency_in_condition': c_c / c_all,
                     'frequency_in_requirements': rq_c / c_all,
                     'frequency_in_responsibilities': rs_c / c_all})
    return rows


def count_ngramm(n):
    """Compute per-section ``n``-gram statistics and save them as a CSV file.

    N-grams are extracted with ``ngram_column`` from the 'cond', 'req' and
    'resp' columns and counted with ``count_freq``. For every distinct n-gram
    the table holds its count in each section, the share of its occurrences
    that fall in each section, and the product of the two. The table is
    written to ``'<n>gramm_frequencies.csv'``.

    Parameters
    ----------
    n : int
        Size of the n-grams.

    Returns
    -------
    None
    """
    cond_result = count_freq(ngram_column('cond', n))
    req_result = count_freq(ngram_column('req', n))
    resp_result = count_freq(ngram_column('resp', n))

    # Build the frame in one go; appending rows with pd.concat inside a loop
    # copies the whole frame on every iteration.
    cond_data = pd.DataFrame(
        _ngram_frequency_rows(cond_result, req_result, resp_result),
        columns=['word', 'count_in_condition', 'count_in_requirements',
                 'count_in_responsibilities', 'frequency_in_condition',
                 'frequency_in_requirements', 'frequency_in_responsibilities'])

    cond_data['condition_freq_count'] = cond_data['count_in_condition'] * cond_data['frequency_in_condition']
    cond_data['requirements_freq_count'] = cond_data['count_in_requirements'] * cond_data['frequency_in_requirements']
    cond_data['responsibilities_freq_count'] = cond_data['count_in_responsibilities'] * cond_data['frequency_in_responsibilities']

    cond_data.to_csv(str(n) + 'gramm_frequencies.csv', encoding='utf8')

In [30]:
# Build and save the frequency tables for 2-, 3-, 4- and 5-grams, in that order.
for ngram_size in (2, 3, 4, 5):
    count_ngramm(ngram_size)

In [21]:
def convert_numbers_to_excel(file_name):
    """Write a copy of a CSV file in which floats use ',' as the decimal mark.

    The copy is saved next to the source file under the same name prefixed
    with ``'excel_'``.

    Parameters
    ----------
    file_name : str
        Path of the source CSV. Its first column is read back as the index,
        which matches files written by ``DataFrame.to_csv`` with the default
        ``index=True``.

    Returns
    -------
    None
    """
    import os  # local import keeps this cell independent of the import cell

    # index_col=0 restores the saved index; without it the old index comes
    # back as an 'Unnamed: 0' data column and a second index is written.
    frame = pd.read_csv(file_name, encoding='UTF-8', index_col=0)
    # Prefix only the file name, so a path with a directory stays valid.
    folder, base_name = os.path.split(file_name)
    frame.to_csv(os.path.join(folder, 'excel_' + base_name), encoding='utf8', decimal=',')

In [22]:
# Produce the comma-decimal copy of every frequency table written above.
for source_csv in ('word_frequencies.csv',
                   '2gramm_frequencies.csv',
                   '3gramm_frequencies.csv',
                   '4gramm_frequencies.csv',
                   '5gramm_frequencies.csv'):
    convert_numbers_to_excel(source_csv)
