In [1]:
raw_data = sc.textFile('hdfs://master:54310/raw_data')

In [2]:
import json
data = raw_data.map(json.loads)

In [3]:
unique = data.groupBy(lambda x: x['Uri']). \
    map(lambda g: list(g[1])[0])

In [41]:
data.count(), unique.count()

(139209, 128169)

In [4]:
from snowballstemmer import RussianStemmer, EnglishStemmer
russian_stemmer = RussianStemmer()
english_stemmer = EnglishStemmer()

def stem_words(words: list):
    stemmed = russian_stemmer.stemWords(words)
    stemmed = english_stemmer.stemWords(stemmed)
    return stemmed

In [5]:
from nltk.tokenize import regexp_tokenize

def split_text(text: str):
    return regexp_tokenize(text, '''[\w']+''')

In [6]:
import re

def create_compiled_url_regexp():
    url_regexp = r'''(?:http(?:s)?:\\/\\/)?(?:(?:[a-zа-я]|[a-zа-я0-9_-]{2,})\\.)+(?:[a-z][a-z0-9-]{1,20}|рф)(?:\\/[!a-z0-9а-яё_z%~:\\.,-]*)*(?:[\\?&#+][a-z0-9\\[\\]_]*(?:=?[a-z0-9~\\._=,%\\|+!-]*))*(?<![\\.,:!?-])'''
    return re.compile(url_regexp)

def create_compiled_url_regexp_2():
    url_regexp = r'''(\b(https?|ftp|file)://)?[-A-Za-z0-9+&@#/%?=~_|!:,.;]+[-A-Za-z0-9+&@#/%=~_|]'''
    return re.compile(url_regexp)

url_regexp = create_compiled_url_regexp()
url_regexp2 = create_compiled_url_regexp_2()

def replace_urls(text: str):
    text = url_regexp.sub(' ', text)
    text = url_regexp2.sub(' ', text)
    return text

def filter_variable_names(words: list):
    return [word for word in words if '_' not in word]

def delete_non_word_chars(text: str):
    temp = replace_urls(text)
    temp = temp.replace('ё', 'е')
    temp = re.sub(r'(&[a-z0-9]*;)', ' ', temp) # & encoded symbols
    temp = re.sub(r'(\W|\d)+', ' ', temp) # non word or digit
    temp = re.sub(r'\s+', ' ', temp) # 2+ spaces
    return temp.strip()

In [7]:
from nltk.corpus import stopwords

russian_stops = stopwords.words('russian')
english_stops = stopwords.words('english')

def filter_stopwords(words: list):
    return [word for word in words \
           if word not in russian_stops \
           and word not in english_stops]

In [8]:
def filter_words_with_repeatable_letters(words: list):
    return [word for word in words if not re.match('(.)\\1{2}', word)]
    
def is_language_usual(word: str):
    length = len(word)
    is_eng = re.match('[a-z]', word)
    return length > 2 and ((not is_eng and length < 25) or (is_eng and length < 15))

def filter_words_with_unusual_by_language_length(words: list):
    return [word for word in words if is_language_usual(word)]

In [9]:
def tokenize_text(text: str):
    text = text.lower()
    text = delete_non_word_chars(text)
    tokens = split_text(text)
    tokens = filter_variable_names(tokens)
    tokens = filter_stopwords(tokens)
    tokens = stem_words(tokens)
    tokens = filter_words_with_repeatable_letters(tokens)
    tokens = filter_words_with_unusual_by_language_length(tokens)
    return tokens

In [66]:
from itertools import chain
with open('/home/hadoop/l_repl.json', 'r') as fp:
    l_repl_1 = json.load(fp)
with open('/home/hadoop/res.json', 'r', encoding='utf-8') as fp:
    l_repl_2 = json.load(fp)    
def repl_labels(labels):
    temp = [l_repl_1.get(l, l) for l in labels]
    return list(set(chain(*tuple(l_repl_2[l] for l in temp if l in l_repl_2))))

def repl_labels_2(labels):
    temp = [l_repl_1.get(l, l) for l in labels]
    temp = list(set(chain(*tuple(l_repl_2[l] for l in temp if l in l_repl_2))))
    return list(set(temp + list(chain(*tuple(l_repl_2[l] for l in temp if l in l_repl_2)))))

In [67]:
def normalize_label(label: str):
    temp = label.lower()
    temp = temp.replace('ё', 'е')
    temp = split_text(temp)
    temp = russian_stemmer.stemWords(temp)
    temp = english_stemmer.stemWords(temp)
    return ' '.join(temp)

label_bl = {'лог компании', 'рная дыра', 'пиарюсь'}

def filter_bl_labels(labels: list):
    return [l for l in labels if
            all([l_bl not in l for l_bl in label_bl])]

def normalize_labels(labels: list):    
    return [normalize_label(l_i) for l_i in filter_bl_labels(labels)]    

def normalize_tags(tags):
    norm_tags = normalize_labels(tags)
    return repl_labels_2(norm_tags)

In [68]:
def transform_data(data: dict):
    res = {}
    #res['Labels'] = normalize_labels(data['Hubs'] + data['Tags'])
    hubs = normalize_labels(data['Hubs'])
    tags = normalize_tags(data['Tags'])
    res['Labels'] = list(set(hubs + tags))
    res['Id'] = data['Number']
    res['Features'] = tokenize_text(data['Text'])
    return res

In [69]:
transformed_data = unique.map(lambda x: transform_data(x)). \
    filter(lambda x: len(x['Features']) > 0 and len(x['Labels']) > 0)

In [70]:
transformed_data.cache()

PythonRDD[44] at RDD at PythonRDD.scala:43

In [44]:
labels = transformed_data.flatMap(lambda x: x['Labels']).distinct()

In [45]:
labels.count()

583

In [71]:
label_nums = transformed_data.map(lambda x: len(x['Labels'])).collect()

In [72]:
from matplotlib import pyplot as plt

In [73]:
plt.hist(label_nums, bins=40)
plt.show()

In [52]:
transformed_data.count()

114250

In [74]:
excl_features = transformed_data.flatMap(lambda x: set(x['Features'])). \
    map(lambda f: (f,1)). \
    reduceByKey(int.__add__). \
    filter(lambda fc: fc[1] == 1). \
    map(lambda fc: fc[0]).collect()
    
excl_features_set = set(excl_features)
excl_features_br = sc.broadcast(excl_features_set)

In [75]:
def exclude_features(doc: dict):
    doc['Features'] = [f for f in doc['Features'] if f not in excl_features_br.value]
    return doc

In [76]:
transformed_data.map(exclude_features). \
    filter(lambda x: len(x['Features']) > 0). \
    map(lambda x: json.dumps(x)). \
    repartition(6). \
    saveAsTextFile('hdfs://master:54310/exp_f/clean')