In [10]:
# Read the raw input from HDFS via `sc` (presumably the SparkContext provided
# by the notebook kernel -- it is not created in the visible cells).
raw_data = sc.textFile('hdfs://master:54310/raw_data')

In [11]:
import json
# Parse every element of raw_data with json.loads; each element is assumed to
# hold one complete JSON document (TODO confirm against the data producer).
data = raw_data.map(json.loads)

In [12]:
# De-duplicate documents by 'Uri', keeping a single record per URI.
# reduceByKey throws duplicates away while combining, so no per-URI group has
# to be shuffled and materialised as a list just to read its first element
# (which is what groupBy(...) + list(group)[0] did). Which duplicate survives
# is arbitrary, as it already was with groupBy.
unique = data.keyBy(lambda doc: doc['Uri']). \
    reduceByKey(lambda kept, _duplicate: kept). \
    values()

In [None]:
# Number of records before and after de-duplication by 'Uri'.
data.count(), unique.count()

In [14]:
from snowballstemmer import RussianStemmer, EnglishStemmer
# Stemmer instances created once and reused by stem_words and normalize_label.
russian_stemmer = RussianStemmer()
english_stemmer = EnglishStemmer()

def stem_words(words: list):
    """Stem a list of words with both stemmers.

    The whole list is first passed through the Russian stemmer and its output
    is then passed through the English stemmer.

    :param words: list of word strings.
    :return: the list produced by the English stemmer.
    """
    return english_stemmer.stemWords(russian_stemmer.stemWords(words))

In [15]:
from nltk.tokenize import regexp_tokenize

def split_text(text: str):
    """Split text into tokens consisting of word characters and apostrophes.

    :param text: the string to tokenize.
    :return: the tokens found by ``regexp_tokenize``, in order of appearance.
    """
    # Raw string on purpose: in a plain literal "\w" is an invalid escape
    # sequence, which newer Python versions report with a warning.
    return regexp_tokenize(text, r"[\w']+")

In [16]:
import re

def create_compiled_url_regexp():
    """Build the compiled, case-insensitive pattern used to locate URLs.

    :return: a compiled ``re`` pattern object.
    """
    # The pattern is assembled from adjacent raw-string pieces so the source
    # can be wrapped without line breaks and indentation leaking into the
    # regular expression. When the literal itself spanned several lines (and
    # re.VERBOSE was not used) the embedded "newline + spaces" became required
    # characters of two alternatives, so e.g. bare "domain.tld/path" URLs
    # could never match.
    url_regexp = (
        r'(?i)\b('
        # Prefix: "scheme:" start, "www." style host, or "domain.tld/".
        r'(?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])'
        r'|www\d{0,3}[.]'
        r'|[a-z0-9.\-]+[.][a-z]{2,4}/)'
        # Body: runs of non-space, non-bracket characters or (...) groups.
        r'(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'
        # Tail: a (...) group, or one character that is not punctuation.
        r'(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)'
        r'''|[^\s`!()\[\]{};:'".,<>?«»“”‘’])'''
        r')'
    )
    return re.compile(url_regexp)

# Compile the URL pattern a single time so every call below reuses it.
url_regexp = create_compiled_url_regexp()

def replace_urls(text: str):
    """Return ``text`` with every match of ``url_regexp`` replaced by a space.

    :param text: the string to process.
    :return: a new string; ``text`` itself is not modified.
    """
    return re.sub(url_regexp, ' ', text)

def filter_variable_names(words: list):
    """Drop every word that contains an underscore.

    :param words: list of word strings.
    :return: a new list with the remaining words in their original order.
    """
    kept = []
    for candidate in words:
        if '_' in candidate:
            continue
        kept.append(candidate)
    return kept

def delete_non_word_chars(text: str):
    """Reduce ``text`` to word characters separated by single spaces.

    'ё' is replaced by 'е' first. Then each of the patterns below is replaced
    by one space, in order, and the result is stripped of leading/trailing
    whitespace. Underscores survive because they count as word characters.
    URLs are not treated specially here (``replace_urls`` is not applied).

    :param text: the string to clean.
    :return: the cleaned string.
    """
    cleaned = text.replace('ё', 'е')
    substitutions = (
        r'(&[a-z0-9]*;)',  # "&name;" style encoded symbols
        r'(\W|\d)+',       # runs of non-word characters and digits
        r'\s+',            # runs of whitespace
    )
    for pattern in substitutions:
        cleaned = re.sub(pattern, ' ', cleaned)
    return cleaned.strip()

In [17]:
from nltk.corpus import stopwords

russian_stops = stopwords.words('russian')
english_stops = stopwords.words('english')

# Combined lookup table built once. Set membership is constant-time on
# average, whereas testing every token against both lists costs time
# proportional to the length of the lists.
_all_stops = frozenset(russian_stops).union(english_stops)

def filter_stopwords(words: list):
    """Drop Russian and English stop words.

    :param words: list of word strings.
    :return: a new list with every word that appears in neither stop list,
        in the original order.
    """
    return [word for word in words if word not in _all_stops]

In [18]:
def filter_words_with_repeatable_letters(words: list):
    """Drop words that contain the same character three or more times in a row.

    :param words: list of word strings.
    :return: a new list with the remaining words in their original order.
    """
    # re.search, not re.match: match only tests the beginning of the word, so
    # a run in the middle or at the end (e.g. "baaa") would go unnoticed.
    return [word for word in words if not re.search(r'(.)\1{2}', word)]
    
def is_language_usual(word: str):
    """Tell whether ``word`` has a plausible length for its alphabet.

    A word is treated as English-like when its first character is a lowercase
    ASCII letter. Accepted lengths are 3..14 for such words and 3..24 for all
    other words.

    :param word: the word to check.
    :return: always a real ``bool`` (the previous expression could evaluate to
        ``None`` for long non-English words).
    """
    length = len(word)
    if length <= 2:
        return False
    starts_with_latin = re.match('[a-z]', word) is not None
    # Exclusive upper bounds on the word length.
    max_length = 15 if starts_with_latin else 25
    return length < max_length

def filter_words_with_unusual_by_language_length(words: list):
    """Keep only the words accepted by ``is_language_usual``.

    :param words: list of word strings.
    :return: a new list with the accepted words in their original order.
    """
    return list(filter(is_language_usual, words))

In [19]:
def tokenize_text(text: str):
    """Turn raw text into a list of cleaned, stemmed tokens.

    The text is lower-cased, cleaned with ``delete_non_word_chars`` and split
    with ``split_text``; the resulting tokens then pass through the stages
    listed below, in that order.

    :param text: the raw document text.
    :return: the list of tokens that survive every stage.
    """
    tokens = split_text(delete_non_word_chars(text.lower()))
    stages = (
        filter_variable_names,
        filter_stopwords,
        stem_words,
        filter_words_with_repeatable_letters,
        filter_words_with_unusual_by_language_length,
    )
    for apply_stage in stages:
        tokens = apply_stage(tokens)
    return tokens

In [20]:
def normalize_label(label: str):
    """Normalize one label to a space-joined string of stemmed words.

    The label is lower-cased, 'ё' is replaced by 'е', the text is split with
    ``split_text`` and the words are stemmed with the Russian and then the
    English stemmer.

    :param label: the raw label.
    :return: the stemmed words joined by single spaces.
    """
    words = split_text(label.lower().replace('ё', 'е'))
    for stemmer in (russian_stemmer, english_stemmer):
        words = stemmer.stemWords(words)
    return ' '.join(words)

# Black-listed substrings: a label containing any of them is discarded.
# NOTE(review): the fragments look deliberately truncated at the front,
# presumably so the match does not depend on the first letters -- confirm.
label_bl = {'лог компании', 'рная дыра', 'пиарюсь'}
def filter_bl_labels(labels: list):
    """Drop labels that contain a black-listed substring.

    The check is a plain, case-sensitive substring test on the label as given.

    :param labels: list of label strings.
    :return: a new list with the remaining labels in their original order.
    """
    kept = []
    for label in labels:
        if any(fragment in label for fragment in label_bl):
            continue
        kept.append(label)
    return kept

def normalize_labels(labels: list):
    """Remove black-listed labels and normalize the remaining ones.

    :param labels: list of raw label strings.
    :return: list with ``normalize_label`` applied to every label kept by
        ``filter_bl_labels``.
    """
    return list(map(normalize_label, filter_bl_labels(labels)))

In [21]:
def transform_data(data: dict):
    """Convert one raw document into the record used downstream.

    :param data: raw document; the keys 'Hubs', 'Tags', 'Number' and 'Text'
        are read.
    :return: dict with 'Labels' (normalized hubs followed by tags), 'Id'
        (the document's 'Number') and 'Features' (tokens of its 'Text').
    """
    raw_labels = data['Hubs'] + data['Tags']
    return {
        'Labels': normalize_labels(raw_labels),
        'Id': data['Number'],
        'Features': tokenize_text(data['Text']),
    }

In [22]:
# Apply transform_data to every de-duplicated document.
transformed_data = unique.map(transform_data)

In [23]:
# Mark the transformed RDD for caching: it is consumed by several later cells
# (label counting, the distinct-label count and the final export).
transformed_data.cache()

PythonRDD[7] at RDD at PythonRDD.scala:43

In [24]:
# Count how often each label occurs and collect, as a {label: count} dict on
# the driver, the labels that occur fewer than 30 times.
excluding_labels = (
    transformed_data
    .flatMap(lambda doc: doc['Labels'])
    .map(lambda label: (label, 1))
    .reduceByKey(lambda left, right: left + right)
    .filter(lambda label_count: label_count[1] < 30)
    .collectAsMap()
)

In [26]:
# Number of distinct normalized labels over all documents.
transformed_data.flatMap(lambda l: l['Labels']).distinct().count()

101302

In [25]:
# Number of labels selected for exclusion.
len(excluding_labels)

98773

In [22]:
# Broadcast the exclusion map; filter_labels reads it through `.value`.
excluding_labels_br = sc.broadcast(excluding_labels)

In [23]:
def filter_labels(doc: dict):
    """Return a copy of ``doc`` without the excluded labels.

    :param doc: record with a 'Labels' list; it is left unmodified.
    :return: a new dict with the same keys, whose 'Labels' holds only the
        labels that are not keys of the broadcast exclusion map.
    """
    # Read the broadcast value once per document instead of once per label.
    excluded = excluding_labels_br.value
    # Copy rather than assign into ``doc``: a function handed to RDD.map
    # should not mutate the records it receives.
    filtered = dict(doc)
    filtered['Labels'] = [label for label in doc['Labels']
                          if label not in excluded]
    return filtered

In [24]:
# Drop the excluded labels, keep only documents that still have at least one
# feature and more than two labels, serialize each one as a JSON line and
# write the result to HDFS in 6 partitions.
(
    transformed_data
    .map(filter_labels)
    .filter(lambda doc: len(doc['Features']) > 0 and len(doc['Labels']) > 2)
    .map(json.dumps)
    .repartition(6)
    .saveAsTextFile('hdfs://master:54310/clean_ml')
)