In [1]:
# Read the raw dump from HDFS via sc.textFile. `sc` is not defined anywhere in
# this notebook — presumably the SparkContext injected by the PySpark kernel
# (TODO confirm).
raw_data = sc.textFile('hdfs://master:54310/raw_data')

In [3]:
import json
# Decode every element of raw_data with json.loads (presumably one JSON
# document per input line — TODO confirm against the dump format).
data = raw_data.map(json.loads)

In [4]:
# Keep a single document per 'Uri'. reduceByKey merges duplicates inside each
# partition before the shuffle, so no group of full documents has to be
# collected into a list just to take one element from it. As with the former
# groupBy, no particular duplicate is guaranteed to be the one that is kept.
unique = data.keyBy(lambda doc: doc['Uri']). \
    reduceByKey(lambda kept, _duplicate: kept). \
    values()

In [5]:
# Record counts before and after de-duplication by 'Uri'.
data.count(), unique.count()

(139209, 128169)

In [6]:
from snowballstemmer import RussianStemmer, EnglishStemmer
# Module-level stemmer instances, created once and reused by stem_words.
russian_stemmer = RussianStemmer()
english_stemmer = EnglishStemmer()

def stem_words(words: list):
    """Stem ``words`` with the Russian stemmer, then with the English one.

    Args:
        words: Tokens to stem.

    Returns:
        The Russian stemmer's output after it has additionally been passed
        through the English stemmer.
    """
    return english_stemmer.stemWords(russian_stemmer.stemWords(words))

In [19]:
from nltk.tokenize import regexp_tokenize

def split_text(text: str):
    """Split ``text`` into tokens made of word characters and apostrophes.

    Args:
        text: The string to tokenize.

    Returns:
        Whatever ``regexp_tokenize`` returns for the token pattern below.
    """
    # Raw string: in a plain literal the backslash-w sequence is an invalid
    # escape that newer Python versions warn about. The pattern itself is
    # unchanged.
    return regexp_tokenize(text, r"[\w']+")

In [9]:
import re

def create_compiled_url_regexp():
    """Build and compile the case-insensitive pattern used to find URLs.

    The pattern is assembled from adjacent raw-string pieces. It used to be a
    single multi-line literal compiled without ``re.VERBOSE``, which made the
    line breaks and indentation literal parts of the regex, so the
    bare-domain alternative (``example.com/...``) could never match ordinary
    text.

    Returns:
        The compiled regular expression object.
    """
    url_regexp = (
        # Scheme prefix ("http://", "mailto:" ...) or a "www." host ...
        r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'''
        # ... or a bare domain followed by a slash.
        r'''[a-z0-9.\-]+[.][a-z]{2,4}/)'''
        # Body: runs of non-space characters or balanced parentheses.
        r'''(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+'''
        # Tail: a balanced parenthesis group ...
        r'''(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|'''
        # ... or any character that is not trailing punctuation.
        r'''[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    )
    return re.compile(url_regexp)

# Compile the URL pattern once, when the cell runs, so replace_urls can reuse it.
url_regexp = create_compiled_url_regexp()

def replace_urls(text: str):
    """Return ``text`` with every match of ``url_regexp`` replaced by a space.

    Args:
        text: The string to scan.

    Returns:
        A new string in which each match is substituted by a single space.
    """
    blank = ' '
    return re.sub(url_regexp, blank, text)

def filter_variable_names(words: list):
    """Drop every word that contains an underscore.

    Args:
        words: Tokens to filter.

    Returns:
        A new list with the underscore-free tokens, in their original order.
    """
    kept = []
    for token in words:
        if '_' in token:
            continue
        kept.append(token)
    return kept

def delete_non_word_chars(text: str):
    """Reduce ``text`` to word characters separated by single spaces.

    URL replacement is not applied here. The substitutions run in this order:
    the letter 'ё' becomes 'е'; ``&...;`` sequences of lowercase letters and
    digits become a space; runs of non-word characters or digits become a
    space; runs of whitespace collapse to one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string with leading and trailing whitespace stripped.
    """
    substitutions = (
        (r'(&[a-z0-9]*;)', ' '),  # & encoded symbols
        (r'(\W|\d)+', ' '),       # non word or digit
        (r'\s+', ' '),            # 2+ spaces
    )
    cleaned = text.replace('ё', 'е')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [4]:
# Scratch string used by the next cell to check how str.replace treats repeated matches.
A = "AAAAAA"

In [5]:
# Every occurrence is substituted, so the six 'A's turn into twelve 'B's.
A.replace('A','BB')

'BBBBBBBBBBBB'

In [6]:
from nltk.corpus import stopwords

russian_stops = stopwords.words('russian')
english_stops = stopwords.words('english')
# Hashed union of both stop lists, built once, so that each membership test is
# a constant-time lookup instead of a scan of two lists for every token.
_all_stops = frozenset(russian_stops) | frozenset(english_stops)

def filter_stopwords(words: list):
    """Drop Russian and English stop words from ``words``.

    Args:
        words: Tokens to filter; compared verbatim against the stop lists.

    Returns:
        A new list with the tokens found in neither stop list, in their
        original order.
    """
    return [word for word in words if word not in _all_stops]

In [7]:
def filter_words_with_repeatable_letters(words: list):
    """Drop words that contain the same character three times in a row.

    Args:
        words: Tokens to filter.

    Returns:
        A new list with the tokens that have no such run, in their original
        order.
    """
    # re.search, not re.match: match only anchors at the start of the word, so
    # a run such as the "ooo" in "cooool" used to slip through.
    return [word for word in words if not re.search(r'(.)\1{2}', word)]
    
def is_language_usual(word: str):
    """Tell whether ``word`` has a plausible length for its alphabet.

    A word counts as English when its first character is a lowercase ASCII
    letter. English words must be 3-14 characters long, all other words
    3-24 characters long.

    Args:
        word: The token to check.

    Returns:
        ``True`` or ``False``. Always a real bool: the previous boolean chain
        could return ``None`` for a long non-English word.
    """
    length = len(word)
    if length <= 2:
        return False
    max_length = 15 if re.match('[a-z]', word) else 25
    return length < max_length

def filter_words_with_unusual_by_language_length(words: list):
    """Keep only the words that ``is_language_usual`` accepts.

    Args:
        words: Tokens to filter.

    Returns:
        A new list with the accepted tokens, in their original order.
    """
    return list(filter(is_language_usual, words))

In [8]:
def tokenize_text(text: str):
    """Turn raw ``text`` into a list of filtered, stemmed tokens.

    The text is lowercased, cleaned with ``delete_non_word_chars`` and split
    with ``split_text``; the token list is then passed through each filter
    and the stemmer in the order listed below.

    Args:
        text: The raw text of a document, comment or code comment.

    Returns:
        The list of tokens that survive every stage.
    """
    cleaned = delete_non_word_chars(text.lower())
    tokens = split_text(cleaned)
    stages = (
        filter_variable_names,
        filter_stopwords,
        stem_words,
        filter_words_with_repeatable_letters,
        filter_words_with_unusual_by_language_length,
    )
    for stage in stages:
        tokens = stage(tokens)
    return tokens

In [9]:
def transform_data(data: dict):
    """Build the output record for one document.

    'Hubs' and 'Tags' are copied as they are, 'Number' is stored under 'Id',
    and 'Text' is tokenized. 'Comments' and 'CodeComments' are each joined
    with spaces and then tokenized.

    Args:
        data: A document dict with the keys 'Hubs', 'Number', 'Text',
            'Comments', 'CodeComments' and 'Tags'.

    Returns:
        A new dict with the keys 'Hubs', 'Id', 'Text', 'Comments',
        'CodeComments' and 'Tags'.
    """
    return {
        'Hubs': data['Hubs'],
        'Id': data['Number'],
        'Text': tokenize_text(data['Text']),
        'Comments': tokenize_text(' '.join(data['Comments'])),
        'CodeComments': tokenize_text(' '.join(data['CodeComments'])),
        'Tags': data['Tags'],
    }

In [10]:
# Tokenize every de-duplicated document, skip those whose 'Text' ends up
# empty, and write the rest as JSON lines in four partitions.
# Needs the cell that defines `unique` to have run in the current kernel; the
# NameError recorded below this cell comes from running it without that.
(
    unique
    .map(transform_data)
    .filter(lambda doc: doc['Text'])
    .map(json.dumps)
    .repartition(4)
    .saveAsTextFile('hdfs://master:54310/stemmed')
)

NameError: name 'unique' is not defined