In [1]:
# Read the raw dump from HDFS. `sc` is not defined in this notebook; it is
# presumably the SparkContext injected by the PySpark kernel (TODO confirm).
raw_data = sc.textFile('hdfs://master:54310/raw_data')

In [2]:
import json
# Parse every element of raw_data with json.loads.
data = raw_data.map(json.loads)

In [3]:
# De-duplicate records by their 'Uri' field, keeping one record per Uri.
# reduceByKey merges duplicates inside each partition before the shuffle, so
# the job no longer ships every record across the network and builds a full
# list per key just to read its first element. Which of several duplicates
# survives is unspecified, as it was with groupBy.
unique = data.keyBy(lambda x: x['Uri']). \
    reduceByKey(lambda kept, _duplicate: kept). \
    values()

In [4]:
# Record counts before and after de-duplication by Uri.
data.count(), unique.count()

(139209, 128169)

In [5]:
from snowballstemmer import RussianStemmer, EnglishStemmer
# Module-level stemmer instances shared by stem_words and normalize_label.
russian_stemmer = RussianStemmer()
english_stemmer = EnglishStemmer()

def stem_words(words: list):
    """Stem a list of words with both stemmers.

    The words are first passed through the Russian stemmer; its output is
    then passed through the English stemmer, and that result is returned.
    """
    return english_stemmer.stemWords(russian_stemmer.stemWords(words))

In [6]:
from nltk.tokenize import regexp_tokenize

def split_text(text: str):
    """Split ``text`` into tokens made of word characters and apostrophes.

    Returns whatever ``regexp_tokenize`` yields for the pattern ``[\\w']+``.
    """
    # Raw string: in a plain literal '\w' is an invalid escape sequence, which
    # newer Python versions warn about. The pattern value itself is unchanged.
    return regexp_tokenize(text, r"[\w']+")

In [7]:
import re

def create_compiled_url_regexp():
    """Compile and return the case-insensitive URL-matching pattern.

    The pattern is assembled from adjacent raw-string pieces so that the
    source can be wrapped without changing the regular expression. Writing it
    as one multi-line literal (without ``re.VERBOSE``) embedded the line
    breaks and indentation into the pattern, which made the bare
    ``domain.tld/`` alternative and one parenthesised-group alternative
    require a literal newline plus four spaces in the input.
    """
    url_regexp = (
        r'''(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|'''
        r'''[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|'''
        r'''(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:'".,<>?«»“”‘’]))'''
    )
    return re.compile(url_regexp)

# Compile the URL pattern once at module level; replace_urls reuses it.
url_regexp = create_compiled_url_regexp()

def replace_urls(text: str):
    """Return ``text`` with every match of ``url_regexp`` replaced by one space."""
    return re.sub(url_regexp, ' ', text)

def filter_variable_names(words: list):
    """Return the words that contain no underscore, in their original order."""
    kept = []
    for candidate in words:
        if '_' in candidate:
            continue
        kept.append(candidate)
    return kept

def delete_non_word_chars(text: str):
    """Reduce ``text`` to word characters separated by single spaces.

    Steps, in order: replace 'ё' with 'е'; blank out ``&...;`` encoded
    symbols; collapse every run of non-word characters or digits into one
    space; collapse whitespace runs; strip both ends.

    URL removal via ``replace_urls`` is currently not applied here.
    """
    substitutions = (
        (r'(&[a-z0-9]*;)', ' '),  # & encoded symbols
        (r'(\W|\d)+', ' '),       # non word or digit
        (r'\s+', ' '),            # 2+ spaces
    )
    cleaned = text.replace('ё', 'е')
    for pattern, replacement in substitutions:
        cleaned = re.sub(pattern, replacement, cleaned)
    return cleaned.strip()

In [8]:
from nltk.corpus import stopwords

# NLTK stop-word collections for both languages, consulted by filter_stopwords.
russian_stops = stopwords.words('russian')
english_stops = stopwords.words('english')

def filter_stopwords(words: list):
    """Return the words that are neither Russian nor English stop words.

    Order and duplicates of the remaining words are preserved.
    """
    # Build one hash set per call: testing each word against the two
    # module-level collections directly rescans both of them for every word.
    stops = set(russian_stops)
    stops.update(english_stops)
    return [word for word in words if word not in stops]

In [9]:
def filter_words_with_repeatable_letters(words: list):
    """Drop words containing the same character three or more times in a row.

    ``re.search`` is used so the repeated run is detected anywhere in the
    word; ``re.match`` only looked at the very beginning, letting words such
    as 'noooo' through.
    """
    return [word for word in words if not re.search(r'(.)\1{2}', word)]
    
def is_language_usual(word: str):
    """Tell whether the length of ``word`` is plausible for its script.

    A word whose first character is a lowercase Latin letter (a-z) must be
    3 to 14 characters long; any other word must be 3 to 24 characters long.

    Always returns a real ``bool``. The previous chained and/or expression
    returned ``None`` for over-long non-Latin words.
    """
    length = len(word)
    if length <= 2:
        return False
    starts_latin = re.match('[a-z]', word) is not None
    upper_bound = 15 if starts_latin else 25
    return length < upper_bound

def filter_words_with_unusual_by_language_length(words: list):
    """Keep only the words for which ``is_language_usual`` is truthy."""
    return list(filter(is_language_usual, words))

In [10]:
def tokenize_text(text: str):
    """Turn raw text into a list of filtered, stemmed tokens.

    The text is lowercased, cleaned with ``delete_non_word_chars`` and split
    with ``split_text``; the token list then goes through each filtering and
    stemming stage below, in the order listed.
    """
    cleaned = delete_non_word_chars(text.lower())
    tokens = split_text(cleaned)
    stages = (
        filter_variable_names,
        filter_stopwords,
        stem_words,
        filter_words_with_repeatable_letters,
        filter_words_with_unusual_by_language_length,
    )
    for stage in stages:
        tokens = stage(tokens)
    return tokens

In [11]:
def normalize_label(label: str):
    """Normalise a single label into a space-joined string of stemmed tokens.

    The label is lowercased, 'ё' is replaced with 'е', the result is split
    with ``split_text``, stemmed, and the stems are joined with single spaces.
    """
    lowered = label.lower().replace('ё', 'е')
    # Use the shared helper rather than repeating the Russian-then-English
    # stemming sequence here, so labels and features cannot drift apart.
    return ' '.join(stem_words(split_text(lowered)))

label_bl = {'лог компании', 'рная дыра', 'пиарюсь'}
def filter_bl_labels(labels: list):
    """Return the labels that contain none of the ``label_bl`` substrings."""
    def is_blacklisted(label):
        # A label is rejected as soon as any blacklist fragment occurs in it.
        return any(fragment in label for fragment in label_bl)

    return [label for label in labels if not is_blacklisted(label)]

def normalize_labels(labels: list):
    """Drop blacklisted labels, then normalise each remaining one."""
    allowed = filter_bl_labels(labels)
    return list(map(normalize_label, allowed))

In [12]:
def transform_data(data: dict):
    """Build the output record for one input document.

    Reads the 'Hubs', 'Tags', 'Number' and 'Text' keys of ``data`` and
    returns a dict with:
      'Labels'   - normalised labels from the concatenation of Hubs and Tags,
      'Id'       - the value of 'Number',
      'Features' - tokens produced by ``tokenize_text`` from 'Text'.
    """
    return {
        'Labels': normalize_labels(data['Hubs'] + data['Tags']),
        'Id': data['Number'],
        'Features': tokenize_text(data['Text']),
    }

In [13]:
# Transform every de-duplicated record, keep only those with at least one
# feature token and more than two labels, serialise each one with json.dumps,
# repartition into 6 partitions and write the result to HDFS as text.
(unique
 .map(transform_data)
 .filter(lambda rec: len(rec['Features']) > 0 and len(rec['Labels']) > 2)
 .map(json.dumps)
 .repartition(6)
 .saveAsTextFile('hdfs://master:54310/test10'))

Py4JJavaError: An error occurred while calling o65.saveAsTextFile.
: org.apache.spark.SparkException: Job 2 cancelled because Stage 4 was cancelled
	at org.apache.spark.scheduler.DAGScheduler.org$apache$spark$scheduler$DAGScheduler$$failJobAndIndependentStages(DAGScheduler.scala:1280)
	at org.apache.spark.scheduler.DAGScheduler.handleJobCancellation(DAGScheduler.scala:1226)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleStageCancellation$1.apply$mcVI$sp(DAGScheduler.scala:1214)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleStageCancellation$1.apply(DAGScheduler.scala:1213)
	at org.apache.spark.scheduler.DAGScheduler$$anonfun$handleStageCancellation$1.apply(DAGScheduler.scala:1213)
	at scala.collection.IndexedSeqOptimized$class.foreach(IndexedSeqOptimized.scala:33)
	at scala.collection.mutable.ArrayOps$ofInt.foreach(ArrayOps.scala:156)
	at org.apache.spark.scheduler.DAGScheduler.handleStageCancellation(DAGScheduler.scala:1213)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.doOnReceive(DAGScheduler.scala:1466)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1455)
	at org.apache.spark.scheduler.DAGSchedulerEventProcessLoop.onReceive(DAGScheduler.scala:1444)
	at org.apache.spark.util.EventLoop$$anon$1.run(EventLoop.scala:48)
	at org.apache.spark.scheduler.DAGScheduler.runJob(DAGScheduler.scala:567)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1813)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1826)
	at org.apache.spark.SparkContext.runJob(SparkContext.scala:1903)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply$mcV$sp(PairRDDFunctions.scala:1124)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopDataset$1.apply(PairRDDFunctions.scala:1065)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopDataset(PairRDDFunctions.scala:1065)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply$mcV$sp(PairRDDFunctions.scala:989)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$4.apply(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:965)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply$mcV$sp(PairRDDFunctions.scala:897)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
	at org.apache.spark.rdd.PairRDDFunctions$$anonfun$saveAsHadoopFile$1.apply(PairRDDFunctions.scala:897)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
	at org.apache.spark.rdd.PairRDDFunctions.saveAsHadoopFile(PairRDDFunctions.scala:896)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply$mcV$sp(RDD.scala:1426)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
	at org.apache.spark.rdd.RDD$$anonfun$saveAsTextFile$1.apply(RDD.scala:1405)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:147)
	at org.apache.spark.rdd.RDDOperationScope$.withScope(RDDOperationScope.scala:108)
	at org.apache.spark.rdd.RDD.withScope(RDD.scala:306)
	at org.apache.spark.rdd.RDD.saveAsTextFile(RDD.scala:1405)
	at org.apache.spark.api.java.JavaRDDLike$class.saveAsTextFile(JavaRDDLike.scala:522)
	at org.apache.spark.api.java.AbstractJavaRDDLike.saveAsTextFile(JavaRDDLike.scala:47)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:497)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:231)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:379)
	at py4j.Gateway.invoke(Gateway.java:259)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:133)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:207)
	at java.lang.Thread.run(Thread.java:745)
