In [1]:
import os
import sys
os.environ["PYSPARK_SUBMIT_ARGS"]='--num-executors 5 --executor-memory 4g --executor-cores 1 --driver-memory 2g pyspark-shell --master yarn --deploy-mode cluster'
os.environ["PYSPARK_PYTHON"]='/opt/anaconda/envs/bd9/bin/python'
os.environ["SPARK_HOME"]='/usr/hdp/current/spark2-client'

spark_home = os.environ.get('SPARK_HOME', None)
if not spark_home:
    raise ValueError('SPARK_HOME environment variable is not set')
sys.path.insert(0, os.path.join(spark_home, 'python'))
sys.path.insert(0, os.path.join(spark_home, 'python/lib/py4j-0.10.7-src.zip'))

In [2]:
from pyspark import SparkConf
from pyspark.sql import SparkSession

spark = SparkSession.builder \
                    .master("local[2]") \
                    .appName("lab04_demenev") \
                    .config("spark.driver.memory", "1g") \
                    .config("spark.jars.packages", "org.apache.spark:spark-sql-kafka-0-10_2.11:2.4.5") \
                    .getOrCreate()
spark

In [3]:
from pyspark.sql import functions as f
from pyspark.sql.types import *
from pyspark import Row
import json

In [4]:
from time import time
import datetime
def print_worktime(t):
    h = int(t//3600)
    m = int(t // 60)
    s = (t % 60)
    print('Код отработал за {0} часов {1} минут {2} секунд'.format(h,m,s))

In [5]:
! hadoop fs -ls /labs/slaba04/

Found 1 items
-rw-r--r--   3 hdfs hdfs  655090069 2022-01-06 18:46 /labs/slaba04/gender_age_dataset.txt


In [6]:
train_df = spark.read.csv('/labs/slaba04/gender_age_dataset.txt', header=True, sep="\t")

In [7]:
test_dict = """{"visits": [{"url": "http://sweetrading.ru/?p=900", "timestamp": 1419717886224}, {"url": "http://sweetrading.ru/?p=884", "timestamp": 1419717884437}, {"url": "http://sweetrading.ru/?p=1002", "timestamp": 1419717816375}, {"url": "http://101.ru/?an=port_channel_mp3", "timestamp": 1419717804934}, {"url": "http://sweetrading.ru/?cat=62", "timestamp": 1419714194423}, {"url": "http://sweetrading.ru/?p=1046", "timestamp": 1419713998481}, {"url": "http://sweetrading.ru/?p=978", "timestamp": 1419713927085}, {"url": "http://sweetrading.ru/?cat=171", "timestamp": 1419713908863}, {"url": "http://sweetrading.ru/?cat=62", "timestamp": 1419713908679}, {"url": "http://sweetrading.ru/?p=3648", "timestamp": 1419713798879}, {"url": "http://oesex.ru/955457", "timestamp": 1419595564407}, {"url": "http://www.interfax.ru/russia/408800", "timestamp": 1419542965224}, {"url": "http://101.ru/?an=port_channel_mp3&channel=30", "timestamp": 1418818241900}, {"url": "http://www.interfax.ru/russia/413508", "timestamp": 1418802080857}, {"url": "http://www.euroavtoprokat.ru/sitemap/car-rental/france.htm", "timestamp": 1418722961181}, {"url": "http://www.euroavtoprokat.ru/sitemap/car-rental.htm", "timestamp": 1418722945825}, {"url": "http://www.euroavtoprokat.ru/car-rental/germany.htm", "timestamp": 1418722937847}, {"url": "http://www.euroavtoprokat.ru/car-rental/germany.htm", "timestamp": 1418722923196}, {"url": "http://www.euroavtoprokat.ru/sitemap/car-rental.htm", "timestamp": 1418722909804}, {"url": "http://www.eavtoprokat.ru/prokat-avto/france", "timestamp": 1418646101953}, {"url": "http://www.wordparts.ru/numeral/", "timestamp": 1418592793587}, {"url": "http://rsdn.ru/forum/alg/3305190.flat", "timestamp": 1418591162814}, {"url": "http://www.euroavtoprokat.ru/car-rental/turkey/istanbul.htm", "timestamp": 1418571531780}, {"url": "http://citieslist.ru/", "timestamp": 1418488992092}, {"url": "http://www.euroavtoprokat.ru/car-rental/turkey/istanbul.htm", "timestamp": 1418480798674}, {"url": "http://rutv.ru/brand/show/episode/453757", "timestamp": 1418253037406}, {"url": "http://www.fodors.com/community/europe/best-car-rental-company-in-italy.cfm", "timestamp": 1418247198586}, {"url": "http://wheelsabroad.com/car-rental/united-kingdom/england/london?gclid=cjwkeaia-5-kbrdylpg5096r8masjabqedm4cmiichc-_-ewkbtsqyci5bu9ucwvjmxp4o0tficaarocljdw_wcb", "timestamp": 1418245144696}, {"url": "http://lestinet.com/site/stopagent.ru", "timestamp": 1418243376170}, {"url": "http://android-help.ru/q2a/16774/\u043a\u0430\u043a-\u043f\u043e\u043b\u0443\u0447\u0438\u0442\u044c-root-\u043f\u0440\u0430\u0432\u0430-\u043d\u0430-philips-w832-android-4-0-4", "timestamp": 1418169606439}, {"url": "http://club.dns-shop.ru/rabinovich/blog/\u044f-\u0432\u0441\u0435-\u0435\u0449\u0435-\u0434\u0435\u0440\u0436\u0443\u0441\u044c-\u043e\u0431\u0437\u043e\u0440-\u0441\u043c\u0430\u0440\u0442\u0444\u043e\u043d\u0430-philips-xenium-w832/", "timestamp": 1418169602505}, {"url": "http://www.supportforum.philips.com/ru/showthread.php?1529-philips-xenium-w832/page6", "timestamp": 1418167859617}, {"url": "http://www.supportforum.philips.com/ru/showthread.php?842-\u043d\u0435-\u0440\u0430\u0431\u043e\u0442\u0430\u0435\u0442-gps-\u0432-\u0441\u043c\u0430\u0440\u0442\u0444\u043e\u043d\u0435-philips-xenium-w832", "timestamp": 1418166430112}, {"url": "http://rabota.ua/info/jobsearcher/post/umora.aspx", "timestamp": 1418114698621}, {"url": "http://www.enter.ru/product/appliances/myasorubka-philips-hr2728-2020103007131", "timestamp": 1418053557067}, {"url": "http://www.ferra.ru/ru/byt/news/2013/12/02/polaris-pmg-1805/", "timestamp": 1417866883735}, {"url": "http://www.ferra.ru/ru/byt/news/2013/10/12/bosch-mfw6-propower/", "timestamp": 1417862586856}, {"url": "http://www.linotype.com/1266/neuehelvetica-family.html", "timestamp": 1417856979616}, {"url": "http://www.linotype.com/1546/tradegothic-family.html?site=webfonts", "timestamp": 1417812010753}, {"url": "http://www.vandelaydesign.com/best-ecommerce-website-designs/", "timestamp": 1417807232287}, {"url": "http://www.awwwards.com/20-of-the-very-best-e-commerce-web-sites.html", "timestamp": 1417805189928}, {"url": "http://101.ru/?an=port_channel_mp3&channel=82", "timestamp": 1417711286305}, {"url": "http://www.just.ru/myasorubki/56658_elektromyasorybky_kenwood_mg_450/?from=yandex_msk&utm_source=yandex&utm_medium=cpc&utm_campaign=10817239_model_bytovaya-tehnika-melkaya_msk_p_api&utm_content=612422293_2792852770_\u043c\u044f\u0441\u043e\u0440\u0443\u0431\u043a\u0443 mg 450&position_type=premi", "timestamp": 1417701042306}, {"url": "http://101.ru/?an=port_channel_mp3&channel=5", "timestamp": 1417695760398}, {"url": "http://101.ru/?an=port_channel_mp3&channel=5", "timestamp": 1417689964129}, {"url": "http://101.ru/?an=port_channel_mp3&channel=17", "timestamp": 1417683034834}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417608945879}, {"url": "http://101.ru/?an=port_channel_mp3&channel=24", "timestamp": 1417605700777}, {"url": "http://101.ru/?an=port_channel_mp3&channel=24", "timestamp": 1417605639264}, {"url": "http://101.ru/?an=port_channel_mp3&channel=82", "timestamp": 1417605624817}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg470-meat-grinder-0wmg470008", "timestamp": 1417604804579}, {"url": "http://livedemo00.template-help.com/magento_48517/blackberry-bold-9000-phone.html", "timestamp": 1417604730951}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg470-meat-grinder-0wmg470008", "timestamp": 1417548651645}, {"url": "http://www.kenwoodworld.com/en-int/products/blenders/meat-grinders/mg474-meat-grinder", "timestamp": 1417548321763}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417548310507}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417548309162}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001?feat=6405fda1-43cc-42cc-8860-1c2a492555c5&tabsegment=key-features", "timestamp": 1417548297576}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001?tabsegment=key-features", "timestamp": 1417548284970}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417548264964}, {"url": "http://www.kenwoodworld.com/en-int/products/blenders/meat-grinders", "timestamp": 1417546314287}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg700-meat-grinder-0wmg700006", "timestamp": 1417545459520}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg700-meat-grinder-0wmg700006", "timestamp": 1417545200191}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/kmix-by-kenwood/kmix-kitchen-machines-/kmx51-kmix-kitchen-machine-0wkmx51002", "timestamp": 1417545116313}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/---mg517---0wmg517007", "timestamp": 1417544991760}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417544967371}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001?feat=ac86d868-3ea4-4523-93e1-885bbf4222cd&tabsegment=key-features", "timestamp": 1417544772661}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001?feat=3a288c22-e5f2-448e-a573-ccde95fd2341&tabsegment=key-features", "timestamp": 1417544765049}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001?feat=ac86d868-3ea4-4523-93e1-885bbf4222cd&tabsegment=key-features", "timestamp": 1417544748628}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001?tabsegment=key-features", "timestamp": 1417544731238}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417544522237}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417544351791}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/-mg350-0w21910001", "timestamp": 1417544282950}, {"url": "http://www.kenwoodworld.com/ru-ru", "timestamp": 1417544269909}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg516-meat-grinder-and-roto-food-cutter-0wmg516006?tabsegment=specifications", "timestamp": 1417544204394}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg516-meat-grinder-and-roto-food-cutter-0wmg516006", "timestamp": 1417544190747}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg516-meat-grinder-and-roto-food-cutter-0wmg516006", "timestamp": 1417544045014}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg516-meat-grinder-and-roto-food-cutter-0wmg516006?tabsegment=specifications", "timestamp": 1417544035023}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg516-meat-grinder-and-roto-food-cutter-0wmg516006", "timestamp": 1417544015196}, {"url": "http://www.kenwoodworld.com/ru-ru", "timestamp": 1417544004579}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009?tabsegment=specifications", "timestamp": 1417543914820}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543814629}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543642699}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543628088}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543616074}, {"url": "http://www.kenwoodworld.com/uk/products/food-mixers/chef-major-attachments/potato-peeler-at444-awat444001", "timestamp": 1417543439173}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543352117}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543294005}, {"url": "http://www.kenwoodworld.com/uk/search-results", "timestamp": 1417543192107}, {"url": "http://www.kenwoodworld.com/uk", "timestamp": 1417543022466}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009?tabsegment=specifications", "timestamp": 1417542940415}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009?tabsegment=support", "timestamp": 1417542907491}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009?tabsegment=specifications", "timestamp": 1417542866623}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009", "timestamp": 1417542858206}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009", "timestamp": 1417542839578}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009", "timestamp": 1417542795850}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009", "timestamp": 1417542742883}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009", "timestamp": 1417542725367}, {"url": "http://www.kenwoodworld.com/ru-ru/all-products/blenders-mixers-and-meat-grinders/meat-grinders-ru/mg510-meat-grinder-0wmg510009", "timestamp": 1417542659966}, {"url": "http://www.kenwoodworld.com/ru-ru", "timestamp": 1417542501523}, {"url": "http://101.ru/?an=port_channel_mp3&channel=24", "timestamp": 1417542435930}, {"url": "http://www.shop-script.ru/platform/", "timestamp": 1417473193974}, {"url": "http://101.ru/?an=port_channel_mp3&channel=34", "timestamp": 1417451297674}]} """
visits_test = "[http://metanol.lv/news/dakota_nort_khochet_poekhat_v_swc/, http://metanol.lv/news/, http://metanol.lv/news/sbornaja_avstralii_nakonec_to_podala_sostav_na_pervyj_raund_kubka_luchshikh_par/, http://metanol.lv/news/, http://metanol.lv/news/sparring_betard_sparta_vroclav_zks_row_rybnik_57_33/, http://metanol.lv/news/, http://deita.ru/news/auto/18.03.2015/4860769-honda-priostanovila-postavku-lyubimogo-u-primortsev-avtomobilya/, http://primorye.ru/, http://deita.ru/news/culture/13.03.2015/4857537-pevitsa-linda-priznalas-v-lyubvi-k-vladivostoku-i-rasskazala-pro-novye-proekty/, http://speedway-press.ru/2015/03/, http://speedway-press.ru/, http://speedway-press.ru/, http://irgiz.narod.ru/, http://speedway-press.ru/2015/03/, http://speedway-press.ru/, http://speedway-press.ru/, http://irgiz.narod.ru/, http://irgiz.narod.ru/, http://irgiz.narod.ru/, http://irgiz.narod.ru/, http://n52.adshostnet.com/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://vestiprim.ru/2015/03/19/, http://primorye.ru/, http://vestiprim.ru/2015/03/19/, http://primorye.ru/] "

In [8]:
from urllib.parse import urlparse
domain = urlparse(test_dict).netloc
print(domain)




In [9]:
import re
pattern = r'(https?://([^\s]+)/)'
list_of_url = re.findall(pattern, visits_test)[:10]
[urlparse(url[0]).netloc for url in list_of_url]

['metanol.lv',
 'metanol.lv',
 'metanol.lv',
 'metanol.lv',
 'metanol.lv',
 'metanol.lv',
 'deita.ru',
 'primorye.ru',
 'deita.ru',
 'speedway-press.ru']

In [10]:
pattern_t = r'("timestamp": (\d+))'
list_of_t = re.findall(pattern_t, test_dict)[:10]
list_of_t

[('"timestamp": 1419717886224', '1419717886224'),
 ('"timestamp": 1419717884437', '1419717884437'),
 ('"timestamp": 1419717816375', '1419717816375'),
 ('"timestamp": 1419717804934', '1419717804934'),
 ('"timestamp": 1419714194423', '1419714194423'),
 ('"timestamp": 1419713998481', '1419713998481'),
 ('"timestamp": 1419713927085', '1419713927085'),
 ('"timestamp": 1419713908863', '1419713908863'),
 ('"timestamp": 1419713908679', '1419713908679'),
 ('"timestamp": 1419713798879', '1419713798879')]

In [11]:
@f.udf(ArrayType(StringType()))
def parse_visits(visits: str):
    list_of_url = re.findall(pattern, visits)
    return [urlparse(url[0]).netloc for url in list_of_url]
#     return ".".join([url[1] for url in list_of_url]).split(".")

In [12]:
@f.udf(DoubleType())
def parse_length(times: str):
    list_of_t = re.findall(pattern_t, times)
    list_of_t = [float(t[1]) for t in list_of_t]
    length = max(list_of_t) - min(list_of_t)
    return round(length,4)

In [13]:
len(visits_test.split(","))

43

In [14]:
type(test_dict)

str

In [15]:
train_df.dtypes

[('gender', 'string'),
 ('age', 'string'),
 ('uid', 'string'),
 ('user_json', 'string')]

In [16]:
train_df = train_df.withColumn('visits', parse_visits('user_json'))
train_df = train_df.withColumn('length', parse_length('user_json'))
# train_df.show()

In [17]:
train_df.show(2)

+------+-----+--------------------+--------------------+--------------------+-------------+
|gender|  age|                 uid|           user_json|              visits|       length|
+------+-----+--------------------+--------------------+--------------------+-------------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[zebra-zoya.ru, n...|6.978153933E9|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[sweetrading.ru, ...| 2.26658855E9|
+------+-----+--------------------+--------------------+--------------------+-------------+
only showing top 2 rows



In [18]:
train_df.dtypes

[('gender', 'string'),
 ('age', 'string'),
 ('uid', 'string'),
 ('user_json', 'string'),
 ('visits', 'array<string>'),
 ('length', 'double')]

In [240]:
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import RandomForestClassifier, LogisticRegression, GBTClassifier
from pyspark.ml.feature import Tokenizer, HashingTF, IDF, CountVectorizer
from pyspark.ml.feature import VectorAssembler, StringIndexer
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

In [20]:
concat_f = f.udf(lambda a, b: str(a) + '_' + str(b), StringType())

In [21]:
train_df = train_df.withColumn("gender_age", concat_f('gender','age'))

In [22]:
train_df.filter((f.col('age') == "-") | (f.col('gender') == "-")).count() / train_df.count()

0.12154212650104526

In [23]:
train_df = train_df.filter((f.col('age') != "-") | (f.col('gender') != "-"))

In [24]:
# другой подход
# enc_age = {'18-24' : 1,
#           '25-34' : 2,
#           '35-44' : 3,
#           '45-54' : 4,
#           '>=55' : 5}
# enc_gender = {'F': 0,
#              'M': 1}

# age_enc = f.udf(lambda x: enc_age[x], ByteType())
# gender_enc = f.udf(lambda x: enc_gender[x], ByteType())

# train_df = train_df.withColumn('age_enc', age_enc('age'))
# train_df = train_df.withColumn('gender_enc', gender_enc('gender'))


In [25]:
enc_map = {'M_18-24' : 1,
          'M_25-34' : 2,
          'M_35-44' : 3,
          'M_45-54' : 4,
          'M_>=55' : 5,
          'F_18-24' : 6,
          'F_25-34' : 7,
          'F_35-44' : 8,
          'F_45-54' : 9,
          'F_>=55' : 10,
          }
label_enc = f.udf(lambda x: enc_map[x], ByteType())
train_data = train_df.withColumn('labels', label_enc('gender_age'))

In [26]:
train_df.show(2)

+------+-----+--------------------+--------------------+--------------------+-------------+----------+
|gender|  age|                 uid|           user_json|              visits|       length|gender_age|
+------+-----+--------------------+--------------------+--------------------+-------------+----------+
|     F|18-24|d50192e5-c44e-4ae...|{"visits": [{"url...|[zebra-zoya.ru, n...|6.978153933E9|   F_18-24|
|     M|25-34|d502331d-621e-472...|{"visits": [{"url...|[sweetrading.ru, ...| 2.26658855E9|   M_25-34|
+------+-----+--------------------+--------------------+--------------------+-------------+----------+
only showing top 2 rows



In [101]:
train_df.columns

['gender', 'age', 'uid', 'user_json', 'visits', 'length', 'gender_age']

In [27]:
# train_data = train_data.select('uid', 'visits', 'labels')

# другой подход
# train_data = train_df.select('uid', 'visits', 'age_enc', 'gender_enc')

In [136]:
cv = CountVectorizer(inputCol="visits", outputCol="visits_enc")
rf = RandomForestClassifier(featuresCol='visits_enc', labelCol="labels", numTrees=50)
# vecAssembler = VectorAssembler(outputCol="features") # , handleInvalid="keep"
# vecAssembler.setInputCols(["visits_enc", "length"])
# # другой подход
# rf = RandomForestClassifier(featuresCol='visits_enc', labelCol="age_enc", numTrees=15)

# gbt = GBTClassifier(featuresCol='visits_enc', labelCol="gender_enc", maxBins=50, stepSize=0.1, maxDepth=4)

In [29]:
# cv = CountVectorizer(inputCol="visits", outputCol="visits_enc")
# model_cv = cv.fit(train_data)
# result = model_cv.transform(train_data)

In [30]:
# result.show(2)

In [137]:
pipeline = Pipeline(stages=[
            cv,
            rf
        ])

# другой подход
# cv = CountVectorizer(inputCol="visits", outputCol="visits_enc")
# model_cv = cv.fit(train_data)
# train_data = model_cv.transform(train_data)


In [32]:
# train_data = train_data.select('uid', 'age_enc', 'gender_enc', 'visits_enc')

In [33]:
evaluator = MulticlassClassificationEvaluator(metricName='accuracy', labelCol = "labels", predictionCol="prediction")


In [34]:
# test.show()

In [35]:
train, test= train_data.randomSplit([0.8, 0.2])
# train = train_data.sampleBy("labels", fractions={1: 0.8,
#                                                  2: 0.8,
#                                                  3: 0.8,
#                                                  4: 0.8,
#                                                  5: 0.8,
#                                                  6: 0.8,
#                                                  7: 0.8,
#                                                  8: 0.8,
#                                                  9: 0.8,
#                                                  10: 0.8}, seed=42).cache()
# test = train_data.join(train, on="uid", how="leftanti").coalesce(10).cache()

In [36]:
train.columns

['gender',
 'age',
 'uid',
 'user_json',
 'visits',
 'length',
 'gender_age',
 'labels']

In [37]:
train.groupby('gender_age').agg(f.count('uid')).sort('count(uid)').show()

+----------+----------+
|gender_age|count(uid)|
+----------+----------+
|    M_>=55|       624|
|    F_>=55|       720|
|   M_18-24|      1624|
|   M_45-54|      1741|
|   F_45-54|      2061|
|   F_18-24|      2340|
|   F_35-44|      3426|
|   M_35-44|      4095|
|   F_25-34|      5474|
|   M_25-34|      6962|
+----------+----------+



In [38]:
train.count()

29067

In [39]:
# sample_f = train.filter((f.col('gender_age') == "M_>=55") | (f.col('gender_age') == "F_>=55")).select("*").sample(fraction=(1124/28968)*3)
# train = train.union(sample_f).cache()
# train.count()

In [40]:
test = test.cache()
train = train.cache()

In [41]:
train.show()

+------+-----+--------------------+--------------------+--------------------+-------------+----------+------+
|gender|  age|                 uid|           user_json|              visits|       length|gender_age|labels|
+------+-----+--------------------+--------------------+--------------------+-------------+----------+------+
|     F|18-24|0500be5a-e84b-4ad...|{"visits": [{"url...|[www.roddoma.ru, ...|  8.7331155E7|   F_18-24|     6|
|     F|18-24|052d1f2d-f12f-435...|{"visits": [{"url...|[www.gorenskoe-sp...|          0.0|   F_18-24|     6|
|     F|18-24|05474d68-66b2-4a9...|{"visits": [{"url...|[www.xnxx.com, ww...|     101006.0|   F_18-24|     6|
|     F|18-24|0554173d-0976-4b9...|{"visits": [{"url...|[online.translate...|2.158560583E9|   F_18-24|     6|
|     F|18-24|05647555-f621-4d2...|{"visits": [{"url...|[www.startsmile.r...| 8.97816291E9|   F_18-24|     6|
|     F|18-24|056983f8-f953-494...|{"visits": [{"url...|[evo-centr.e-stil...| 3.46205001E8|   F_18-24|     6|
|     F|18

In [138]:
start = time()
pipeline_model = pipeline.fit(train_data)

# другой подход
# rf_model_age = rf.fit(train)
# rf_model_age = gbt.fit(train)
print_worktime(time()-start)

Код отработал за 0 часов 4 минут 6.415687799453735 секунд


In [92]:
predictions = pipeline_model.transform(test)

In [82]:
list(enc_map.keys())

['M_18-24',
 'M_25-34',
 'M_35-44',
 'M_45-54',
 'M_>=55',
 'F_18-24',
 'F_25-34',
 'F_35-44',
 'F_45-54',
 'F_>=55']

In [86]:
x = 2
list(enc_map.keys())[x-1].split("_")[0]

'M'

In [87]:
decode_g = f.udf(lambda x: list(enc_map.keys())[int(x)-1].split("_")[0])
decode_age = f.udf(lambda x: list(enc_map.keys())[int(x)-1].split("_")[1])

In [93]:
predictions = predictions.select(decode_age('prediction').alias('age'), decode_g('prediction').alias('gender'), 'uid')

In [94]:
predictions.show()

+-----+------+--------------------+
|  age|gender|                 uid|
+-----+------+--------------------+
|25-34|     M|0513b3f0-4ada-4ee...|
|25-34|     M|053b77f9-9c8c-467...|
|25-34|     M|05f08c3f-04d7-4da...|
|25-34|     M|06041b7c-ca15-4c4...|
|25-34|     M|065a265a-db0c-489...|
|25-34|     M|093ee658-beef-4d6...|
|25-34|     M|0940cdad-7a11-42b...|
|25-34|     M|09801ac3-0cc7-40b...|
|25-34|     M|0a2438b3-1181-420...|
|25-34|     M|0a38a1d1-fec6-4d2...|
|25-34|     M|0a410c43-0552-494...|
|25-34|     M|0a6868f4-2cda-425...|
|25-34|     M|0a8636d3-db74-447...|
|25-34|     M|192d666b-5ff7-473...|
|25-34|     M|194079e6-ef4b-4ad...|
|25-34|     M|19eff2e6-a8c9-418...|
|25-34|     M|1f150051-3415-4db...|
|25-34|     M|1f217bd3-e7f9-481...|
|25-34|     M|1f45548c-d8db-403...|
|25-34|     M|1f936078-f8d8-4dc...|
+-----+------+--------------------+
only showing top 20 rows



In [44]:
evaluator.evaluate(predictions)

0.2412671475038891

In [None]:
pipeline_model.save('./model')

In [None]:
train.show(2)

# Kafka

In [103]:
from pyspark.sql.functions import *

In [160]:
def kill_all():
    streams = SparkSession.builder.getOrCreate().streams.active
    if streams:
        for s in streams:
            desc = s.lastProgress["sources"][0]["description"]
            s.stop()
            print("Stopped {s}".format(s=desc))

In [257]:
# чтение
read_kafka_params = {
    "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
    "subscribe": "input_aleksandr.demenev",
    "failOnDataLoss": 'False',
    "startingOffsets": "latest"
}
kafka_sdf = spark.readStream.format("kafka").options(**read_kafka_params).load()

parsed_sdf = kafka_sdf.select(
    json_tuple(col("value").cast("string"), 'uid', 'visits').alias('uid', 'visits')
)

parsed_sdf = parsed_sdf.withColumn('visits', parse_visits('visits'))

In [258]:
trained_model = PipelineModel.load('./model')

In [259]:
parsed_sdf.isStreaming


True

In [267]:
!hadoop fs -ls streaming/chk/chk_kafka/offsets

Found 40 items
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 02:03 streaming/chk/chk_kafka/offsets/0
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:41 streaming/chk/chk_kafka/offsets/1
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:42 streaming/chk/chk_kafka/offsets/10
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:42 streaming/chk/chk_kafka/offsets/11
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:42 streaming/chk/chk_kafka/offsets/12
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:42 streaming/chk/chk_kafka/offsets/13
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:42 streaming/chk/chk_kafka/offsets/14
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023-02-25 13:42 streaming/chk/chk_kafka/offsets/15
-rw-r--r--   3 aleksandr.demenev aleksandr.demenev        446 2023

In [261]:
# kill_all()

In [262]:
predictions = trained_model.transform(parsed_sdf)

In [263]:
predictions = predictions.select(decode_age('prediction').alias('age'), decode_g('prediction').alias('gender'), 'uid')

In [265]:
# запись
write_kafka_params = {
   "kafka.bootstrap.servers": 'spark-master-1.newprolab.com:6667',
   "topic": "aleksandr.demenev"
}
predictions.select(f.to_json(f.struct(*predictions.columns)).alias('value')).writeStream.format("kafka").options(**write_kafka_params)\
    .option("checkpointLocation", "streaming/chk/chk_kafka")\
    .outputMode("append").start()

Py4JJavaError: An error occurred while calling o4418.start.
: java.lang.IllegalStateException: Cannot start query with id baf764d0-dc74-4c9b-81fb-2b51bf6c997c as another query with same id is already active. Perhaps you are attempting to restart a query from checkpoint that is already active.
	at org.apache.spark.sql.streaming.StreamingQueryManager.startQuery(StreamingQueryManager.scala:345)
	at org.apache.spark.sql.streaming.DataStreamWriter.start(DataStreamWriter.scala:325)
	at sun.reflect.NativeMethodAccessorImpl.invoke0(Native Method)
	at sun.reflect.NativeMethodAccessorImpl.invoke(NativeMethodAccessorImpl.java:62)
	at sun.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
	at java.lang.reflect.Method.invoke(Method.java:498)
	at py4j.reflection.MethodInvoker.invoke(MethodInvoker.java:244)
	at py4j.reflection.ReflectionEngine.invoke(ReflectionEngine.java:357)
	at py4j.Gateway.invoke(Gateway.java:282)
	at py4j.commands.AbstractCommand.invokeMethod(AbstractCommand.java:132)
	at py4j.commands.CallCommand.execute(CallCommand.java:79)
	at py4j.GatewayConnection.run(GatewayConnection.java:238)
	at java.lang.Thread.run(Thread.java:745)


In [72]:
kafka_sdf.stop

AttributeError: 'DataFrame' object has no attribute 'stop'

In [69]:
kafka_sdf.take(2)

AnalysisException: 'Queries with streaming sources must be executed with writeStream.start();;\nkafka'