# Predictive Data Analytics

In this notebook we will build an ML model and evaluate its results.

In [34]:
from pyspark.sql import SparkSession

# Team identifier in the form "teamX"; it labels the Spark application below.
team = "team15"

# Location of the team's Hive warehouse in HDFS.
warehouse = "project/hive/warehouse"

# Build (or reuse) a Hive-enabled session connected to the cluster metastore.
spark = (
    SparkSession.builder
    .appName("{} - spark ML".format(team))
    .config("hive.metastore.uris", "thrift://hadoop-02.uni.innopolis.ru:9883")
    .config("spark.sql.warehouse.dir", warehouse)
    .config("spark.sql.avro.compression.codec", "snappy")
    .enableHiveSupport()
    .getOrCreate()
)

# We could also add
# .config("spark.sql.catalogImplementation", "hive")
# but enableHiveSupport() above already sets it, so it is not needed here.
# To use Spark's own catalog instead, set "spark.sql.catalogImplementation" to "in-memory" instead of calling enableHiveSupport().


In [35]:
# List the databases visible through the configured metastore.
databases_df = spark.sql("SHOW DATABASES")
databases_df.show()

+--------------------+
|           namespace|
+--------------------+
|             default|
|             root_db|
|     team0_projectdb|
|team12_hive_proje...|
|    team13_projectdb|
|    team14_projectdb|
|    team15_projectdb|
|    team16_projectdb|
|    team18_projectdb|
|    team19_projectdb|
|     team1_projectdb|
|    team20_projectdb|
|    team21_projectdb|
|    team23_projectdb|
|    team25_projectdb|
|    team26_projectdb|
|    team28_projectdb|
|     team2_projectdb|
|    team30_projectdb|
|     team7_projectdb|
+--------------------+
only showing top 20 rows



In [36]:
# Make the team's database the current one for unqualified table names.
# The name is derived from `team` so the team identifier is configured in one place only.
spark.sql("USE {}_projectdb".format(team)).show()

++
||
++
++



In [37]:
# List the tables of the current database.
tables_df = spark.sql("SHOW TABLES")
tables_df.show()

+----------------+--------------------+-----------+
|       namespace|           tableName|isTemporary|
+----------------+--------------------+-----------+
|team15_projectdb|     car_description|      false|
|team15_projectdb|car_vehicles_ext_...|      false|
|team15_projectdb|          q1_results|      false|
|team15_projectdb|          q2_results|      false|
|team15_projectdb|          q3_results|      false|
+----------------+--------------------+-----------+



In [38]:
# Preview the vehicles table; show() prints only the first rows.
preview_query = "SELECT * FROM team15_projectdb.car_vehicles_ext_part_bucket"
spark.sql(preview_query).show()

+----------+--------------------+-----+-----------------+-------------+--------------------+-------------+-----------+------+--------+------------+---------+---------+---------+-----------+------------------+-------------------+--------+
|  entry_id|          region_url|price|manufactured_year| manufacturer|               model|car_condition|  cylinders|  fuel|odometer|transmission|car_drive| car_size| car_type|paint_color|          latitude|          longitude|us_state|
+----------+--------------------+-----+-----------------+-------------+--------------------+-------------+-----------+------+--------+------------+---------+---------+---------+-----------+------------------+-------------------+--------+
|7303328786|wyoming.craigslis...|23995|             2013|       toyota|              tundra|         good|8 cylinders|   gas|  114215|   automatic|      4wd|full-size|    truck|      black|44.354923248291016|-106.68038177490234|      wy|
|7303328988|wyoming.craigslis...|54995|         

In [39]:
# Print one catalog Database entry per line.
print("\n".join(str(database) for database in spark.catalog.listDatabases()))

Database(name='default', description='Default Hive database', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/apps/hive/warehouse')
Database(name='root_db', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/root/root_db')
Database(name='team0_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team0/project/hive/warehouse')
Database(name='team12_hive_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team12/project/hive/warehouse')
Database(name='team13_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team13/project/hive/warehouse')
Database(name='team14_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team14/project/hive/warehouse')
Database(name='team15_projectdb', description='', locationUri='hdfs://hadoop-02.uni.innopolis.ru:8020/user/team15/project/hive/warehouse')
Database(name='team16_projectdb', description='', loc

In [40]:
# Print one catalog Table entry per line for the team database.
team_tables = spark.catalog.listTables("team15_projectdb")
print("\n".join(map(str, team_tables)))

Table(name='car_description', database='team15_projectdb', description=None, tableType='EXTERNAL', isTemporary=False)
Table(name='car_vehicles_ext_part_bucket', database='team15_projectdb', description=None, tableType='EXTERNAL', isTemporary=False)
Table(name='q1_results', database='team15_projectdb', description=None, tableType='EXTERNAL', isTemporary=False)
Table(name='q2_results', database='team15_projectdb', description=None, tableType='EXTERNAL', isTemporary=False)
Table(name='q3_results', database='team15_projectdb', description=None, tableType='EXTERNAL', isTemporary=False)


In [41]:
# Load the vehicles table through the catalog. The storage format comes from
# the table's metastore definition, so no explicit reader format is given
# (a format set on the reader is not what decides how a catalog table is read).
cars = spark.read.table('team15_projectdb.car_vehicles_ext_part_bucket')

In [42]:
# Print the column names, types and nullability of the loaded table.
cars.printSchema()

root
 |-- entry_id: long (nullable = true)
 |-- region_url: string (nullable = true)
 |-- price: long (nullable = true)
 |-- manufactured_year: integer (nullable = true)
 |-- manufacturer: string (nullable = true)
 |-- model: string (nullable = true)
 |-- car_condition: string (nullable = true)
 |-- cylinders: string (nullable = true)
 |-- fuel: string (nullable = true)
 |-- odometer: integer (nullable = true)
 |-- transmission: string (nullable = true)
 |-- car_drive: string (nullable = true)
 |-- car_size: string (nullable = true)
 |-- car_type: string (nullable = true)
 |-- paint_color: string (nullable = true)
 |-- latitude: double (nullable = true)
 |-- longitude: double (nullable = true)
 |-- us_state: string (nullable = true)



In [46]:
# Bring only a small sample to the driver. collect() without a LIMIT copies
# every row of the table into driver memory, while only ten rows are inspected.
# Raise the LIMIT if a larger sample is needed.
lat_long = spark.sql(
    "SELECT region_url, latitude, longitude "
    "FROM team15_projectdb.car_vehicles_ext_part_bucket "
    "LIMIT 10"
).collect()

In [49]:
# Show the first ten collected rows.
lat_long[:10]

[Row(region_url='honolulu.craigslist.org', latitude=20.88819122314453, longitude=-156.45892333984375),
 Row(region_url='honolulu.craigslist.org', latitude=20.888132095336914, longitude=-156.45892333984375),
 Row(region_url='honolulu.craigslist.org', latitude=20.888141632080078, longitude=-156.45887756347656),
 Row(region_url='honolulu.craigslist.org', latitude=20.881399154663086, longitude=-156.47830200195312),
 Row(region_url='honolulu.craigslist.org', latitude=21.329500198364258, longitude=-157.8614959716797),
 Row(region_url='honolulu.craigslist.org', latitude=21.329500198364258, longitude=-157.8614959716797),
 Row(region_url='honolulu.craigslist.org', latitude=21.329500198364258, longitude=-157.8614959716797),
 Row(region_url='honolulu.craigslist.org', latitude=21.317899703979492, longitude=-157.8520965576172),
 Row(region_url='honolulu.craigslist.org', latitude=21.281099319458008, longitude=-157.82659912109375),
 Row(region_url='honolulu.craigslist.org', latitude=21.29980087280273

In [62]:
from pyspark import keyword_only
from pyspark.ml import Transformer
from pyspark.ml.param.shared import HasInputCols, HasOutputCols, Param, Params, TypeConverters
import pyproj


class LatLongToECEF(Transformer, HasInputCols, HasOutputCols):
    """Convert WGS84 latitude/longitude columns to geocentric (ECEF) x, y, z columns.

    Params:
        inputCols: two column names, ``[latitude, longitude]``, in decimal degrees.
        outputCols: three column names that receive the x, y and z coordinates.

    The height above the ellipsoid is taken as 0 for every row. A row whose
    latitude or longitude is null gets null in all three output columns.
    """

    # CRS definitions kept as plain dicts so the UDF closure ships only simple,
    # picklable data to the executors.
    _SOURCE_CRS = {"proj": "latlong", "ellps": "WGS84", "datum": "WGS84"}
    _TARGET_CRS = {"proj": "geocent", "ellps": "WGS84", "datum": "WGS84"}

    @keyword_only
    def __init__(self, inputCols=None, outputCols=None):
        """Create the transformer and store any params passed as keywords."""
        super(LatLongToECEF, self).__init__()
        # Driver-side transformer, kept as a public attribute. The UDF in
        # _transform builds its own instance on the executors instead of
        # capturing this one.
        self.pyproj_transformer = pyproj.Transformer.from_crs(
            self._SOURCE_CRS, self._TARGET_CRS
        )
        # keyword_only stores the keyword arguments on the instance.
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCols=None, outputCols=None):
        """Set ``inputCols`` and/or ``outputCols``; returns ``self``."""
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setInputCols(self, value):
        """
        Sets the value of :py:attr:`inputCols` (``[latitude, longitude]``).
        """
        return self._set(inputCols=value)

    def setOutputCols(self, value):
        """
        Sets the value of :py:attr:`outputCols` (``[x, y, z]``).
        """
        return self._set(outputCols=value)

    def _transform(self, dataset):
        """Return ``dataset`` with the three ECEF output columns added.

        Raises:
            ValueError: if ``inputCols`` does not hold exactly two names or
                ``outputCols`` does not hold exactly three names.
        """
        # Imported locally so the class does not depend on names that the
        # surrounding cell does not import.
        from pyspark.sql import functions as F
        from pyspark.sql.types import DoubleType, StructField, StructType

        input_names = self.getInputCols() if self.isDefined(self.inputCols) else None
        output_names = self.getOutputCols() if self.isDefined(self.outputCols) else None
        if not input_names or len(input_names) != 2:
            raise ValueError("inputCols must contain exactly two column names: [latitude, longitude]")
        if not output_names or len(output_names) != 3:
            raise ValueError("outputCols must contain exactly three column names: [x, y, z]")

        # Copy what the UDF needs into locals so the closure never references `self`.
        source_crs = dict(self._SOURCE_CRS)
        target_crs = dict(self._TARGET_CRS)
        projector_cache = []  # holds the pyproj transformer once it has been built

        def to_ecef(latitude, longitude):
            if latitude is None or longitude is None:
                return None
            if not projector_cache:
                projector_cache.append(pyproj.Transformer.from_crs(source_crs, target_crs))
            # NOTE(review): a proj=latlong CRS is expected to take (longitude, latitude)
            # order in pyproj; confirm against the documentation of the installed version.
            x, y, z = projector_cache[0].transform(longitude, latitude, 0.0, radians=False)
            return float(x), float(y), float(z)

        ecef_fields = ("x", "y", "z")
        ecef_type = StructType([StructField(name, DoubleType(), True) for name in ecef_fields])
        ecef_udf = F.udf(to_ecef, ecef_type)

        # Compute the struct once in a scratch column, then fan it out to the outputs.
        scratch_col = "__ecef_{}".format(self.uid)
        result = dataset.withColumn(
            scratch_col, ecef_udf(dataset[input_names[0]], dataset[input_names[1]])
        )
        for field, output_name in zip(ecef_fields, output_names):
            result = result.withColumn(output_name, result[scratch_col].getField(field))
        return result.drop(scratch_col)

In [None]:
# Import VectorAssembler in this cell so it also runs on a fresh kernel.
from pyspark.ml.feature import VectorAssembler

# NOTE(review): "col1"/"col2" look like placeholder column names; confirm the
# intended input columns before applying this assembler to a DataFrame.
pair_assembler = VectorAssembler(inputCols=["col1", "col2"], outputCol="cols_vector")