## Task 4 
**Top 2 Highest earning sellers by each location.**</br>
> We have 2 tables with sales history and data with all of the sellers. Sellers are assigned to one location. Our task is to find two sellers with highest profits grouped by location. Because we have information about state and city we will find top sellers grouping by those two categories. We assume that the revenue is equal to price of product subtracted by the freight price that the sellers has to settle. At the end we will create the map of the whole country with scaled markers according to the number of sales done and revenue gained. 

In [None]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import TimestampType
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import folium
from folium import plugins
from IPython.display import clear_output
import pandas as pd

spark = SparkSession.builder.getOrCreate()

In [3]:
# Orders data
# "order_id","order_item_id","product_id","seller_id","shipping_limit_date","price","freight_value"
olist_order_items_dataset = spark.read.options(header='True', inferSchema='True', delimiter=',') \
                            .csv("data/olist_order_items_dataset.csv")
olist_order_items_dataset.show(2)
# Sellers data 
# "seller_id","seller_zip_code_prefix","seller_city","seller_state"
olist_sellers_dataset = spark.read.options(header='True', inferSchema='True', delimiter=',') \
                            .csv("data/olist_sellers_dataset.csv")
olist_sellers_dataset.show(2)

+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|            order_id|order_item_id|          product_id|           seller_id|shipping_limit_date|price|freight_value|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
|00010242fe8c5a6d1...|            1|4244733e06e7ecb49...|48436dade18ac8b2b...|2017-09-19 09:45:35| 58.9|        13.29|
|00018f77f2f0320c5...|            1|e5f2d52b802189ee6...|dd7ddc04e1b6c2c61...|2017-05-03 11:05:13|239.9|        19.93|
+--------------------+-------------+--------------------+--------------------+-------------------+-----+-------------+
only showing top 2 rows

+--------------------+----------------------+-----------+------------+
|           seller_id|seller_zip_code_prefix|seller_city|seller_state|
+--------------------+----------------------+-----------+------------+
|3442f8959a84dea7e...|                 13023|   

In [5]:
# Joining sales with sellers 
# And calculating sum of all the sales 
items = olist_order_items_dataset.select("seller_id","price")
sellers = olist_sellers_dataset.select("seller_id","seller_city","seller_state")
sales_df = items.join(sellers, ['seller_id'], "Inner")
sales_df = sales_df\
    .groupBy('seller_id','seller_state','seller_city')\
    .agg({'price':'sum'})\
    .select('seller_id','seller_state','seller_city',round('sum(price)',2))\
    .withColumnRenamed("round(sum(price), 2)", "sales")
sales_df.show(4)
sales_df.createOrReplaceTempView("join_table")

+--------------------+------------+--------------+--------+
|           seller_id|seller_state|   seller_city|   sales|
+--------------------+------------+--------------+--------+
|7142540dd4c91e223...|          SP|     penapolis|37373.56|
|897060da8b9a21f65...|          SP|ribeirao preto|23023.92|
|318f287a62ab7ac10...|          SP|     sao paulo| 2517.48|
|609e1a9a6c2539919...|          SC|       brusque| 6595.23|
+--------------------+------------+--------------+--------+
only showing top 4 rows



### Top 2 sellers in every state
**using pyspark dataframe**

In [6]:
# Partition by state
window_state = Window.partitionBy('seller_state').orderBy(col('sales').desc())
state_df = sales_df\
    .withColumn('rank',rank().over(window_state))\
    .filter(col('rank')<=2)\
    .select('seller_id','seller_state','sales','rank')

**using pyspark sql**

In [None]:
df = spark.sql("""WITH seller AS (
               SELECT
               seller_id,
               seller_state,
               ROUND(SUM(sales)) AS sum_price
               FROM join_table
               GROUP BY seller_state, seller_id
               )
               SELECT seller_id, seller_state, sum_price
               FROM (SELECT *,
               ROW_NUMBER() OVER(PARTITION BY seller_state ORDER BY sum_price DESC) AS row_number
               FROM seller)
               WHERE row_number < 3
               """)
df.show(4)

### Top 2 sellers in every city
**using pyspark dataframe**

In [7]:
# Partition by city
window_state = Window.partitionBy('seller_city').orderBy(col('sales').desc())
state_df = sales_df\
    .withColumn('rank',rank().over(window_state))\
    .filter(col('rank')<=2)\
    .select('seller_id','seller_city','sales','rank')

**using pyspark sql**

In [None]:
df = spark.sql("""WITH seller AS (
               SELECT
               seller_id,
               seller_city,
               ROUND(SUM(sales)) AS sum_price
               FROM join_table
               GROUP BY seller_city, seller_id
               )
               SELECT seller_id, seller_city, sum_price
               FROM (SELECT *,
               ROW_NUMBER() OVER(PARTITION BY seller_city ORDER BY sum_price DESC) AS row_number
               FROM seller)
               WHERE row_number < 3
               """)
df.show(6)

In [9]:
# Show 6 results of final answer for testing
# Top sellers by states 
state_df.show(6)

# Show 6 results of final answer for testing
# Top sellers by city 
state_df.show(6)

                                                                                

+--------------------+-----------+--------+----+
|           seller_id|seller_city|   sales|rank|
+--------------------+-----------+--------+----+
|da20530872245d6cd...|  igrejinha|  314.96|   1|
|c33847515fa6305ce...|    brusque|15519.85|   1|
|ad97a199236354e53...|    brusque| 13205.9|   2|
|2c4c47cb51acd5ea5...|   buritama|  2575.9|   1|
|f181738b150df1f37...|carapicuiba|  5529.7|   1|
|f680f85bee2d25355...|carapicuiba| 5183.92|   2|
+--------------------+-----------+--------+----+
only showing top 6 rows



                                                                                

+--------------------+-----------+--------+----+
|           seller_id|seller_city|   sales|rank|
+--------------------+-----------+--------+----+
|da20530872245d6cd...|  igrejinha|  314.96|   1|
|c33847515fa6305ce...|    brusque|15519.85|   1|
|ad97a199236354e53...|    brusque| 13205.9|   2|
|2c4c47cb51acd5ea5...|   buritama|  2575.9|   1|
|f181738b150df1f37...|carapicuiba|  5529.7|   1|
|f680f85bee2d25355...|carapicuiba| 5183.92|   2|
+--------------------+-----------+--------+----+
only showing top 6 rows



### Creating map with sales markers 

In [None]:
# Connecting city names with geo codes 
from pyspark.sql.types import *

# Reading geolocation data and creating one geo coords per city 
geo = spark.read.options(header='True', inferSchema='True', delimiter=',') \
                            .csv("data/olist_geolocation_dataset.csv").select('geolocation_city','geolocation_lng','geolocation_lat')
geo = geo.groupBy('geolocation_city')\
    .agg({'geolocation_lat':'avg','geolocation_lng':'avg'})\
    .orderBy('geolocation_city')\
    .withColumnRenamed('avg(geolocation_lat)','lat')\
    .withColumnRenamed('avg(geolocation_lng)','lng')\
    .withColumnRenamed('geolocation_city','seller_city')

# Joining sales df and location df 
loc_df = sales_df\
    .groupBy('seller_city')\
    .agg({'sales':'count'})\
    .withColumnRenamed('count(sales)','num_sales')\
    .join(geo,['seller_city'])
# Adding fraction of all of the sales for point scaling 
sum_sales = loc_df.agg({'num_sales':'sum'}).collect()[0][0]
loc_df = loc_df\
    .withColumn('fract',col('num_sales')/sum_sales)\
    .orderBy(col('fract'))\
    .select('seller_city','lat','lng','fract')

In [None]:
+----------------+---------+-------------------+-------------------+
|     seller_city|num_sales|                lat|                lng|
+----------------+---------+-------------------+-------------------+
|       igrejinha|        1|-29.571977530057968|-50.794729763963296|
|         brusque|       10|-27.100857082718772| -48.91495449463352|
|        buritama|        1| -21.06728750877154| -50.14440187114215|
|     carapicuiba|       10|-23.545578557335205| -46.83877557576887|
|fernando prestes|        1|-21.267134804144067|-48.686678524301236|
|           garca|        4|-22.211232140083208| -49.65819174411557|
+----------------+---------+-------------------+-------------------+

In [None]:
# Initialize folium map
sales_map = folium.Map(
    zoom_start=4,
    location=[-23.54, -48.91], prefer_canvas=True)
def group_val(x):
    if x>=0.2:
        return 0
    elif x>=0.02:
        return 1
    elif x>=0.008:
        return 2
    else:
        return 3

In [None]:
# Collect values from loc_df
# loc_data[i][j] | i for row num and j for 0-city, 1-lat, 2-lng, 3-fract
LAT = 1
LNG = 2
FRA = 3
colors = ['red','orange','yellow','green']
sizes = [13,8,2,1]
# Collect max num of sales 
loc_data = loc_df.collect()
i = 0
# W poprzednim pliku pętla for dla pysparka była spowolniona 
# przez to że collect był wywoływany w każdej iteracji pętli 
# zamiast raz przed pętlą for
for row in range(loc_df.count()):
    folium.CircleMarker(
        location=[loc_data[i][LAT],loc_data[i][LNG]],
        radius=sizes[group_val(loc_data[i][FRA])],
        color=colors[group_val(loc_data[i][FRA])],
        fill=True,
        fill_color=colors[group_val(loc_data[i][FRA])],
        fill_opacity=1
    ).add_to(sales_map)
    i += 1
sales_map.save('sales_map.html')

# Map with markers scaled by number of sales done in the given city
<img src='sales_map.png'/>