## Task 4 
**Top 2 highest-earning sellers in each location.**<br/>
> We have two tables: one with the sales history and one describing all of the sellers. Each seller is assigned to exactly one location. Our task is to find the two sellers with the highest revenue in each location. Because we have information about both state and city, we find the top sellers grouped by each of those two categories. We assume that the revenue of a sale is equal to the price of the product. At the end we create a map of the whole country with markers scaled according to the number of sales made and the revenue gained. 

In [2]:
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
from pyspark.sql.types import TimestampType
import pyspark.sql.functions as F
from pyspark.sql.window import Window
import folium
from folium import plugins
from IPython.display import clear_output
import pandas as pd

# Reuse the active SparkSession if one exists; otherwise create one with the builder's default settings.
spark = SparkSession.builder.getOrCreate()

In [9]:
# Load the parquet tables, keeping only the columns that are needed.

# Order items (order id, seller id, item price)
items = (
    spark.read.parquet('data_parquet/olist_order_items_dataset.parquet')
    .select('order_id', 'seller_id', 'price')
)

# Sellers; the zip prefix column is renamed to the shorter `zip_code`
sellers = (
    spark.read.parquet('data_parquet/olist_sellers_dataset.parquet')
    .select('seller_id', 'seller_zip_code_prefix', 'seller_city', 'seller_state')
    .withColumnRenamed('seller_zip_code_prefix', 'zip_code')
)

# Geolocation, keyed by the same `zip_code` name as `sellers`.
# NOTE: `geo` is reassigned further down from the CSV geolocation file.
geo = (
    spark.read.parquet('data_parquet/olist_geolocation_dataset.parquet')
    .select('geolocation_zip_code_prefix', 'geolocation_lat', 'geolocation_lng')
    .withColumnRenamed('geolocation_zip_code_prefix', 'zip_code')
)

# Orders (order id and purchase timestamp)
orders = (
    spark.read.parquet('data_parquet/olist_orders_dataset.parquet')
    .select('order_id', 'order_purchase_timestamp')
)

# Schemas of the selected columns (names as stored in the files, before renaming)
#
# order items:
# |-- order_id: string (nullable = true)
# |-- seller_id: string (nullable = true)
# |-- price: double (nullable = true)
#
# sellers:
# |-- seller_id: string (nullable = true)
# |-- seller_zip_code_prefix: integer (nullable = true)
# |-- seller_city: string (nullable = true)
# |-- seller_state: string (nullable = true)
#
# geolocation:
# |-- geolocation_zip_code_prefix: integer (nullable = true)
# |-- geolocation_lat: double (nullable = true)
# |-- geolocation_lng: double (nullable = true)
#
# orders:
# |-- order_id: string (nullable = true)
# |-- order_purchase_timestamp: string (nullable = true)

In [16]:
# Total revenue per seller (sum of item prices), enriched with the seller's
# location columns so the result can later be partitioned by state or city.
revenue_per_seller = items.groupBy('seller_id').agg(F.sum('price').alias('revenue'))
sales = revenue_per_seller.join(sellers, on=['seller_id'])

### Top 2 sellers in every state
**using pyspark dataframe**

In [24]:
# Rank sellers inside each state by revenue, highest first.
window_state = Window.partitionBy('seller_state').orderBy(F.col('revenue').desc())

# Keep ranks 1 and 2 per state. rank() gives tied revenues the same rank,
# so a state with ties can contribute more than two rows.
state_df = (
    sales
    .withColumn('rank', F.rank().over(window_state))
    .filter(F.col('rank') <= 2)
    .select('seller_id', 'seller_state', 'revenue')
    .orderBy(F.col('revenue').desc())
)

### Top 2 sellers in every city
**using pyspark dataframe**

In [25]:
# Rank sellers inside each city by revenue, highest first.
# The window gets its own name so it does not overwrite the per-state window
# defined in the previous cell.
# NOTE(review): the partition key is the city name only; cities that share a
# name across different states would be ranked together -- confirm whether
# ('seller_state', 'seller_city') is the intended key.
window_city = Window.partitionBy('seller_city').orderBy(F.col('revenue').desc())

# Keep ranks 1 and 2 per city. rank() gives tied revenues the same rank,
# so a city with ties can contribute more than two rows.
city_df = (
    sales
    .withColumn('rank', F.rank().over(window_city))
    .filter(F.col('rank') <= 2)
    .select('seller_id', 'seller_city', 'revenue')
    .orderBy(F.col('revenue').desc())
)

In [26]:
# Print the first 12 rows of each answer: states first, then cities.
for heading, top_df in (
    ('Top 2 sellers from every state:\n', state_df),
    ('\nTop 2 sellers from every city:\n', city_df),
):
    print(heading)
    top_df.show(12)

Top 2 sellers from every state:



                                                                                

+--------------------+------------+------------------+
|           seller_id|seller_state|           revenue|
+--------------------+------------+------------------+
|4869f7a5dfa277a7d...|          SP| 229472.6300000005|
|53243585a1d6dc264...|          BA|222776.05000000002|
|4a3ca9315b744ce9f...|          SP| 200472.9200000013|
|46dc3b2cc0980fb8e...|          RJ|128111.19000000028|
|620c87c171fb2a6dd...|          RJ|114774.50000000041|
|a1043bafd471dff53...|          MG|101901.16000000018|
|ccc4bbb5f32a6ab2b...|          PR|          74004.62|
|04308b1ee57b6625f...|          SC| 60130.59999999999|
|522620dcb18a6b31c...|          PR| 57168.48999999999|
|de722cd6dad950a92...|          PE|55426.099999999926|
|25c5c91f63607446a...|          MG| 54679.21999999999|
|eeb6de78f79159600...|          SC|43739.840000000004|
+--------------------+------------+------------------+
only showing top 12 rows


Top 2 sellers from every city:





+--------------------+----------------+------------------+
|           seller_id|     seller_city|           revenue|
+--------------------+----------------+------------------+
|4869f7a5dfa277a7d...|         guariba| 229472.6300000005|
|53243585a1d6dc264...|lauro de freitas|222776.05000000002|
|4a3ca9315b744ce9f...|        ibitinga| 200472.9200000013|
|fa1c13f2614d7b5c4...|          sumare|194042.03000000038|
|7c67e1448b00f6e96...| itaquaquecetuba|         187923.89|
|7e93a43ef30c4f03f...|         barueri|         176431.87|
|da8622b14eb17ae28...|      piracicaba|160236.57000000114|
|7a67c85e85bb2ce85...|       sao paulo|141745.53000000032|
|1025f0e2d44d7041d...|       sao paulo|138968.55000000022|
|46dc3b2cc0980fb8e...|  rio de janeiro|128111.19000000028|
|620c87c171fb2a6dd...|      petropolis|114774.50000000041|
|7d13fca1522535862...|  ribeirao preto|113628.97000000007|
+--------------------+----------------+------------------+
only showing top 12 rows





### Creating map with sales markers 

In [None]:
# Connecting city names with geo codes
from pyspark.sql.types import *

# Read the geolocation CSV and reduce it to one coordinate pair per city:
# the average latitude / longitude of all rows that carry that city name.
# The city column is renamed so it can be joined against the seller data.
geo = (
    spark.read
    .options(header='True', inferSchema='True', delimiter=',')
    .csv("data/olist_geolocation_dataset.csv")
    .select('geolocation_city', 'geolocation_lng', 'geolocation_lat')
    .groupBy('geolocation_city')
    .agg(
        F.avg('geolocation_lat').alias('lat'),
        F.avg('geolocation_lng').alias('lng'),
    )
    .withColumnRenamed('geolocation_city', 'seller_city')
)

# Number of sold items per seller city, joined with the city coordinates.
# Built directly from `items` and `sellers` so the cell runs on a fresh kernel
# (the previously referenced `sales_df` is not defined anywhere in the notebook).
# NOTE(review): "number of sales" is taken to mean the number of order items
# sold by sellers of that city -- confirm this matches the intended metric.
loc_df = (
    items
    .join(sellers, ['seller_id'])
    .groupBy('seller_city')
    .agg(F.count('order_id').alias('num_sales'))
    .join(geo, ['seller_city'])  # inner join: cities without coordinates are dropped
)

# Share of all (geo-matched) sales per city, used later to scale the markers.
sum_sales = loc_df.agg(F.sum('num_sales')).collect()[0][0]
loc_df = (
    loc_df
    .withColumn('fract', F.col('num_sales') / sum_sales)
    .orderBy(F.col('fract'))
    .select('seller_city', 'lat', 'lng', 'fract')
)

In [None]:
# Reference snapshot only (a bare string, no computation): sample rows with the
# columns seller_city / num_sales / lat / lng, i.e. the per-city frame as it
# looks after the geo join and before the `fract` column replaces `num_sales`.
'''+----------------+---------+-------------------+-------------------+
|     seller_city|num_sales|                lat|                lng|
+----------------+---------+-------------------+-------------------+
|       igrejinha|        1|-29.571977530057968|-50.794729763963296|
|         brusque|       10|-27.100857082718772| -48.91495449463352|
|        buritama|        1| -21.06728750877154| -50.14440187114215|
|     carapicuiba|       10|-23.545578557335205| -46.83877557576887|
|fernando prestes|        1|-21.267134804144067|-48.686678524301236|
|           garca|        4|-22.211232140083208| -49.65819174411557|
+----------------+---------+-------------------+-------------------+'''

In [None]:
# Create the base folium map the sales markers are added to.
map_center = [-23.54, -48.91]  # [latitude, longitude] the map opens on
sales_map = folium.Map(location=map_center, zoom_start=4, prefer_canvas=True)
def group_val(x):
    """Return the marker bucket index (0-3) for a sales fraction.

    Buckets are defined by descending lower bounds:
    0 for x >= 0.2, 1 for x >= 0.02, 2 for x >= 0.008, and 3 for anything
    smaller. The result is used as an index into the marker colour and size
    lists when the map is drawn.
    """
    lower_bounds = (0.2, 0.02, 0.008)
    # The first bound that x reaches decides the bucket.
    for bucket, bound in enumerate(lower_bounds):
        if x >= bound:
            return bucket
    # Below every bound: last bucket.
    return len(lower_bounds)

In [None]:
# Column positions inside each collected row of loc_df:
# 0 - seller_city, 1 - lat, 2 - lng, 3 - fract
LAT = 1
LNG = 2
FRA = 3
# Marker colour and radius per bucket returned by group_val (index 0-3).
colors = ['red','orange','yellow','green']
sizes = [13,8,2,1]
# Bring all rows to the driver exactly once, before the loop.
# In the previous file the PySpark for-loop was slow because collect() was
# called on every iteration instead of a single time ahead of the loop.
loc_data = loc_df.collect()
# Iterate over the collected rows directly: no extra loc_df.count() Spark job
# and no manual index that could drift from the collected data.
for row in loc_data:
    bucket = group_val(row[FRA])  # computed once per row, reused below
    folium.CircleMarker(
        location=[row[LAT], row[LNG]],
        radius=sizes[bucket],
        color=colors[bucket],
        fill=True,
        fill_color=colors[bucket],
        fill_opacity=1,
        popup=row[FRA],
        tooltip=row[FRA]
    ).add_to(sales_map)
sales_map.save('sales_map.html')

# Map with markers scaled by number of sales done in the given city
<img src='sales_map.png'/>

In [None]:
# Packages needed for the PNG export (uncomment to install):
#!pip install Pillow
#!pip install selenium
import io
from PIL import Image

# Render the map to PNG bytes through folium's private `_to_png` helper.
# NOTE(review): 5 is passed as its first positional argument -- presumably the
# render delay in seconds; confirm against the installed folium version.
img_data = sales_map._to_png(5)
# Wrap the raw bytes in an in-memory stream so Pillow can decode them,
# then write the image next to the notebook.
png_stream = io.BytesIO(img_data)
img = Image.open(png_stream)
img.save('sales_map.png')