# Saudi Point of Sales Transactions Dashboard

In [None]:
# import libraries
import pandas as pd
import numpy as np
import glob


import pdfplumber
import re
from helper import * 


import requests
from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support.wait import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import WebDriverException
from fake_useragent import UserAgent
from geopy.geocoders import Nominatim

In [None]:
# Collect the download links of the point-of-sale PDF reports listed on the
# SAMA index page. The result is left in `FileRefNew` (a list of URL strings).
url = "https://www.sama.gov.sa/ar-sa/Indices/pages/pos.aspx"
# Alternative entry point: a later page of the same paged listing.
#url= "https://www.sama.gov.sa/ar-sa/Indices/Pages/POS.aspx?Paged=TRUE&p_SortBehavior=0&p_Created=20210316%2013%3a24%3a35&p_ID=50&PageFirstRow=31&View=e107b92a-9e94-4513-b248-dc9a0beeae39"

with requests.Session() as s:
    s.headers = {'User-Agent': 'Mozilla/5.0 '}

    # A timeout keeps the notebook from hanging forever on a stalled server.
    response = s.get(url, timeout=30)
    # Fail loudly on an HTTP error instead of regex-scanning an error page.
    response.raise_for_status()
    # Name the parser explicitly so the output does not depend on which
    # optional parsers happen to be installed.
    soup = BeautifulSoup(response.content, "html.parser")

    # Each match runs from the text "FileRef" up to the first '.', '!' or '?'.
    # Raw string: "\." in a normal string literal is an invalid escape sequence.
    FileRef = re.findall(r"FileRef.*?[\.!?]", str(soup), re.MULTILINE | re.DOTALL)

    # 1. swap the escaped site-relative folder for the absolute URL prefix,
    # 2. append the extension (the match stops at the dot before it),
    # 3. drop the first 11 characters of the match
    #    (presumably the `FileRef": "` key prefix -- verify against the page).
    FileRefNew = [
        (item.replace("\\u002far-sa\\u002fIndices\\u002fPOS\\u002f",
                      "https://www.sama.gov.sa/ar-sa/Indices/POS/") + 'pdf')[11:]
        for item in FileRef
    ]

    # Show a sample link, without crashing when the page yielded no match.
    if FileRefNew:
        print(FileRefNew[0])
    else:
        print("No PDF links found on " + url)


In [None]:
# Loop through all urls and download pdfs

# Manage firefox specific settings in a way that geckodriver can understand.
# The preferences are identical for every download, so build them once.
options = webdriver.FirefoxOptions()
options.set_preference("browser.download.folderList", 2)
options.set_preference("browser.download.manager.showWhenStarting", False)
options.set_preference("browser.download.dir", "SaudiPointOfSales/pdf")
options.set_preference("browser.download.useDownloadDir", True)
options.set_preference("browser.helperApps.neverAsk.saveToDisk", "application/pdf")
options.set_preference("pdfjs.disabled", True)
options.set_preference("pdfjs.enabledCache.state", False)
options.set_preference("plugin.scan.Acrobat", "99.0")
options.set_preference("plugin.scan.plid.all", False)

for url in FileRefNew:
    # Bound before the try so the cleanup below is safe even when the
    # browser fails to start.
    driver = None
    try:
        # call selenium driver to automate web browser interaction
        driver = webdriver.Firefox(executable_path=r"SaudiPointOfSales/geckodriver", options=options)
        # open an URL
        driver.get(url)
        # NOTE(review): implicitly_wait only sets the element-lookup timeout;
        # it does not pause until the download has finished -- confirm the
        # files are complete before the browser is shut down.
        driver.implicitly_wait(10)
    finally:
        # quit() ends the session and stops the geckodriver process; close()
        # only closes the window. A WebDriverException still propagates and
        # stops the loop, without terminating the interpreter via exit().
        if driver is not None:
            driver.quit()

In [None]:
# Scraping tables from pdfs & save them in dataframes
def extract_summary_columns(rows):
    """Return the cleaned label/count/value columns of one extracted PDF table.

    rows: the table as a list of row lists (what ``Table.extract`` returns).

    Columns 0, 7 and 8 are kept. The cell at row 0 / column 7 is copied to
    every row as a 'Date' column, then the first two rows are dropped and the
    index is reset.
    """
    # .copy() makes the column subset an independent frame, so adding a
    # column below does not raise SettingWithCopyWarning.
    frame = pd.DataFrame(rows)[[0, 7, 8]].copy()
    frame['Date'] = frame.loc[0, 7]
    return frame.iloc[2:].reset_index(drop=True)


# Same folder the download step writes to (relative to the working directory).
arr_of_files = glob.glob(r"SaudiPointOfSales/pdf/*.pdf")

# Per-file frames are collected in lists and concatenated once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the data each call.
sector_frames = []
city_frames = []

for i in arr_of_files:
    # Plumb a PDF for its tables; only the first page is read, and it is
    # expected to hold two tables (first one -> sectors, second one -> cities).
    with pdfplumber.open(i) as pdf:
        tables = pdf.pages[0].find_tables()
        sector_frames.append(extract_summary_columns(tables[0].extract(x_tolerance=5)))
        city_frames.append(extract_summary_columns(tables[1].extract(x_tolerance=5)))

# pd.concat raises on an empty list, so fall back to an empty frame.
sectors_df = pd.concat(sector_frames) if sector_frames else pd.DataFrame()
cities_df = pd.concat(city_frames) if city_frames else pd.DataFrame()

print("** Done converting tables to data frames **")

# Give the positional PDF columns descriptive names.
value_columns = {7: 'Number of Transactions', 8: 'Value of Transactions'}
sectors_df = sectors_df.rename(columns={0: 'Sector', **value_columns})
cities_df = cities_df.rename(columns={0: 'City', **value_columns})

# `to_int` comes from the helper module; presumably it converts the named
# column in place -- confirm against helper.py.
for column in ('Value of Transactions', 'Number of Transactions'):
    to_int(cities_df, column)

# Parsing dates: split the 'Date' text on its first "-" into a start part and
# an end part, then drop the combined column.
for frame in (sectors_df, cities_df):
    date_parts = frame['Date'].str.split("-", n=1, expand=True)
    frame["Start Date"] = date_parts[0]
    frame["End Date"] = date_parts[1]
    del frame["Date"]
    # TODO: transform the date into year, month, week

# Doing more data cleansing (both functions come from the helper module).
sectors(sectors_df)
cities(cities_df)


# Add latitude & longitude for map chart
# as_index=False keeps 'English_City' as a regular column; with the default it
# becomes the index and the column lookups below raise KeyError.
group_City = cities_df.groupby(by='English_City', as_index=False).agg(
    {'Value of Transactions': 'sum', 'Number of Transactions': 'sum'})

location = [x for x in group_City['English_City'].unique().tolist()
            if isinstance(x, str)]

# One geocoder is enough for every lookup.
geolocator = Nominatim(user_agent="sa_explorer@gmail.com")

latitude = []
longitude = []
for city in location:
    try:
        loc = geolocator.geocode(city + ', Saudi Arabia')
    except Exception as err:
        # Best effort: a failed lookup must not abort the whole run.
        print('Could not geocode {}: {}'.format(city, err))
        loc = None

    if loc is None:
        # No result for this city: add nan to both lists so they stay
        # aligned with `location`.
        latitude.append(np.nan)
        longitude.append(np.nan)
        continue

    latitude.append(loc.latitude)
    longitude.append(loc.longitude)
    print('The geographical coordinate of location are {}, {}.'.format(loc.latitude, loc.longitude))

# create a dataframe with the location, latitude and longitude
df_ = pd.DataFrame({'English_City': location,
                    'location_latitude': latitude,
                    'location_longitude': longitude})
# merge on English_City to attach the coordinates to the grouped totals
Grouped_City = group_City.merge(df_, on='English_City', how='left')

# Give the catch-all 'OTHER' bucket fixed coordinates. .loc is required for a
# boolean mask; .at only accepts a single row label.
is_other = Grouped_City['English_City'] == 'OTHER'
Grouped_City.loc[is_other, 'location_latitude'] = 25.0
Grouped_City.loc[is_other, 'location_longitude'] = 45.0

sectors_df.to_csv('output/sectors_df.csv', index=False)
cities_df.to_csv('output/cities_df.csv', index=False)
Grouped_City.to_csv('output/Grouped_City.csv', index=False)

print("** Done cleansing data frames **")

In [None]:
import os

import plotly.express as px

# Register the Mapbox token used by the map chart. The MAPBOX_ACCESS_TOKEN
# environment variable, when set, overrides the token kept in the notebook.
# The import is repeated here so the cell also runs in top-to-bottom order.
px.set_mapbox_access_token(os.environ.get(
    'MAPBOX_ACCESS_TOKEN',
    'pk.eyJ1IjoiYWxzdXR1cmtpIiwiYSI6ImNrdjUzOXM4cTAzZmIydnBqMWh1cms0a2MifQ.HDRkBwCGJl3wMaWzsyMtDQ'))

In [90]:
import plotly.express as px

# Bubble map: one marker per city, sized and coloured by transaction value.
# The frame is loaded from the grouped-city export because that is the file
# carrying the latitude/longitude columns used below; no earlier cell
# defines a frame with these columns.
# NOTE(review): confirm output/Grouped_City.csv is the intended data source.
grouped_city_df = pd.read_csv('output/Grouped_City.csv')

fig = px.scatter_mapbox(grouped_city_df, lat="location_latitude", lon="location_longitude",
                        hover_name='English_City',
                        color="Value of Transactions",
                        size="Value of Transactions", zoom=4,
                        color_continuous_scale=px.colors.sequential.Blugrn, size_max=30)

fig.show()

In [106]:
import plotly.io as pio
import plotly.express as px

# Bar chart of transaction value per city, with a dropdown that switches the
# aggregation function applied to the values.

# Read the file written by the cleansing step, relative to the working
# directory, instead of a path that only exists on one machine.
df = pd.read_csv('output/cities_df.csv')
df = df.sort_values('Value of Transactions')

aggs = ["count","sum","avg","median","mode","rms","stddev","min","max","first","last"]

# One dropdown button per aggregation; each restyles the function of the
# first aggregation in the trace's first transform.
agg_func = [
    dict(
        args=['transforms[0].aggregations[0].func', name],
        label=name,
        method='restyle'
    )
    for name in aggs
]

# `mode` is a scatter-trace attribute and has no meaning on a bar trace, so
# it is not set here.
data = [dict(
  type = 'bar',
  x = df['Arabic_City'],
  y = df['Value of Transactions'],
  marker = dict(color = 'rgb(68, 68, 68)'),
  transforms = [dict(
    type = 'aggregate',
    aggregations = [dict(
        target = 'y', func = 'sum', enabled = True)
    ]
  )]
)]

layout = dict(
  title = 'قيمة العمليات لكل مدينة',
  xaxis = dict(title = 'المدينة'),
  yaxis = dict(title = 'قيمةالعمليات'),
  updatemenus = [dict(
        x = 0.85,
        y = 1.15,
        xref = 'paper',
        yref = 'paper',
        yanchor = 'top',
        # index 1 in `aggs` is "sum", matching the trace's initial func
        active = 1,
        showactive = False,
        buttons = agg_func
  )]
)

fig_dict = dict(data=data, layout=layout)

# validate=False: the figure is passed through as a plain dict without
# plotly's schema validation.
pio.show(fig_dict, validate=False)