# Passos necessários para configurar o Spark

In [1]:
# Lets Jupyter load Spark correctly inside the notebook.
# findspark.init must run before anything is imported from pyspark.
import findspark
findspark.init('/usr/local/spark')
# findspark.init('/usr/lib/spark')

# Gives the executors more memory so they do not fail for lack of resources.
# This must be set before the SparkContext is created.
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--executor-memory 1G pyspark-shell'

# From here on this is Spark code that would normally be run with a command like:
# spark-submit --executor-memory 1G nome_do_script.py
from pyspark import SparkConf, SparkContext

# The line below is commented out because it is the way to run Spark on a
# local installation using all cores.
# conf = SparkConf().setMaster("local[*]").setAppName("NomeDoApp")

conf = SparkConf()
# getOrCreate returns the already-running context when this cell is executed
# again, instead of raising "Cannot run multiple SparkContexts at once".
sc = SparkContext.getOrCreate(conf=conf)

# Importando as libs necessárias para o projeto

In [2]:
import pandas as pd
from s3fs import S3FileSystem

# S3 filesystem handle opened anonymously (anon=True): no credentials are supplied.
s3 = S3FileSystem(anon=True)

# Carregando datasets

In [3]:
# Read the visits sample CSV from S3 into a pandas DataFrame.
file_name = 'data-eng-t2-school/projetos/problema-06/dados/problema-06-sample-01.csv'

with s3.open(file_name, 'rb') as visits_file:
    df_visits = pd.read_csv(visits_file)

In [4]:
# Read the stores (estabelecimentos) CSV from S3 into a pandas DataFrame.
file_name = 'data-eng-t2-school/projetos/problema-06/dados/problema-06_estabelecimentos.csv'

with s3.open(file_name, 'rb') as stores_file:
    df_stores = pd.read_csv(stores_file)

In [5]:
# Print a summary of df_stores: index range, columns, non-null counts and dtypes.
df_stores.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 3 columns):
lat         10 non-null float64
lng         10 non-null float64
category    10 non-null object
dtypes: float64(2), object(1)
memory usage: 320.0+ bytes


# Pré-processamento dos datasets

In [6]:
# Trim leading/trailing whitespace from the category labels.
# The vectorised .str accessor replaces the per-element lambda and propagates
# missing values as NaN instead of raising AttributeError on them.
df_stores['category'] = df_stores['category'].str.strip()

In [7]:
# Day of the week of each visit as an integer (Monday=0 ... Sunday=6),
# taken from the parsed timestamp with the vectorised .dt accessor.
df_visits['weekday'] = pd.to_datetime(df_visits['timestamp']).dt.dayofweek

In [8]:
# Inner-join visits with stores on the coordinate pair, so only visits whose
# (lat, lng) match a store row are kept.
result = pd.merge(
    left=df_visits,
    right=df_stores,
    on=['lat', 'lng'],
    how='inner',
)

In [13]:
# Number of rows in `result` for each user_id.
count_series = result.groupby('user_id').size()

# The 10 users with the highest counts, as a frame with columns
# ['user_id', 'size']. `top_users` must be defined here: the next cell filters
# `result` with `top_users.user_id`, and it previously existed only in
# commented-out code, so a fresh kernel run failed with NameError.
top_users = count_series.nlargest(10).to_frame(name='size').reset_index()

user_id
017c9a02-2f40-484b-b8d0-6f57b9de54c8    650
022ca9f6-9b68-4f1a-9312-356a82298c4c    661
062cdf6b-09db-499a-a2be-8daf152f5b86    684
0afb6fa7-a13e-42ec-bc62-2b29797049d0    648
0e17515a-0d27-4685-8563-c806cb1fe98c    666
11d551b6-13d7-451f-9587-e1ff10bd0a0b    659
15c64998-8792-4d45-b699-e6b3214a4f77    635
20a5ea38-b929-4d63-a033-dd6bfd7427f1    687
232c8c98-e1a6-41e3-b038-a4d6faf85cf6    643
238900a8-d2f9-4fd3-a023-bd4639d51920    687
2400f385-fba3-40d3-848b-98edd2e97dd7    639
25c1cbdc-f766-472b-85e1-366e36a62445    658
2716e6af-06b8-45f7-abf8-18d59edde85e    604
271c70d7-b918-4729-bae0-cc53f78884f7    675
322de2aa-d0df-43d6-af3c-da3ee2641e06    708
34cf6471-f426-4d3f-ad18-ca850ef22543    621
36ad8fff-c857-48c3-8e8a-af4470bb1247    657
36bde086-622e-4f09-b5d3-81c38f7a8d90    654
37c705e7-138d-44f5-a12b-0ec011569a8a    664
380b090b-24a1-4e2a-9e79-5297ffb911f8    716
3a216152-1325-4e2c-9ab6-14b1ce633330    636
3db29280-7e5e-4ae1-a896-45e9b184145d    655
3f668086-64f7-4592-996b-

In [14]:
# Keep only the rows of `result` that belong to one of the selected top users.
is_top_user = result['user_id'].isin(top_users['user_id'])
top_results = result[is_top_user]

In [15]:
# Count rows per (user_id, category, weekday) combination; the result is a
# Series indexed by those three keys.
group_keys = ['user_id', 'category', 'weekday']
grouped_results = top_results[group_keys].groupby(group_keys).size()

In [16]:
# Print the grouped counts with the row limit raised to 1000 so pandas does
# not truncate the listing. (The bare `grouped_results.values` statement that
# used to precede this was removed: its value was discarded.)
with pd.option_context('display.max_rows', 1000, 'display.max_columns', 3):
    print(grouped_results)
# Last expression of the cell: shows the type of the aggregation result.
type(grouped_results)

user_id                               category       weekday
322de2aa-d0df-43d6-af3c-da3ee2641e06  'clothing'     0          17
                                                     1           9
                                                     2          17
                                                     3          12
                                                     4          14
                                                     5          16
                                                     6          11
                                      'electronics'  0          17
                                                     1          16
                                                     2          22
                                                     3          18
                                                     4          23
                                                     5          11
                                                     6          18
 

pandas.core.series.Series

In [None]:
# Import pyplot
import matplotlib.pyplot as plt

# Use an explicit Figure/Axes pair and label the plot so it can be read on its
# own; the data plotted is the per-(user_id, category, weekday) count Series.
fig, ax = plt.subplots()
grouped_results.plot(ax=ax)
ax.set(
    title='Visits per user, category and weekday (top users)',
    xlabel='(user_id, category, weekday)',
    ylabel='Number of visits',
)
plt.show()