# lat long stuff

In [40]:
from geopy.geocoders import Nominatim

In [41]:
geolocator = Nominatim()

In [43]:
location = geolocator.reverse("48.8588443, 2.2943506")

In [47]:
location.raw

{'place_id': '241370569',
 'licence': 'Data © OpenStreetMap contributors, ODbL 1.0. https://osm.org/copyright',
 'osm_type': 'relation',
 'osm_id': '5489555',
 'lat': '48.8561357',
 'lon': '2.29782184060949',
 'display_name': 'Champ de Mars, Place Jacques Rueff, Gros-Caillou, 7e, Paris, Île-de-France, France métropolitaine, 75007, France',
 'address': {'attraction': 'Champ de Mars',
  'road': 'Place Jacques Rueff',
  'suburb': 'Gros-Caillou',
  'city_district': '7e',
  'city': 'Paris',
  'county': 'Paris',
  'state': 'Île-de-France',
  'country': 'France',
  'postcode': '75007',
  'country_code': 'fr'},
 'boundingbox': ['48.8522459', '48.8600801', '2.2918711', '2.3037935']}

# agg

In [1]:
import pandas as pd
import numpy as np

In [2]:
import matplotlib.pyplot as plt
plt.rcParams['figure.figsize'] = [16, 6]
plt.style.use('ggplot')

In [3]:
from concurrent.futures import ProcessPoolExecutor, as_completed
from fastprogress import progress_bar

def parallel(func, job_list, n_jobs=14):
    with ProcessPoolExecutor(max_workers=n_jobs) as pool:
        futures = [pool.submit(func, job) for job in job_list]
        for f in progress_bar(as_completed(futures), total=len(job_list)):
            pass
    return [f.result() for f in futures]

In [4]:
import gc

In [5]:
from warnings import filterwarnings
filterwarnings("ignore")

In [6]:
import datetime as dt
def to_dt(x):
    return dt.datetime.fromtimestamp(int(x/1000))

In [7]:
from multiprocessing import Pool
num_partitions = 100
num_cores = 12
def parallelize_dataframe(df, func):
    df_split = np.array_split(df, num_partitions)
    pool = Pool(num_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [8]:
sessions = pd.read_pickle('../session.pkl')

In [9]:
sessions.sort_values(['system_time', 'user_id_hash'], inplace=True)

In [10]:
sessions.user_created_timestamp = sessions.user_created_timestamp.apply(to_dt)

In [11]:
sessions.drop(['week_start', 'device_id', 'user_days'], inplace=True, axis=1)

In [12]:
sessions.country.unique().shape

(229,)

In [13]:
sessions.region.unique().shape

(756,)

In [14]:
sessions.region.value_counts()[160:170]

o      4007
alx    3964
05     3924
62     3862
wy     3840
pm     3806
85     3734
83     3709
sh     3662
lma    3571
Name: region, dtype: int64

In [15]:
region_remove = set([k for k, v in dict(sessions.region.value_counts()).items() if v <= 1000])

In [16]:
len(region_remove)

471

In [17]:
sessions.region = sessions.region.cat.add_categories(['unk'])

In [18]:
sessions.loc[sessions.region.isin(region_remove), ['region']] = 'unk'

In [19]:
sessions.region = sessions.region.cat.remove_unused_categories()

In [20]:
sessions.region.value_counts()[:10]

eng    328237
tx     238275
ca     218335
fl     141941
00     129040
ny     128095
il     126364
ga     125351
?      120463
unk    111717
Name: region, dtype: int64

In [21]:
sessions.city.unique().shape

(31541,)

In [22]:
sessions.city.value_counts()[400:410]

independence      1607
belo horizonte    1599
newport news      1594
durban north      1591
chula vista       1589
gaborone          1585
hull              1584
meridian          1584
curitiba          1583
monroe            1576
Name: city, dtype: int64

In [23]:
city_remove = set([k for k, v in dict(sessions.city.value_counts()).items() if v <= 1000])

In [24]:
len(city_remove)

30899

In [25]:
sessions.city = sessions.city.cat.add_categories(['unk'])
sessions.loc[sessions.city.isin(city_remove), ['city']] = 'unk'
sessions.city = sessions.city.cat.remove_unused_categories()

In [26]:
sessions.city.unique().shape

(659,)

In [27]:
sessions['lat_long'] = list(zip(sessions.latitude, sessions.longitude))

In [28]:
sessions.drop(['longitude', 'longitude'], inplace=True, axis=1)

In [29]:
sessions.locale.unique().shape

(1760,)

In [30]:
sessions.locale.value_counts()[120:130]

zh-Hans-CN_CN    1243
zh_HK            1232
ko_KR            1224
es_CR            1221
en-NO_NO         1189
en-SI_SI         1172
en-RS_RS         1142
en-BG_BG         1055
en-QA_QA         1029
en_DE            1024
Name: locale, dtype: int64

In [31]:
locale_remove = set([k for k, v in dict(sessions.locale.value_counts()).items() if v <= 1000])

In [32]:
sessions.locale = sessions.locale.cat.add_categories(['unk'])
sessions.loc[sessions.locale.isin(locale_remove), ['locale']] = 'unk'
sessions.locale = sessions.locale.cat.remove_unused_categories()

In [33]:
sessions.locale.unique().shape

(134,)

In [34]:
sessions.dtypes

session_id                          category
start_timestamp                        int64
timezone                            category
timezone_offset                     category
previous_sessions_duration             int64
user_created_timestamp        datetime64[ns]
is_user_first_session                   bool
country                             category
region                              category
city                                category
latitude                             float64
locale                              category
os_name                             category
session_index                          int64
user_id_hash                        category
user_time                     datetime64[ns]
system_time                   datetime64[ns]
lat_long                              object
dtype: object

In [35]:
def custom_aggregate(df, t='anchor_date'):
    grp_cols = [t, 'user_id_hash']
    grpby_obj = df.groupby(grp_cols)

    # number of sessions
    temp = grpby_obj.city.count().to_frame().reset_index().\
                rename(columns={'city': 'no_sessions'})

    # number of cities
    temp1 = grpby_obj.city.agg(lambda x: x.unique().shape[0])\
                     .to_frame().reset_index()\
                     .rename(columns={'city': 'no_city'})

    # last city
    temp2 = grpby_obj.city.last().to_frame().reset_index()\
                     .rename(columns={'city': 'last_city'})
    
    # last country
    temp3 = grpby_obj.country.last().to_frame().reset_index()\
                     .rename(columns={'country': 'last_country'})

    # last region
    temp4 = grpby_obj.region.last().to_frame().reset_index()\
                     .rename(columns={'region': 'last_region'})

    # no of region
    temp5 = grpby_obj.region.agg(lambda x: x.unique().shape[0])\
                     .to_frame().reset_index()\
                     .rename(columns={'region': 'region_count'})

    # last locale
    temp6 = grpby_obj.locale.last().to_frame().reset_index()\
                     .rename(columns={'locale': 'last_locale'})

    # agg session time
    temp7 = grpby_obj.previous_sessions_duration.sum()\
                      .to_frame().reset_index()\
                      .rename(columns={'previous_sessions_duration': 'total_session_time'})

    # mean session time
    temp8 = grpby_obj.previous_sessions_duration.mean().\
                to_frame().reset_index().\
                rename(columns={'previous_sessions_duration': 'mean_session_time'})

    # mean session time
    temp9 = grpby_obj.previous_sessions_duration.median().\
                to_frame().reset_index().\
                rename(columns={'previous_sessions_duration': 'median_session_time'})

    # last lat_long
    temp10 = grpby_obj.lat_long.last().to_frame().reset_index()\
                     .rename(columns={'lat_long': 'last_lat_long'})
    
    for i in range(1, 11):
        temp = temp.merge(locals()[f'temp{i}'], 
                          on=grp_cols)
    return temp

In [36]:
def wrapper_agg(wst):
    sessions_ss = sessions[sessions['anchor_date'] == wst].copy()
    return custom_aggregate(sessions_ss)

In [37]:
sessions.reset_index(inplace=True, drop=True)

In [38]:
sessions.sort_values(['system_time', 'user_id_hash'], inplace=True)

In [37]:
# for i in range(7):
#     print((sessions.system_time -\
#           sessions.system_time.dt.weekday.apply(lambda x: x + i
#                                                 ).astype('timedelta64[D]')
#           ).dt.date.unique())

In [39]:
sessions.shape

(5165760, 18)

In [45]:
(sessions.system_time - sessions.system_time.dt.weekday\
                                .apply(lambda x: dt.timedelta(days=(x+3) % 7)))\
    .dt.date.value_counts()

2018-11-23    618447
2018-11-16    618051
2018-11-09    617304
2018-11-02    595912
2018-10-26    550508
2018-10-19    540153
2018-10-12    451345
2018-11-30    406861
2018-10-05    345490
2018-12-07    289887
2018-09-28     97709
2018-12-14     34093
Name: system_time, dtype: int64

In [47]:
results = []
for i in range(7):
    sessions['anchor_date'] = (sessions.system_time - \
                               sessions.system_time.dt.weekday\
                                       .apply(lambda x: dt.timedelta(days=(x+i) % 7)))\
                                       .dt.date

    sessions.anchor_date = sessions.anchor_date.astype('datetime64[ns]')
    print(f'{i}')
    result = parallel(wrapper_agg, sessions.anchor_date.unique())
    result = pd.concat(result)
    results += [result]

0


1


2


3


4


5


6


In [48]:
result = pd.concat(results)

In [49]:
result.shape

(6976625, 13)

In [50]:
result.head()

Unnamed: 0,anchor_date,user_id_hash,no_sessions,no_city,last_city,last_country,last_region,region_count,last_locale,total_session_time,mean_session_time,median_session_time,last_lat_long
0,2018-10-01,000062e9be78f3da274fec338e78f89d12000e781967f2...,5,1,unk,US,ca,1,en_US,4730609,946121.8,1176899.0,"(37.28716659545898, -121.94995880126952)"
1,2018-10-01,00026e5050a70ef12d421f75c6a5c80d0f62d37acab8bd...,2,1,unk,US,ga,1,en_US,403432,201716.0,201716.0,"(33.751495361328125, -84.74771118164062)"
2,2018-10-01,0003f8bda56230a49445880e559b718e94ba37344494a7...,2,1,unk,US,ky,1,en_US,150138,75069.0,75069.0,"(36.60725784301758, -83.71428680419922)"
3,2018-10-01,000542d6fc6e9dcb83328d30503a7e022e5a6b4ea6357b...,1,1,omaha,US,ne,1,en_US,0,0.0,0.0,"(41.25653839111328, -95.93450164794922)"
4,2018-10-01,00059c820dca1ccadfd45361c40f68b656df8a5212b993...,1,1,rio de janeiro,BR,rj,1,pt-BR_BR,123131,123131.0,123131.0,"(-22.90684700012207, -43.17289733886719)"


In [44]:
result.tail()

Unnamed: 0,anchor_date,user_id_hash,no_sessions,no_city,last_city,last_country,last_region,region_count,last_locale,total_session_time,mean_session_time,median_session_time
29516,2018-12-04,fff344ce906c9c4bec2f3ec1b7497f03d071c22b118892...,4,1,jakarta,ID,jk,1,in_ID,568190667,142047700.0,142175260.5
29517,2018-12-04,fff51a2561e713b5908da891f97da981f815b3bacfd67c...,19,1,unk,BR,mg,1,pt_BR,2389490765,125762700.0,126621278.0
29518,2018-12-04,fff78ff8c11a4d3b9370f88d7117229322e137e8083aa7...,2,2,charlotte,US,nc,1,es_US,10504583,5252292.0,5252291.5
29519,2018-12-04,fff8c078f8a874679661cdc5dddfacf64fa1bdd14c5b62...,23,1,lagos,NG,la,1,en_US,9054280346,393664400.0,392921913.0
29520,2018-12-04,fffdb3fdf921294adb7e5db84d846ed79dd2395bcf006f...,1,1,unk,US,tn,1,en-US_US,88274983,88274980.0,88274983.0


In [51]:
result.dtypes

anchor_date            datetime64[ns]
user_id_hash                   object
no_sessions                     int64
no_city                         int64
last_city                      object
last_country                   object
last_region                    object
region_count                    int64
last_locale                    object
total_session_time              int64
mean_session_time             float64
median_session_time           float64
last_lat_long                  object
dtype: object

In [52]:
result.to_pickle('../session_roll_agg.pkl')