In [2]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans, SpectralClustering
from sklearn.mixture import GaussianMixture, BayesianGaussianMixture
from sklearn.preprocessing import scale

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.cm as cm
import os
matplotlib.style.use('ggplot')

import seaborn as sns

In [8]:
# Load IPython's autoreload extension and use mode 2, which re-imports all
# modules before each cell executes, so edits to project code are picked up
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
from msci.utils import utils

In [41]:
# Columns read from the raw signal CSV (passed to pd.read_csv via usecols).
COLUMNS_TO_IMPORT = [
    'mac_address',
    'date_time',
    'location',
    'store_id',
    'x',
    'y',
    'wifi_type',
    'email',
]

In [42]:
# Read the raw signal export, keeping only the columns in COLUMNS_TO_IMPORT.
# The data directory is no longer tied to one user's absolute path: it can be
# set with the MSCI_DATA_DIR environment variable and otherwise defaults to
# the repository checkout under the current user's home directory (the same
# location as before on the original machine).
shopper_csv_dir = os.environ.get(
    'MSCI_DATA_DIR',
    os.path.join(
        os.path.expanduser('~'),
        'Documents', 'Repos', 'msci-complexity-project', 'msci', 'data',
    ),
)
shopper_df = pd.read_csv(
    os.path.join(shopper_csv_dir, 'bag_mus_12-22-2016.csv'),
    usecols=COLUMNS_TO_IMPORT,
)

In [43]:
# Distinct values of the location column, in order of first appearance.
shopper_df['location'].drop_duplicates().tolist()

['Mall of Mauritius', 'Home & Leisure', 'Phoenix Mall']

In [44]:
# Load the device-level table through the project helper. Judging by the
# preview output it holds per-MAC features (frequency, centroid,
# radius_of_gyration, ...) — NOTE(review): confirm schema in msci.utils.utils.
mac_address_df = utils.import_mac_addresses()

In [4]:
# Preview the first rows of the device table.
mac_address_df.head()

Unnamed: 0,frequency,mac_address,centroid,radius_of_gyration,manufacturer,count_density_variance,length_of_stay,is_out_of_hours,av_speed,av_turning_angle,total_turning_angle,av_turning_angle_velocity,av_path_length,total_path_length,av_straightness,av_speed_from_total,turning_angle_density,wifi_type
0,4344,bc:20:10:8c:fe:05,[ 248.77371087 52.52286679],12.025153,,10.071358,86335,1,0.825502,2.020544,8561.045646,0.006382,9.778814,42469.389503,2.982753,0.491914,0.201582,unknown
1,4078,38:ff:36:2d:f1:88,[ 295.51814615 29.36096126],16.875408,Ruckus Wireless,6.927397,86335,1,0.888605,1.945714,6516.196923,-0.029064,11.785791,48050.668616,3.281043,0.556561,0.135611,Discovered-AP
2,4024,9c:99:a0:07:5e:a9,[ 254.38680417 53.64417661],10.12416,Xiaomi Communications Co Ltd,11.369713,86335,1,0.848888,2.027772,8060.39519,0.000939,9.815611,39488.202654,2.854801,0.457383,0.204122,unknown
3,3972,c4:f0:81:19:be:fc,[ 242.23308996 52.00499329],8.260927,"HUAWEI TECHNOLOGIES CO.,LTD",10.447394,86284,1,0.787383,2.096552,8195.419843,0.00184,9.093106,36108.722719,2.904129,0.418487,0.226965,unknown
4,3876,e8:de:27:5e:bf:99,[ 197.38132095 17.06037152],49.939965,"TP-LINK TECHNOLOGIES CO.,LTD.",6.613266,86089,1,2.490503,1.439755,5564.654963,-0.000188,35.829046,138837.554597,3.39485,1.612721,0.04008,Discovered-AP


In [5]:
# Load the signal table using the helper's default arguments (no mall or v1
# flag given) — NOTE(review): the preview shows Mall of Mauritius rows, so the
# default presumably selects that mall; confirm in utils.import_signals.
signal_df = utils.import_signals()

In [6]:
# Preview the first rows of the signal table.
signal_df.head()

Unnamed: 0,mac_address,wifi_type,date_time,email,store_id,location,x,y
0,fe:55:36:4e:bd:83,lawifiuser,2016-12-22 00:00:04,unknown,,Mall of Mauritius,151.0,18.0
85,dc:cf:96:e8:01:53,unknown,2016-12-22 00:00:04,unknown,A165,Mall of Mauritius,252.0,93.0
84,dc:d9:16:77:4d:a5,unknown,2016-12-22 00:00:04,unknown,,Mall of Mauritius,111.0,111.0
83,dc:ee:06:c5:51:3c,unknown,2016-12-22 00:00:04,unknown,P0001,Mall of Mauritius,22.0,26.0
82,e8:3a:12:23:c3:33,unknown,2016-12-22 00:00:04,unknown,A141A,Mall of Mauritius,151.0,67.0


In [7]:
# Reduce the signals to device id, timestamp and position. DataFrame.filter
# with items= selects columns by label and returns a new frame.
filter_df = signal_df.filter(items=['mac_address', 'date_time', 'x', 'y'])

In [8]:
# Preview the reduced signal table.
filter_df.head()

Unnamed: 0,mac_address,date_time,x,y
0,fe:55:36:4e:bd:83,2016-12-22 00:00:04,151.0,18.0
85,dc:cf:96:e8:01:53,2016-12-22 00:00:04,252.0,93.0
84,dc:d9:16:77:4d:a5,2016-12-22 00:00:04,111.0,111.0
83,dc:ee:06:c5:51:3c,2016-12-22 00:00:04,22.0,26.0
82,e8:3a:12:23:c3:33,2016-12-22 00:00:04,151.0,67.0


In [4]:
# Load the signals for each of the three malls with v1=True.
v1_mm_df = utils.import_signals(mall='Mall of Mauritius', v1=True)
v1_p_df = utils.import_signals(mall='Phoenix Mall', v1=True)
v1_hl_df = utils.import_signals(mall='Home & Leisure', v1=True)

# Distinct devices (MAC addresses) observed in each mall.
v1_mm_macs = v1_mm_df['mac_address'].drop_duplicates().tolist()
v1_p_macs = v1_p_df['mac_address'].drop_duplicates().tolist()
v1_hl_macs = v1_hl_df['mac_address'].drop_duplicates().tolist()

# Report signal counts for every mall first, then device counts.
for _label, _collection in (
    ('Number of Signals in Mall of Mauritius', v1_mm_df),
    ('Number of Signals in Phoenix Mall', v1_p_df),
    ('Number of Signals in Home & Leisure', v1_hl_df),
    ('Number of Devices in Mall of Mauritius', v1_mm_macs),
    ('Number of Devices in Phoenix Mall', v1_p_macs),
    ('Number of Devices in Home & Leisure', v1_hl_macs),
):
    print(_label, len(_collection))

Number of Signals in Mall of Mauritius 2762923
Number of Signals in Phoenix Mall 1617170
Number of Signals in Home & Leisure 105856
Number of Devices in Mall of Mauritius 154152
Number of Devices in Phoenix Mall 90901
Number of Devices in Home & Leisure 12555


In [28]:
# Load the signals for each mall again, this time without passing v1 (so the
# helper's default for that flag applies).
mm_df = utils.import_signals(mall='Mall of Mauritius')
p_df = utils.import_signals(mall='Phoenix Mall')
hl_df = utils.import_signals(mall='Home & Leisure')

In [29]:
# Distinct MAC addresses per mall, as plain Python lists.
mm_macs = mm_df.mac_address.drop_duplicates().tolist()
p_macs = p_df.mac_address.drop_duplicates().tolist()
hl_macs = hl_df.mac_address.drop_duplicates().tolist()

In [30]:
# Number of signal rows per mall.
for _label, _frame in (
    ('Number of Signals in Mall of Mauritius', mm_df),
    ('Number of Signals in Phoenix Mall', p_df),
    ('Number of Signals in Home & Leisure', hl_df),
):
    print(_label, len(_frame))

Number of Signals in Mall of Mauritius 2367624
Number of Signals in Phoenix Mall 1515481
Number of Signals in Home & Leisure 102668


In [31]:
# Number of distinct devices (MAC addresses) per mall.
for _label, _macs in (
    ('Number of Devices in Mall of Mauritius', mm_macs),
    ('Number of Devices in Phoenix Mall', p_macs),
    ('Number of Devices in Home & Leisure', hl_macs),
):
    print(_label, len(_macs))

Number of Devices in Mall of Mauritius 154152
Number of Devices in Phoenix Mall 90901
Number of Devices in Home & Leisure 12555


In [10]:
# Reload the signals through the project helper with v1=True.
# NOTE(review): this rebinds shopper_df, which an earlier cell filled from the
# raw CSV via pd.read_csv; from here on the name refers to the helper's output.
shopper_df = utils.import_signals(v1=True)

In [11]:
# Brings identify_duplicate_data (used in the next cell) into scope.
# NOTE(review): the wildcard import hides which names come from this module;
# prefer importing the required names explicitly once all uses are known.
from msci.cleaning.duplicate_analysis import *

In [12]:
# Run the project's duplicate detection on the full signal table.
dups = identify_duplicate_data(shopper_df)
# Element 1 of the result is used as the collection of MAC addresses that have
# duplicated records — NOTE(review): confirm the return layout in
# msci.cleaning.duplicate_analysis.
duplicate_macs = dups[1]
# Group the signals by device so a single device's rows can be pulled out.
grouped = shopper_df.groupby('mac_address')
# Rows of the first flagged device, used as a worked example in later cells.
group_dup_ex = grouped.get_group(duplicate_macs[0])

In [36]:
# For every device flagged as having duplicates, count how many consecutive
# signal pairs share exactly the same timestamp. identicals[k] is the count
# for duplicate_macs[k].
identicals = []
for mac in duplicate_macs:
    group = grouped.get_group(mac)
    # Use this device's own timestamps. The previous version read them from
    # group_dup_ex (the single example device), so every iteration recounted
    # the same rows: the totals were wrong and the loop was needlessly slow.
    # Loop-specific names are used so the `times` / `identical` values computed
    # for the example device elsewhere in the notebook are not overwritten.
    mac_times = group.date_time.tolist()
    identicals.append(
        sum(1 for earlier, later in zip(mac_times, mac_times[1:]) if earlier == later)
    )

KeyboardInterrupt: 

In [34]:
# Summary of the duplicate analysis: number of flagged devices and the total
# number of consecutive same-timestamp pairs across them.
# NOTE(review): the messages say "in Mall of Mauritius", but shopper_df was
# loaded with utils.import_signals(v1=True) without a mall argument — confirm
# that the helper's default mall matches this label.
print('Number of macs with identical time signals in Mall of Mauritius:', len(duplicate_macs))
print('Number of duplicate time signals in Mall of Mauritius:', np.sum(identicals))

Number of macs with identical time signals in Mall of Mauritius: 50585


In [13]:
# Preview the first rows of the example device with duplicated records.
group_dup_ex.head()

Unnamed: 0,mac_address,wifi_type,date_time,email,store_id,location,x,y
4485843,38:ff:36:2d:f1:88,unknown,2016-12-22 00:00:04,unknown,A102,Mall of Mauritius,295.0,39.0
4485667,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 00:00:15,unknown,B257,Mall of Mauritius,287.0,28.0
4485472,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 00:00:25,unknown,,Mall of Mauritius,283.0,22.0
4485059,38:ff:36:2d:f1:88,unknown,2016-12-22 00:00:45,unknown,B257,Mall of Mauritius,287.0,25.0
4484839,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 00:00:55,unknown,,Mall of Mauritius,283.0,21.0


In [14]:
# Positions (within the example device's rows) where a signal carries the same
# timestamp as the row immediately after it.
times = group_dup_ex['date_time'].tolist()
identical = [
    idx
    for idx, (current, following) in enumerate(zip(times, times[1:]))
    if current == following
]
# Display the first such pair of rows.
group_dup_ex.iloc[identical[0]:identical[0] + 2]

Unnamed: 0,mac_address,wifi_type,date_time,email,store_id,location,x,y
4460104,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 00:23:04,unknown,B104,Mall of Mauritius,322.0,26.0
4460105,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 00:23:04,unknown,P0010,Mall of Mauritius,322.0,26.0


In [29]:
# (x, y) position of every signal of the example device, in row order.
x = group_dup_ex.x.tolist()
y = group_dup_ex.y.tolist()
coordinates = list(zip(x,y))
# Of the same-timestamp pairs in `identical`, keep those whose two rows report
# different positions. Iterating over `identical` directly replaces a list
# membership test per row (quadratic) with a single pass; the bounds check
# keeps the same index range the original range(len(coordinates) - 1) had.
different_coordinates = [
    i for i in identical
    if i + 1 < len(coordinates) and coordinates[i] != coordinates[i+1]
]
    

In [30]:
# Show the first same-timestamp pair whose two rows have different positions.
group_dup_ex.iloc[different_coordinates[0]:different_coordinates[0]+2]

Unnamed: 0,mac_address,wifi_type,date_time,email,store_id,location,x,y
4182109,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 06:42:28,unknown,P0010,Mall of Mauritius,323.0,24.0
4182401,38:ff:36:2d:f1:88,Discovered-AP,2016-12-22 06:42:28,unknown,A259,Mall of Mauritius,279.0,17.0


In [33]:
# Restrict the example device's rows to device id, timestamp and position,
# then show the first same-timestamp pair with differing coordinates.
filter_ex = group_dup_ex.filter(items=['mac_address', 'date_time', 'x', 'y'])
filter_ex.iloc[different_coordinates[0]:different_coordinates[0]+2]

Unnamed: 0,mac_address,date_time,x,y
4182109,38:ff:36:2d:f1:88,2016-12-22 06:42:28,323.0,24.0
4182401,38:ff:36:2d:f1:88,2016-12-22 06:42:28,279.0,17.0


In [38]:
# Share of the example device's same-timestamp pairs whose two rows report
# different (x, y) positions. Raises ZeroDivisionError if `identical` is empty.
print('Fraction of different coordinates:', len(different_coordinates)/len(identical))

Fraction of different coordinates: 0.12244897959183673
