# MAC Address Feature Cleaning

This notebook aims to understand the MAC addresses by using a clustering algorithm (k-means) to determine whether each MAC address is stationary or moving (i.e. a shopper).

In [14]:
import pandas as pd
import numpy as np

from sklearn.cluster import KMeans 

%matplotlib inline
import matplotlib
import matplotlib.pyplot as plt
import matplotlib.image as mpimg
import matplotlib.cm as cm
matplotlib.style.use('ggplot')

import mac_address_features as maf

In [2]:
# Load IPython's autoreload extension; mode 2 re-imports edited modules before
# each cell runs, so changes to local helper modules are picked up live.
%load_ext autoreload
%autoreload 2

## Import the data and create the MAC address data frame

In [4]:
# Load the signal records for a single mall, then derive the mac address
# frame from them. The feature cells below add their columns to that frame.
mall_name = 'Mall of Mauritius'
signal_df = maf.import_data(mall_name)
mac_address_df = maf.create_mac_address_df(signal_df)

Wall time: 7.65 s


In [5]:
# Report the size of both frames: the row count of the signal frame and the
# row count of the mac address frame.
n_signals = len(signal_df)
n_mac_addresses = len(mac_address_df)
print('{} signals'.format(n_signals))
print('{} mac addresses'.format(n_mac_addresses))

2762923 signals
154152 mac addresses


## Add the features

In [6]:
%%time
# Add the 'manufacturer' column from the helper module's device-type lookup.
# The info() summary further down shows this column is null for most rows.
# NOTE(review): presumably derived from the mac address itself -- confirm in
# mac_address_features.find_device_type.
mac_address_df['manufacturer'] = maf.find_device_type(mac_address_df)

Wall time: 342 ms


In [7]:
%%time
# Add the 'gyration' column. Only element [1] of the helper's return value is
# kept; this was the slowest feature in the recorded run (about 4 minutes).
# NOTE(review): what element [0] holds is not visible from this notebook --
# check mac_address_features.calculate_radius_gyration.
mac_address_df['gyration'] = maf.calculate_radius_gyration(signal_df, mac_address_df)[1]

Wall time: 4min 18s


In [8]:
%%time
# Add the 'cdv' (count density variance) column. The info() summary further
# down shows it is null for most rows.
# NOTE(review): minute_resolution is passed as the string '15', presumably
# meaning 15-minute bins -- confirm the helper expects a string, not an int.
mac_address_df['cdv'] = maf.count_density_variance(signal_df, mac_address_df, minute_resolution='15')

Wall time: 3min 30s


In [9]:
%%time
# Add the 'length_of_stay' column (stored as int64 per the info() summary).
# NOTE(review): the unit of the value is not visible here -- confirm in
# mac_address_features.calculate_length_of_stay.
mac_address_df['length_of_stay'] = maf.calculate_length_of_stay(signal_df, mac_address_df)

Wall time: 1min 13s


In [10]:
%%time
# Add the 'is_out_of_hours' column (stored as int64 per the info() summary).
# NOTE(review): presumably a 0/1 flag for devices seen outside opening hours
# -- confirm against mac_address_features.is_out_of_hours.
mac_address_df['is_out_of_hours'] = maf.is_out_of_hours(signal_df, mac_address_df)

Wall time: 4.03 s


In [11]:
# Summarise the assembled feature table: dtype and non-null count per column.
mac_address_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 154152 entries, 0 to 154151
Data columns (total 7 columns):
count              154152 non-null int64
mac_address        154152 non-null object
manufacturer       49037 non-null object
gyration           154152 non-null float64
cdv                41365 non-null float64
length_of_stay     154152 non-null int64
is_out_of_hours    154152 non-null int64
dtypes: float64(2), int64(3), object(2)
memory usage: 8.2+ MB


In [12]:
# Write the feature table to a CSV file in the working directory.
# index=False leaves the row index out of the file.
features_csv = 'mac_address_features.csv'
mac_address_df.to_csv(features_csv, index=False)

## Cluster Analysis

In [None]:
# Keep only the rows where every clustering feature is present. A bare
# dropna() would also discard every row whose 'manufacturer' is missing (most
# of the table, per the info() summary above), even though that column is not
# one of the features the clustering runs on.
mac_address_clean_df = mac_address_df.dropna(subset=['gyration', 'cdv', 'length_of_stay'])

In [None]:
# Feature matrix for k-means; columns are, in order: gyration, cdv,
# length_of_stay. DataFrame.as_matrix() was deprecated in pandas 0.23 and
# removed in 1.0, so select the columns and take .values, which yields the
# same ndarray on both old and new pandas versions.
samples = mac_address_clean_df[['gyration', 'cdv', 'length_of_stay']].values

In [None]:
# Fit k-means with three clusters. random_state is pinned because the centroid
# initialisation is random: without it the cluster numbering (and therefore
# the colours in the scatter plot) can change from one run to the next.
# NOTE(review): the features are fed in unscaled. k-means is distance-based,
# so confirm that the differing feature ranges are intended.
model = KMeans(n_clusters=3, random_state=0)
model.fit(samples)

In [None]:
# Cluster index assigned by the fitted model to each row of `samples`.
labels = model.predict(samples)
# Bare expression so the notebook shows the array as the cell output.
labels

In [None]:
# 2x2 grid of panels; only the top-left panel is filled in by this cell.
fig, axes = plt.subplots(nrows=2, ncols=2, figsize=(20, 12))

# `samples` columns are, in order: gyration, cdv, length_of_stay.
xs = samples[:, 0]  # gyration
ys = samples[:, 1]  # cdv

# Scatter of the first two features, coloured by k-means cluster label.
# The title and axis labels used to describe a "signals per manufacturer"
# chart, which is not what this panel draws; they now name the plotted data.
axes[0][0].scatter(xs, ys, c=labels)
axes[0][0].set_title('K-means clusters: gyration vs. cdv')
axes[0][0].set_xlabel('Gyration')
axes[0][0].set_ylabel('Count density variance (cdv)');