In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

%matplotlib inline

pd.set_option('display.float_format', lambda x: '%.6f' % x)

In [2]:
df_train_b1_se_f2 = pd.read_csv("../data/b1/df_train_b1_se_f2.csv", index_col=0)

In [3]:
df = df_train_b1_se_f2.copy()
df_wap = df_train_b1_se_f2.loc[:, :'WAP503']

In [262]:
wap_in_use = [i for i in df_wap if df_wap[i].nunique() != 1]
print("There are %d WAPs used." % len(wap_in_use))

df_wap = df_wap[wap_in_use]

There are 90 WAPs used.


In [263]:
# number of samples in each room
df.groupby(['LOC', 'LATITUDE', 'LONGITUDE']).size()

LOC  LATITUDE        LONGITUDE   
196  4864815.057400  -7469.889200    20
199  4864834.110161  -7469.328742    20
200  4864820.413200  -7466.963700    24
204  4864852.046477  -7461.524049     7
205  4864810.030100  -7460.767900    20
207  4864815.285700  -7457.755500    20
209  4864831.290900  -7457.927200    20
210  4864852.084300  -7458.264200    29
212  4864853.930300  -7457.242100    38
214  4864828.604900  -7453.059500    20
dtype: int64

In [264]:
# number of unique visitors in each room
unique_users = pd.DataFrame(df.groupby(['LOC', 'LATITUDE', 'LONGITUDE', 'USERID']).size()).reset_index()

"""
unique_users['GPS'] = '(' + round(unique_users['LATITUDE'], 6).astype(str) + ', ' \
                       + round(unique_users['LONGITUDE'], 6).astype(str) + ')'
"""
# unique_users.to_csv("unique_users.csv")

"\nunique_users['GPS'] = '(' + round(unique_users['LATITUDE'], 6).astype(str) + ', '                        + round(unique_users['LONGITUDE'], 6).astype(str) + ')'\n"

In [283]:
from collections import defaultdict

common_waps = defaultdict(dict)
wap_names = defaultdict(dict)
sample_counts = dict(df.groupby(['LOC']).size())
print("For users who have been to floor 2")
for loc in np.unique(df['LOC']):
    n_samples = sample_counts[loc]
    data = df_wap.loc[df[df['LOC'] == loc].index]
    cols = np.where(data.astype(bool).astype(int).sum(axis=0) == n_samples)[0]
    wap_names[loc] = np.asarray(data.columns[cols])
    common_waps[loc] = [n_samples, data.columns[cols]]
    print("Location [%s] has [%2d] repeated WAPs in all [%2d] samples" 
          % (loc, len(cols), n_samples))

For users who have been to floor 2
Location [196] has [20] repeated WAPs in all [20] samples
Location [199] has [12] repeated WAPs in all [20] samples
Location [200] has [14] repeated WAPs in all [24] samples
Location [204] has [16] repeated WAPs in all [ 7] samples
Location [205] has [12] repeated WAPs in all [20] samples
Location [207] has [13] repeated WAPs in all [20] samples
Location [209] has [11] repeated WAPs in all [20] samples
Location [210] has [ 1] repeated WAPs in all [29] samples
Location [212] has [ 6] repeated WAPs in all [38] samples
Location [214] has [12] repeated WAPs in all [20] samples


In [397]:
wap_usage = set([wap for ls in wap_names.values() for wap in ls])
print("There are %d repeated WAPs." % len(wap_usage))

wap_usage = pd.DataFrame(0, index=wap_usage, columns=wap_names.keys())

for loc, wap in wap_names.items():
    wap_usage.loc[wap, loc] = 1

wap_usage.sort_index(inplace=True)
# wap_usage.to_csv("../data/b1/wap_usage_b1_se.csv")

There are 34 repeated WAPs.


In [398]:
def highlight_cell(val):
    color = 'yellow' if val == 1 else ''
    return 'background-color: %s' % color

wap_usage.style.applymap(highlight_cell)

Unnamed: 0,196,199,200,204,205,207,209,210,212,214
WAP015,1,0,0,1,0,0,0,0,0,0
WAP016,1,0,1,1,0,0,0,0,0,0
WAP082,1,0,0,0,0,0,0,0,0,0
WAP083,1,0,1,0,0,1,0,0,0,0
WAP085,0,0,0,0,0,1,0,0,0,0
WAP109,1,1,1,0,1,0,1,0,0,1
WAP110,1,1,1,0,1,1,1,0,0,1
WAP111,1,0,1,0,1,1,0,0,0,1
WAP112,1,0,1,0,1,1,1,0,0,0
WAP115,0,0,0,1,0,0,0,0,1,0


In [323]:
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer, balanced_accuracy_score

clf = SVC(kernel='linear', C=1)
# clf = MultinomialNB()
cvs = cross_val_score(clf, df_wap.astype(bool), df['LOC'], scoring=make_scorer(balanced_accuracy_score), cv=10)
# print("Average accuracy: %.2f%% Â± %.2f%%" % (mean_ci(cvs)[0], mean_ci(cvs)[1]))

