## Item (CDN) Features Preprocessing

**Input:** Raw data tables/logs (hidden for future research purposes)

**Output:** Array of CDN feature vectors in one-hot format

**Features used:**
* Unique CDN code
* CDN type (free/self-built/commercial)
* Number of CDN IP addresses (binned by quartile)

In [2]:
import numpy as np
import pandas as pd
import scipy.sparse as sp
import csv

In [3]:
import pickle

# Restore the interactions object and the four lookup tables
# (iidx <-> CDN code, uidx <-> ICP) that were pickled together as one tuple.
# NOTE(review): pickle.load can execute arbitrary code -- only load trusted files.
interactions_pickle = '20170629-interactions-mappings.pkl'
with open(interactions_pickle, 'rb') as pkl_file:
    interactions, iidx_to_cdn, cdn_to_iidx, uidx_to_icp, icp_to_uidx = pickle.load(pkl_file)

In [4]:
# Read in cdn_ips file (no header row in the file, so column names are supplied here)
cdn_ips_filepath = 'CDN_ips/all_cdn_ips.txt'
cdn_ips_header = ['cdn', 'location', 'isp', 'ips', 'ts']

# Every column is read as a string except 'ips', which is parsed as int64.
cdn_ips_datatypes = dict.fromkeys(cdn_ips_header, str)
cdn_ips_datatypes['ips'] = np.int64

cdn_ips_df = pd.read_csv(
    cdn_ips_filepath,
    sep=',',
    header=None,
    names=cdn_ips_header,
    dtype=cdn_ips_datatypes,
)

In [5]:
# Preview the first rows of the cdn_ips table
cdn_ips_df.head()

Unnamed: 0,cdn,location,isp,ips,ts
0,1,1101,10,99,201501
1,1,1101,11,24,201501
2,1,1101,12,5,201501
3,1,1200,10,19,201501
4,1,1200,11,122,201501


In [6]:
# Read in cdn_ip_locations file (no header row; every column is read as a string)
cdn_ip_locations_filepath = 'CDN_IP_Locations/all_cdn_ip_locations.txt'
cdn_ip_locations_header = ['cdn', 'ip', 'location', 'isp', 'ts']
cdn_ip_locations_dtypes = {column: str for column in cdn_ip_locations_header}

cdn_ip_locations_df = pd.read_csv(
    cdn_ip_locations_filepath,
    sep=',',
    header=None,
    names=cdn_ip_locations_header,
    dtype=cdn_ip_locations_dtypes,
)

In [85]:
# Get no. unique IPs for each CDN
# (the result is a Series indexed by CDN code and named 'ip')
cdn_num_ips = cdn_ip_locations_df.groupby('cdn')['ip'].nunique()
cdn_num_ips

cdn
001    9857
002    3223
003    1949
004     372
005     499
006      83
007      15
008       1
009      17
011       1
013     140
201     231
202     306
203     207
204     363
Name: ip, dtype: int64

In [8]:
# No. CDNs represented in each file
# https://stackoverflow.com/questions/38309729/count-unique-values-with-pandas
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# whereas the bare `print x` statement is a SyntaxError on Python 3.
print(cdn_ips_df['cdn'].nunique())
print(cdn_ip_locations_df['cdn'].nunique())

35
15


In [9]:
# Seed the per-CDN feature dict with the CDN type, which is the first
# character of the CDN code:
#   '0' -> free
#   '1' -> self-built
#   '2' -> commercial
cdn_feature_dict = {
    code: {'type': code[0]}
    for code in cdn_to_iidx
}
len(cdn_feature_dict)

39

In [10]:
# Spot-check one entry: code '201' starts with '2', so its type is '2'
cdn_feature_dict['201']

{'type': '2'}

In [37]:
# Load one cdn_temp_dns part file (no header row; every column is read as a string).
# NOTE(review): the path points into a local Desktop folder -- adjust when running elsewhere.
cdn_temp_dns_filepath = '~/Desktop/CDN Data/cdn_temp_dns/000009_0'
cdn_temp_dns_header = ['cdn', 'isp', 'cname', 'location', 'delay', 'time', 'ip', 'ts']
cdn_temp_dns_dtypes = {column: str for column in cdn_temp_dns_header}

cdn_temp_dns = pd.read_csv(
    cdn_temp_dns_filepath,
    sep=',',
    header=None,
    names=cdn_temp_dns_header,
    dtype=cdn_temp_dns_dtypes,
)

In [57]:
cdn_temp_dns.head()
# Looked at .csv file
# Looks like there can be multiple 'ip's listed before 'ts'
# (when that happens the second IP lands in the 'ts' column, as in rows 2 and 4 of the preview)

Unnamed: 0,cdn,isp,cname,location,delay,time,ip,ts
0,2,11,v.pcgames.com.cn.wscdns.com,1201,3891,20150329005953,219.136.245.206,201503
1,4,11,pic9.huitu.com.cloudcdn.net,1201,28923,20150329005953,60.191.223.86,201503
2,1,11,user.shuuemura.ccgslb.net,1201,12644,20150329005953,220.181.66.132,220.181.46.163
3,201,11,user.ourhost.com.cn.aqb.so,1201,57362,20150329005953,220.181.135.166,201503
4,1,11,user.kiehls.ccgslb.net,1201,9526,20150329005953,220.181.46.163,220.181.66.132


In [56]:
# Appears to be same CDN list as in CDN_ip_locations!
# Tested for all cdn_temp_dns files --> confirmed
# print(...) with a single argument works on both Python 2 and Python 3.
print(cdn_temp_dns['cdn'].nunique())
cdn_temp_dns.groupby('cdn')['ip'].nunique()

15


  inc = np.r_[1, val[1:] != val[:-1]]


cdn
001    4829
002    3262
003    1760
004     380
005     482
006      92
007      18
008       2
009      20
011       2
013     120
201     236
202     308
203     209
204     359
Name: ip, dtype: int64

In [51]:
# Load one cdn_temp_qos part file (no header row; every column is read as a string).
# NOTE(review): the path points into a local Desktop folder -- adjust when running elsewhere.
cdn_temp_qos_filepath = '~/Desktop/CDN Data/cdn_temp_qos/000004_0'
cdn_temp_qos_header = [
    'cdn', 'cdnip', 'location', 'isp', 'tcp', 'ft',
    'mt', 'faultFlag', 'rc', 'url', 'dns', 'ssl', 'dt',
    'tt', 'ds', 'avg', 'max', 'min', 'loss',
    'ip', 'cname', 'ts',
]

cdn_temp_qos = pd.read_csv(
    cdn_temp_qos_filepath,
    sep=',',
    header=None,
    names=cdn_temp_qos_header,
    dtype=str,
)

In [52]:
# print(...) with a single argument works on both Python 2 and Python 3.
print(cdn_temp_qos['cdn'].nunique())
print(sorted(cdn_temp_qos['cdn'].unique()))
# Missing CDN 008, 011 (compared with the 15 CDNs in the cdn_ip_locations / cdn_temp_dns files)

13
['001', '002', '003', '004', '005', '006', '007', '009', '013', '201', '202', '203', '204']


In [53]:
# Preview the first rows of the QoS sample
cdn_temp_qos.head()

Unnamed: 0,cdn,cdnip,location,isp,tcp,ft,mt,faultFlag,rc,url,...,dt,tt,ds,avg,max,min,loss,ip,cname,ts
0,1,123.150.53.7,1200,11,8.464,12.299,20150318010035,0,504,0,...,15.811,29.816,62.363,2.767,2.802,2.702,0,0,0,201503
1,1,123.150.53.76,1200,11,8.78,16.784,20150318010035,0,404,0,...,17.192,31.786,25.128,2.913,2.923,2.892,0,0,0,201503
2,1,123.150.53.77,1200,11,2.991,6.351,20150318010035,0,403,0,...,6.542,9.599,149.946,4.407,4.442,4.376,0,0,0,201503
3,1,123.150.53.78,1200,11,8.768,18.798,20150318010035,0,404,0,...,19.261,33.405,22.429,2.809,2.846,2.789,0,0,0,201503
4,1,123.150.53.79,1200,11,10.325,18.632,20150318010035,0,403,0,...,19.138,34.987,51.26,4.737,4.78,4.716,0,0,0,201503


In [55]:
# Count distinct 'cdnip' values per CDN in the QoS sample
cdn_temp_qos.groupby('cdn')['cdnip'].nunique()

cdn
001    2617
002    1673
003    1058
004     149
005     123
006      45
007       8
009      14
013      45
201     145
202      91
203      80
204     225
Name: cdnip, dtype: int64

In [86]:
# Turn the per-CDN unique-IP Series into a one-column DataFrame (column 'ip')
cdn_num_ips_df = cdn_num_ips.to_frame()

In [101]:
# quantile(1) is the largest per-CDN IP count (upper bound of the top bin)
cdn_num_ips_df['ip'].quantile(1)

9857.0

In [110]:
# Lower edges of the four bins: q1 starts at 0, while q2, q3 and q4 start at
# the 25th, 50th and 75th percentiles of the per-CDN IP counts.
num_ips_q1 = 0
num_ips_q2, num_ips_q3, num_ips_q4 = (
    cdn_num_ips_df['ip'].quantile(q) for q in (0.25, 0.50, 0.75)
)

In [111]:
def num_ips_bin(num_ips):
    """Return the bin label ('q1'..'q4') for an IP count.

    The bin edges are the module-level values num_ips_q1..num_ips_q4. Each bin
    includes its lower edge and excludes its upper edge; 'q4' has no upper
    limit. Returns the string 'Error' when the value fits no bin (for example
    when it is below num_ips_q1).
    """
    if num_ips_q1 <= num_ips < num_ips_q2:
        return 'q1'
    if num_ips_q2 <= num_ips < num_ips_q3:
        return 'q2'
    if num_ips_q3 <= num_ips < num_ips_q4:
        return 'q3'
    if num_ips >= num_ips_q4:
        return 'q4'
    return 'Error'

In [113]:
# Attach the IP-count bin to every CDN that is also an item in the
# interactions data; CDNs unknown to cdn_to_iidx are skipped.
for cdn, num_ips in cdn_num_ips_df.itertuples():
    if cdn not in cdn_to_iidx:
        continue
    cdn_feature_dict[cdn]['num_ips_bin'] = num_ips_bin(num_ips)

In [115]:
# Add the CDN code itself as a feature of each CDN.
# .items() is used instead of .iteritems() because the latter does not exist
# on Python 3; only the inner dicts are modified, so iterating is safe.
for cdn, features in cdn_feature_dict.items():
    features['cdn'] = cdn

In [116]:
# Inspect the assembled features; CDNs without IP-count data have no 'num_ips_bin' key
cdn_feature_dict

{'001': {'cdn': '001', 'num_ips_bin': 'q4', 'type': '0'},
 '002': {'cdn': '002', 'num_ips_bin': 'q4', 'type': '0'},
 '003': {'cdn': '003', 'num_ips_bin': 'q4', 'type': '0'},
 '004': {'cdn': '004', 'num_ips_bin': 'q3', 'type': '0'},
 '005': {'cdn': '005', 'num_ips_bin': 'q4', 'type': '0'},
 '006': {'cdn': '006', 'num_ips_bin': 'q2', 'type': '0'},
 '007': {'cdn': '007', 'num_ips_bin': 'q1', 'type': '0'},
 '008': {'cdn': '008', 'num_ips_bin': 'q1', 'type': '0'},
 '009': {'cdn': '009', 'num_ips_bin': 'q1', 'type': '0'},
 '011': {'cdn': '011', 'num_ips_bin': 'q1', 'type': '0'},
 '013': {'cdn': '013', 'num_ips_bin': 'q2', 'type': '0'},
 '014': {'cdn': '014', 'type': '0'},
 '101': {'cdn': '101', 'type': '1'},
 '102': {'cdn': '102', 'type': '1'},
 '103': {'cdn': '103', 'type': '1'},
 '104': {'cdn': '104', 'type': '1'},
 '105': {'cdn': '105', 'type': '1'},
 '106': {'cdn': '106', 'type': '1'},
 '107': {'cdn': '107', 'type': '1'},
 '108': {'cdn': '108', 'type': '1'},
 '109': {'cdn': '109', 'type'

In [117]:
# Build the list of CDN feature dicts so that position i holds the
# features of the CDN whose item index (iidx) is i.
cdn_feature_list = []
for iidx in range(len(iidx_to_cdn)):
    cdn_feature_list.append(cdn_feature_dict[iidx_to_cdn[iidx]])

In [120]:
# Sanity check: the feature dict at position i should belong to iidx_to_cdn[i].
# print(...) with a single argument works on both Python 2 and Python 3.
print(cdn_feature_list[0])
print(cdn_feature_list[1])
print(iidx_to_cdn[0])
print(iidx_to_cdn[1])

{'num_ips_bin': 'q4', 'type': '0', 'cdn': '002'}
{'num_ips_bin': 'q3', 'type': '0', 'cdn': '004'}
002
004


In [121]:
# Vectorize! (One-hot encodings of each CDN's features)
# All feature values are strings, so DictVectorizer creates one column per
# distinct feature=value pair; rows follow the order of cdn_feature_list.
from sklearn.feature_extraction import DictVectorizer
cdn_vectorizer = DictVectorizer()
cdn_feature_vectors = cdn_vectorizer.fit_transform(cdn_feature_list)

In [122]:
cdn_feature_vectors
# 46 columns = 39 unique CDN codes + 7 extra features: 4 bins + 3 types

<39x46 sparse matrix of type '<type 'numpy.float64'>'
	with 93 stored elements in Compressed Sparse Row format>

In [123]:
import pickle

# Persist the sparse CDN feature matrix.
# The file must be opened in binary mode ('wb'): protocol -1 selects the
# highest (binary) pickle protocol, and writing it through a text-mode handle
# corrupts the data on Windows under Python 2 and raises TypeError on Python 3.
with open('20170703-cdn-feature-vectors.pkl', 'wb') as output:
    pickle.dump(cdn_feature_vectors, output, -1)