## Training Data Representations
### This notebook is available at:
https://github.com/numeristical/resources/tree/master/lecture_notebooks/Training_Data_Reps.ipynb

In [1]:
import numpy as np
import pandas as pd
# Raise pandas' display.max_rows option to 999 for displayed frames and series.
pd.options.display.max_rows = 999

In [2]:
# Two-letter codes for the 50 US states plus DC, sorted alphabetically by code.
us_states = np.array(
    (
        "AK AL AR AZ CA CO CT DC DE FL GA "
        "HI IA ID IL IN KS KY LA MA MD ME "
        "MI MN MO MS MT NC ND NE NH NJ NM "
        "NV NY OH OK OR PA RI SC SD TN TX "
        "UT VA VT WA WI WV WY"
    ).split()
)
us_states

array(['AK', 'AL', 'AR', 'AZ', 'CA', 'CO', 'CT', 'DC', 'DE', 'FL', 'GA',
       'HI', 'IA', 'ID', 'IL', 'IN', 'KS', 'KY', 'LA', 'MA', 'MD', 'ME',
       'MI', 'MN', 'MO', 'MS', 'MT', 'NC', 'ND', 'NE', 'NH', 'NJ', 'NM',
       'NV', 'NY', 'OH', 'OK', 'OR', 'PA', 'RI', 'SC', 'SD', 'TN', 'TX',
       'UT', 'VA', 'VT', 'WA', 'WI', 'WV', 'WY'], dtype='<U2')

In [3]:
# Lookup table from state code to its 0-based position in `us_states`.
us_states_dict = {code: position for position, code in enumerate(us_states)}
us_states_dict

{'AK': 0,
 'AL': 1,
 'AR': 2,
 'AZ': 3,
 'CA': 4,
 'CO': 5,
 'CT': 6,
 'DC': 7,
 'DE': 8,
 'FL': 9,
 'GA': 10,
 'HI': 11,
 'IA': 12,
 'ID': 13,
 'IL': 14,
 'IN': 15,
 'KS': 16,
 'KY': 17,
 'LA': 18,
 'MA': 19,
 'MD': 20,
 'ME': 21,
 'MI': 22,
 'MN': 23,
 'MO': 24,
 'MS': 25,
 'MT': 26,
 'NC': 27,
 'ND': 28,
 'NE': 29,
 'NH': 30,
 'NJ': 31,
 'NM': 32,
 'NV': 33,
 'NY': 34,
 'OH': 35,
 'OK': 36,
 'OR': 37,
 'PA': 38,
 'RI': 39,
 'SC': 40,
 'SD': 41,
 'TN': 42,
 'TX': 43,
 'UT': 44,
 'VA': 45,
 'VT': 46,
 'WA': 47,
 'WI': 48,
 'WV': 49,
 'WY': 50}

In [4]:
# Number of synthetic rows to generate.
n_data_pts = 1000000

# Seed the legacy global NumPy RNG so that the np.random.* draws made after
# this cell are repeatable under Restart Kernel -> Run All.
RANDOM_SEED = 0
np.random.seed(RANDOM_SEED)

In [5]:
# Age in whole years, uniform over 20..100 (the `high` bound of 101 is exclusive).
x1 = np.random.randint(low=20, high=101, size=n_data_pts)

In [6]:
# Height in meters: one of 1.0, 1.1, ..., 2.4.
# Draw integer tenths of a meter in 10..24, then scale down by 10.
tenths_of_meter = np.random.randint(low=10, high=25, size=n_data_pts)
x2 = tenths_of_meter / 10

In [7]:
# State of residence: one code per row, sampled from `us_states` (50 states + DC).
x3 = np.random.choice(a=us_states, size=n_data_pts)

In [8]:
# Binary 0/1 target (40% of people like broccoli): 1 where a uniform [0, 1)
# draw falls below 0.4, else 0.
# `np.int` was deprecated in NumPy 1.20 and removed in 1.24; it was only an
# alias of the builtin `int`, so `astype(int)` produces the same dtype.
y = (np.random.uniform(size=n_data_pts) < 0.4).astype(int)

In [9]:
# Peek at the first 100 target labels.
y[:100]

array([1, 1, 0, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1,
       1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1, 0, 0,
       1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 1,
       1, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1])

In [10]:
# Assemble the three features and the target into one table, one column per array.
data_frame = pd.DataFrame(
    {
        'age': x1,
        'height': x2,
        'state': x3,
        'broccoli': y,
    }
)
data_frame

Unnamed: 0,age,height,state,broccoli
0,21,1.7,IA,1
1,75,1.1,AL,1
2,82,1.2,NE,0
3,51,1.4,CA,0
4,28,1.8,IL,0
...,...,...,...,...
999995,34,2.1,TX,1
999996,46,1.8,MI,1
999997,36,2.2,SC,0
999998,97,1.0,IL,0


In [11]:
# Convert each feature column to a 0-based integer code.

# Age 20..100 -> 0..80.
age_index = data_frame.age - 20

# Height 1.0..2.4 m -> 0..14. Round before casting: `height * 10` is a float
# product and an integer cast truncates, so a product that lands just below a
# whole number would otherwise fall into the wrong bucket. The builtin `int`
# replaces `np.int`, an alias of it that was removed in NumPy 1.24.
height_index = (data_frame.height * 10).round().astype(int) - 10

# State code -> its position in `us_states`, as a vectorized dictionary lookup.
# Every value in `state` was sampled from `us_states`, so every lookup hits.
state_index = data_frame.state.map(us_states_dict)

In [12]:
# Sanity check: the distinct age codes present in the data, sorted.
np.unique(age_index)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50,
       51, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 62, 63, 64, 65, 66, 67,
       68, 69, 70, 71, 72, 73, 74, 75, 76, 77, 78, 79, 80])

In [13]:
# Sanity check: the distinct height codes present in the data, sorted.
np.unique(height_index)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14])

In [14]:
# Sanity check: the distinct state codes present in the data, sorted.
np.unique(state_index)

array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15, 16,
       17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33,
       34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44, 45, 46, 47, 48, 49, 50])

In [15]:
# Size of the joint feature space: 81 age codes x 15 height codes x 51 state codes.
81*15*51

61965

In [16]:
# Collapse the three codes into a single mixed-radix cell id: state is the
# fastest-varying digit (base 51), then height (base 15), then age.
x_index = (age_index * 15 + height_index) * 51 + state_index
x_index[:10]

0     1134
1    42127
2    47561
3    23923
4     6542
5    15711
6    16141
7    46075
8    41972
9    15094
dtype: int64

In [17]:
# Two-column training table: flattened cell id `x` and binary label `y`.
data_list_1 = pd.DataFrame({'x': x_index, 'y': y})

In [18]:
# First ten rows of the original feature table, for comparison with the indexed form.
data_frame.head(10)

Unnamed: 0,age,height,state,broccoli
0,21,1.7,IA,1
1,75,1.1,AL,1
2,82,1.2,NE,0
3,51,1.4,CA,0
4,28,1.8,IL,0
5,40,1.8,AZ,1
6,41,1.1,MS,0
7,80,1.3,MI,1
8,74,2.2,WY,0
9,39,2.0,WV,1


In [19]:
# First ten rows of the (cell id, label) table.
data_list_1.head(10)

Unnamed: 0,x,y
0,1134,1
1,42127,1
2,47561,0
3,23923,0
4,6542,0
5,15711,1
6,16141,0
7,46075,1
8,41972,0
9,15094,1


In [20]:
# The (cell id, label) pairs as a plain Python list of tuples, one per row.
data_list_2 = [(cell_id, label) for cell_id, label in zip(x_index, y)]
data_list_2[:10]

[(1134, 1),
 (42127, 1),
 (47561, 0),
 (23923, 0),
 (6542, 0),
 (15711, 1),
 (16141, 0),
 (46075, 1),
 (41972, 0),
 (15094, 1)]

In [21]:
# Range check: smallest and largest cell id present in the data.
np.min(x_index), np.max(x_index)

(0, 61964)

In [22]:
# Total number of distinct feature cells: the product of the number of age,
# height, and state codes.
n_age_codes, n_height_codes, n_state_codes = 81, 15, 51
x_num_vals = n_age_codes * n_height_codes * n_state_codes
x_num_vals

61965

In [23]:
# Integer table of zeros with one row per cell id and two columns.
# Allocate integers directly with the builtin `int` (the `np.int` alias was
# removed in NumPy 1.24) instead of building a float array and converting it.
count_matrix = np.zeros((x_num_vals, 2), dtype=int)

In [24]:
# Tally how many times each (cell id, label) pair occurs.
# Reset first: the loop increments in place, so without this a second run of
# the cell would double every count.
count_matrix[:] = 0
for xi, yi in data_list_2:
    count_matrix[xi, yi] += 1

In [25]:
# Show the count matrix (one row per cell id, one column per label value).
count_matrix

array([[16,  7],
       [ 9,  5],
       [ 5,  6],
       ...,
       [12,  3],
       [14,  8],
       [13,  7]])

In [26]:
# Look at the counts for ten consecutive cell ids, 3870 through 3879.
count_matrix[3870:3880,:]

array([[ 7,  8],
       [15,  9],
       [13,  9],
       [ 8,  4],
       [12,  3],
       [ 6,  6],
       [ 6,  6],
       [11,  5],
       [10, 10],
       [ 8,  3]])

In [27]:
# Number of observations in each cell: the two label counts of a row added together.
row_sum_vec = count_matrix.sum(axis=1)
len(row_sum_vec)

61965

In [28]:
# Empirical label distribution per cell: divide each row of counts by that
# row's total. A cell with no observations has total 0; its row is left as NaN
# explicitly instead of triggering a 0/0 RuntimeWarning.
row_totals = row_sum_vec[:, np.newaxis]
prob_matrix = np.divide(
    count_matrix,
    row_totals,
    out=np.full(count_matrix.shape, np.nan),
    where=row_totals > 0,
)

In [29]:
# Per-cell label proportions for the first ten cell ids.
prob_matrix[:10,:]

array([[0.69565217, 0.30434783],
       [0.64285714, 0.35714286],
       [0.45454545, 0.54545455],
       [0.63636364, 0.36363636],
       [0.41176471, 0.58823529],
       [0.5       , 0.5       ],
       [0.53333333, 0.46666667],
       [0.66666667, 0.33333333],
       [0.82352941, 0.17647059],
       [0.66666667, 0.33333333]])

In [30]:
# Raw counts for the same first ten cell ids, to compare against the proportions.
count_matrix[:10,:]

array([[16,  7],
       [ 9,  5],
       [ 5,  6],
       [14,  8],
       [ 7, 10],
       [11, 11],
       [ 8,  7],
       [10,  5],
       [14,  3],
       [10,  5]])