# AWS ALS Factor Import and Conversion

In [2]:
# Shell escape to the AWS CLI: with no arguments, `aws s3 ls` lists the S3 buckets.
# NOTE(review): the saved output contains account-specific bucket names — consider clearing it before sharing.
! aws s3 ls

2019-07-23 11:10:37 aws-logs-260329411851-us-east-1
2019-07-16 19:00:42 fp-movielens-data


In [21]:
import boto3
import os
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from scipy.spatial import distance_matrix

In [4]:
# Two boto3 entry points onto S3: the low-level client (used below for get_object)
# and the resource API, from which a handle on the project bucket is taken.
client = boto3.client(service_name='s3')
s3 = boto3.resource(service_name='s3')
my_bucket = s3.Bucket('fp-movielens-data')

In [5]:
# Print every object in the bucket as "<bucket name>/<key>".
for s3_object in my_bucket.objects.all():
    object_path = os.path.join(s3_object.bucket_name, s3_object.key)
    print(object_path)

fp-movielens-data/_metadata
fp-movielens-data/e-MJ39QHZPM80UH29RR62G779J/FIFP.ipynb
fp-movielens-data/e-MJ39QHZPM80UH29RR62G779J/metadata/metadata.zip
fp-movielens-data/item_factors.csv
fp-movielens-data/item_factors.csv/_SUCCESS
fp-movielens-data/item_factors.csv/part-00000-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00001-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00002-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00003-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00004-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00005-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00006-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv/part-00007-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv
fp-movielens-data/item_factors.csv

## Item Factors

In [6]:
# Download the ten item-factor part files from S3 and stack them into one long frame.
# NOTE(review): the part count (10), the run-specific file id and the single-digit
# 'part-0000{}' padding are hard-coded; this must be updated if the export is regenerated.
item_part_key = 'item_factors.csv/part-0000{}-40db7616-e552-48cd-bb18-9fba706fe5aa-c000.csv'

item_factors = []
for part_no in range(10):
    response = client.get_object(Bucket='fp-movielens-data', Key=item_part_key.format(part_no))
    # The files carry no header row, so pandas assigns integer column labels.
    part_frame = pd.read_csv(response['Body'], header=None)
    print(len(part_frame))  # row count per part, as a quick sanity check
    item_factors.append(part_frame)

# Rebuild a fresh 0..n-1 row index across all parts.
item_factors_df = pd.concat(item_factors, axis=0, ignore_index=True)

120036
124236
118776
120540
121170
121926
116340
121170
118230
125286


In [7]:
# Name the two header-less columns: the item id and a single factor value per row.
item_factors_df.columns = pd.Index(['id', 'features'])
item_factors_df.head(5)

Unnamed: 0,id,features
0,10,-0.255329
1,10,0.135568
2,10,0.424694
3,10,0.135304
4,10,0.058897


In [8]:
# Number of factor rows stored for the first id group, used as the factorisation rank.
# The group is picked positionally with .iloc[0], so the lookup works whatever the
# smallest id is (a label lookup of `1` raises KeyError when id 1 is not the first group).
# Assumes every id has the same number of rows — the pivot cell relies on that too.
rank = item_factors_df.groupby('id')['features'].count().iloc[0]

In [9]:
# Reshape the long (id, features) table into one row per id and one column per factor.
multiplier_factors = item_factors_df['id'].nunique()  # number of distinct item ids

item_groups = item_factors_df.groupby('id')

# Every id must contribute the same number of factor rows, otherwise the pivot
# below would contain silent NaN holes.
if item_groups.size().nunique() > 1:
    raise ValueError('item ids do not all have the same number of factor rows')

# 1-based position of each row inside its own id group. Deriving it from the data
# keeps the labelling correct even if the rows of one id are not contiguous.
factors_feature_array = (item_groups.cumcount() + 1).to_numpy()
item_factors_df['value'] = factors_feature_array

item_factors_unstacked = item_factors_df.pivot(index='id', columns='value', values='features')
item_factors_unstacked.head()

value,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.339038,0.133855,0.431191,0.170945,0.060187,0.008598,0.05416,-0.401662,-0.193391,0.355262,...,-0.052306,-0.392383,0.141144,0.529034,0.49472,-0.141302,-0.088752,-0.589613,-0.088734,-0.093626
2,-0.206917,0.183585,0.479785,0.222415,-0.039014,0.10934,0.026195,-0.383216,-0.142015,0.147401,...,-0.10604,-0.21355,0.180178,0.553887,0.478742,-0.033819,0.00149,-0.661385,0.055707,-0.049976
3,-0.207074,0.255722,0.39671,0.260146,-0.004491,0.083058,0.009378,-0.374817,-0.110707,0.119876,...,-0.175207,-0.145357,0.177796,0.57237,0.468172,-0.011677,0.052738,-0.643598,0.087316,-0.037295
4,-0.106216,0.210775,0.450775,0.287757,-0.121598,0.122187,-0.048723,-0.411978,-0.057141,0.138302,...,-0.213138,-0.150968,0.100139,0.528075,0.437014,-0.018255,0.019652,-0.623169,0.139524,0.032842
5,-0.15002,0.249664,0.427565,0.288323,-0.057598,0.054907,-0.013541,-0.382258,-0.067867,0.162254,...,-0.28803,-0.102036,0.111018,0.560844,0.508875,-0.012856,0.088838,-0.613712,0.123424,0.020842


In [10]:
# Persist the wide item-factor table (id index plus one column per factor).
item_factors_unstacked.to_csv(path_or_buf='../data/processed/item_factors.csv')

## User Factors

In [11]:
# Download the ten user-factor part files from S3 and stack them into one long frame.
# NOTE(review): the part count (10), the run-specific file id and the single-digit
# 'part-0000{}' padding are hard-coded; this must be updated if the export is regenerated.
user_part_key = 'user_factors.csv/part-0000{}-59dd1ef1-da71-4926-b18b-5a0d5f059a90-c000.csv'

user_factors = []
for part_no in range(10):
    response = client.get_object(Bucket='fp-movielens-data', Key=user_part_key.format(part_no))
    # The files carry no header row, so pandas assigns integer column labels.
    part_frame = pd.read_csv(response['Body'], header=None)
    print(len(part_frame))  # row count per part, as a quick sanity check
    user_factors.append(part_frame)

# Rebuild a fresh 0..n-1 row index across all parts.
user_factors_df = pd.concat(user_factors, axis=0, ignore_index=True)

1027362
1024464
1024590
1021818
1022112
1020138
1026438
1022574
1022322
1021818


In [12]:
# Name the two header-less columns: the user id and a single factor value per row.
user_factors_df.columns = pd.Index(['id', 'features'])
user_factors_df.head(5)

Unnamed: 0,id,features
0,10,-0.352436
1,10,0.078241
2,10,0.438761
3,10,0.1232
4,10,0.041807


In [13]:
# Reshape the long (id, features) table into one row per id and one column per factor.
multiplier_factors = user_factors_df['id'].nunique()  # number of distinct user ids

user_groups = user_factors_df.groupby('id')

# Every id must contribute the same number of factor rows, otherwise the pivot
# below would contain silent NaN holes.
if user_groups.size().nunique() > 1:
    raise ValueError('user ids do not all have the same number of factor rows')

# 1-based position of each row inside its own id group. It is derived from the user
# data itself, so this cell does not depend on a rank computed from another table
# and stays correct even if the rows of one id are not contiguous.
factors_feature_array = (user_groups.cumcount() + 1).to_numpy()
user_factors_df['value'] = factors_feature_array

user_factors_unstacked = user_factors_df.pivot(index='id', columns='value', values='features')
user_factors_unstacked.head()

value,1,2,3,4,5,6,7,8,9,10,...,33,34,35,36,37,38,39,40,41,42
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,-0.38047,0.103641,0.310486,0.07845,0.129363,-0.033363,0.030401,-0.360824,-0.213137,0.370407,...,0.022118,-0.480875,0.194733,0.479119,0.32839,-0.123725,-0.188246,-0.525,-0.133443,-0.167593
2,-0.226508,0.167774,0.53694,0.235142,-0.036765,0.118235,0.024486,-0.449235,-0.145772,0.214047,...,-0.048399,-0.314272,0.170148,0.567598,0.492936,-0.072118,-0.063404,-0.715515,0.038176,-0.056638
3,-0.240955,0.179579,0.439598,0.231143,0.003269,0.110166,0.018843,-0.457156,-0.184155,0.168279,...,-0.037431,-0.375902,0.142993,0.607431,0.525451,-0.006926,-0.018648,-0.715832,0.011471,-0.136575
4,-0.271722,0.041331,0.368844,-0.016227,0.102699,0.047941,0.020815,-0.335555,-0.272367,0.089399,...,0.136988,-0.530468,0.179969,0.52721,0.44011,0.124539,-0.082171,-0.663929,-0.0655,-0.327186
5,-0.368851,0.104798,0.349379,0.013516,0.07051,0.036399,-0.017281,-0.401808,-0.26634,0.203034,...,0.118943,-0.581576,0.247609,0.590982,0.318466,0.053089,-0.191376,-0.671003,-0.065487,-0.266448


In [14]:
# Persist the wide user-factor table (id index plus one column per factor).
user_factors_unstacked.to_csv(path_or_buf='../data/processed/user_factors.csv')

In [18]:
# Standardise each factor column of the wide user table with scikit-learn's StandardScaler.
# NOTE(review): fit_transform returns a bare array, so the resulting frame has a fresh
# 0-based row index and 0-based column labels — the user ids are no longer attached.
scaler = StandardScaler()
user_factors_scaled = pd.DataFrame(scaler.fit_transform(user_factors_unstacked))
user_factors_scaled.head(5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,32,33,34,35,36,37,38,39,40,41
0,-1.188306,-0.423128,-0.865804,-0.461143,1.503241,-1.420149,0.436637,0.537105,-0.388644,1.678028,...,0.147211,-0.610899,0.022109,-0.623824,-0.597647,-1.195553,-0.860215,0.950591,-1.34448,-0.125517
1,0.610881,0.386616,0.886891,1.04974,-0.870483,0.911214,0.325355,-0.699159,0.774798,0.203294,...,-0.290968,0.543098,-0.195558,0.082682,0.54048,-0.689802,0.302174,-0.429075,0.473121,1.031875
2,0.44205,0.535666,0.133487,1.011178,-0.298466,0.787118,0.21921,-0.809915,0.111883,-0.228379,...,-0.222816,0.116207,-0.435975,0.400747,0.765383,-0.050916,0.718896,-0.431378,0.190288,0.19803
3,0.082511,-1.209848,-0.414129,-1.37406,1.122249,-0.169807,0.256308,0.890445,-1.4116,-0.972348,...,0.861,-0.954414,-0.108612,-0.239817,0.175094,1.23746,0.127439,-0.055505,-0.624907,-1.79028
4,-1.052518,-0.408519,-0.564778,-1.087267,0.662312,-0.347311,-0.460341,-0.035977,-1.307515,0.099425,...,0.748868,-1.308421,0.490244,0.269406,-0.666289,0.537238,-0.889354,-0.106731,-0.62477,-1.156709


In [19]:
# Persist the standardised user factors (rows are keyed by position, not by user id).
user_factors_scaled.to_csv(path_or_buf='../data/processed/user_factors_scaled.csv')

# Clusters

In [20]:
# Hard-coded cluster centres: 18 rows of 42 values each (the same width as the
# 42 factor columns produced above).
# NOTE(review): presumably the centroids of a clustering fitted elsewhere on the scaled
# user factors — the provenance is not shown in this notebook; confirm and record it.
centers = [[ 0.28877629, -0.31662165,  1.22037831, -0.18632312, -0.8404911 ,
       -0.17297981,  0.06137925, -0.16357236, -0.29054178, -0.1119541 ,
       -1.51812969, -1.36944816, -0.01075161,  0.8239816 , -0.64700946,
       -0.29521195,  0.29542348,  0.54606688,  0.12868545,  0.71261011,
        0.96606564,  0.56516016,  1.55411054, -0.69703048,  0.47069417,
       -0.3598933 ,  0.38375023,  0.31860142,  0.76133581, -1.49494138,
       -0.98625828,  0.83935706, -0.77331557,  0.10952667,  0.47394083,
        0.86596856,  0.76381158,  0.783475  ,  0.33024765, -0.64871124,
        0.76001841,  0.32408677],[ 1.55341457, -0.1518466 , -0.14055032,  0.54191726, -1.23388717,
       -0.34642904, -1.21651759, -0.45825569,  1.3801457 ,  0.73151009,
       -0.64683757, -0.38082225, -1.31753049,  0.94402068,  1.3722987 ,
        1.58976338, -1.89859576,  0.25557667, -0.87077814, -1.15819278,
       -0.18346182, -0.17011643,  0.54042879, -1.08493901,  0.15058933,
        0.85015969, -0.37188864,  1.14397349, -1.50776033,  0.21964242,
        1.17240095, -1.21245827, -0.79847665, -0.19749224, -1.68476471,
       -0.50476243,  0.38720879,  0.06331017,  0.64850728,  0.59878004,
        0.38314049,  1.0792702 ],[-1.08952573, -0.43511791, -0.54457696, -0.59364048,  0.98957106,
       -0.13369503,  0.48324086,  0.28445418, -0.71030091,  0.49939541,
        0.34042822,  1.0464559 ,  0.41027915,  0.01502657,  0.19792598,
       -0.85159251,  0.55521978,  0.00406205, -0.52363425, -0.06254098,
       -0.87173117, -0.8077491 , -0.88378639,  0.31878713, -0.23696542,
       -0.11373483, -0.56372177, -1.06249854, -0.17930149,  0.82175701,
        0.39349707, -0.37075555,  0.95859703, -0.62824499,  0.36069278,
       -0.46648314, -0.77569688, -0.70812571, -1.02470484,  0.41225868,
       -1.00392825, -0.56851676],[ 1.55814925, -0.40222951, -0.94798064, -0.19771981, -0.40629961,
       -0.11871227, -0.03587482,  2.11079261,  1.42409285, -1.0871497 ,
        0.42990747, -1.15507698,  0.86756216, -1.91929916, -1.4671849 ,
        1.74320807, -0.82027321,  0.56302775,  1.80892727, -1.66516404,
        1.44118244,  0.17953442, -0.39680462, -0.61223195, -0.57019892,
        1.76973892,  0.18653248,  2.01279174,  1.013238  , -0.71515927,
        0.60033868, -0.22223655, -0.33506029,  1.47089518, -0.50901753,
       -1.43121815, -0.80169965,  0.2506601 ,  0.73665019,  1.51987027,
        0.46260446,  0.77282921],[ 1.0024296 ,  1.09357781,  0.72658745,  1.30101384, -1.10588607,
        0.34328838, -0.58074117, -1.03920058,  0.99462198, -0.21752746,
        0.0991846 , -0.72284016, -0.98084905,  0.15340677,  0.41802264,
        0.93393226, -0.63495555, -0.31071367,  0.29748317,  0.39704668,
        0.56808102,  0.84828377,  0.65613682,  0.13320499,  0.61000588,
       -0.18381392,  0.85559777,  0.77066089, -0.41540624, -0.40765778,
       -0.21123257,  0.23666883, -1.16592269,  0.74666312, -0.57296064,
        0.75897545,  1.15391018,  0.32734367,  1.21953737, -0.78352039,
        1.08962217,  0.81636144],[-0.1942105 , -0.87332541, -1.24610143, -1.32733089,  0.67192026,
       -0.23658286, -0.41689636,  1.14222599, -0.64934989, -0.58268714,
        0.21339069,  0.59537874,  0.92119982, -0.66187316, -0.31432103,
        0.04842762, -0.17682084, -0.11761783, -0.00368738, -1.02138609,
       -0.42476214, -0.34559763, -0.36607431, -0.47641059, -1.1986485 ,
        0.80436186, -0.89002609,  0.31220172,  0.38097933,  0.42635136,
        0.09105539, -0.38884573,  0.96187701, -0.72270365,  0.02427366,
       -0.76054097, -1.19187993,  0.58490887, -0.63965767,  0.80765963,
       -0.62624964, -1.05398189],[-0.1163038 ,  0.21515845,  0.39712053,  0.55329778, -0.14038785,
       -0.04837842,  0.09713743, -0.74752468,  0.17665016,  0.83109765,
       -0.13191536,  0.17722077, -0.80816633,  0.77208817,  0.81038866,
       -0.15344632, -0.10700395,  0.12788022, -0.65011499,  0.37757846,
       -0.33946363, -0.16561378,  0.03769785,  0.22616682,  0.63629011,
       -0.46790737,  0.14873412, -0.50355114, -0.76042804,  0.29180816,
        0.3371542 , -0.22492002, -0.25662639, -0.15332612, -0.29840641,
        0.17987347,  0.48750497, -0.59676728,  0.02261569, -0.20540452,
       -0.12732316,  0.44209562],[ 0.98727419,  1.09524775,  0.01140382, -0.04406752, -1.82241195,
       -0.2795169 , -2.73779008, -0.67643014,  1.04566015, -1.3454567 ,
       -0.28102627, -0.76723752,  0.97793282, -0.42009523, -0.39992974,
        2.11683627, -0.81756077, -1.8684314 ,  0.41649193, -0.79009612,
        0.02041577,  0.14998123,  1.21963615, -2.28932861, -2.0045164 ,
        1.91138515, -0.85491402,  1.52303778,  0.06157037, -0.31003731,
       -0.62156399, -0.77467866, -1.09506359,  0.25268968,  0.15380987,
        1.23266139, -0.5038773 ,  2.25696183,  0.39562156, -0.71176101,
        2.11868709,  0.05874267],[ 0.97136083,  1.65766547,  1.41115559,  1.62879074, -1.14508601,
        1.03533035,  0.36960573, -0.56343328,  0.99172303, -1.40108015,
        0.38703164, -1.65243939,  0.14978119, -1.06370335, -1.48880459,
        0.65361436,  0.57930546, -0.56556569,  2.06094177,  1.0983325 ,
        1.86237604,  1.44381752,  0.59036358,  0.58046834,  0.63621863,
       -0.32611849,  1.63814061,  0.92618039,  1.29470179, -1.56387215,
       -1.22852791,  1.46223537, -1.42310938,  2.08665651,  0.70866812,
        1.26165013,  1.27391137,  0.53910105,  1.59263765, -1.37384272,
        1.8771795 ,  0.97147431],[-0.54834275,  1.35629636,  1.6261856 ,  1.17986038, -0.61132196,
        1.32558501,  0.53650094, -1.41415129, -0.21742605, -0.47077116,
        0.02637271, -0.36436359,  0.40129102,  0.11644198, -0.79545013,
       -0.7437176 ,  1.4776695 , -1.14139594,  0.72173404,  1.50761453,
        0.36778579,  0.20746563,  0.12120431,  0.4032191 ,  0.39166062,
       -0.64453887,  0.61795432, -0.94878609,  0.71291076, -0.7408007 ,
       -1.02855135,  0.88205929, -0.47174905,  0.80714132,  1.55552701,
        1.59134226,  0.6865262 , -0.15514092,  0.03236078, -1.81102458,
        1.03301275,  0.3956007 ],[-0.88169987,  0.66634889, -0.04027864, -0.32396187,  0.81762261,
        0.64629782,  0.61616258,  0.68616026, -0.83420146, -1.64599121,
        1.16207661, -0.07497028,  1.98294795, -1.92373351, -2.08938883,
       -0.66512287,  1.83678174, -1.00809172,  1.78984265,  0.78335721,
        0.66634773,  0.42830989, -0.44604935,  0.6285283 , -0.94518105,
       -0.12015577,  0.31459305, -0.32789895,  2.07482195, -0.44320307,
       -1.47810679,  1.20831263,  0.3650672 ,  0.84733132,  1.90584756,
        0.6940908 , -0.63151823,  0.59943559, -0.23565875, -0.72870702,
        0.44988316, -1.00785533],[-0.17982107, -0.59064234, -0.66128017, -0.20468032,  0.35756539,
       -0.45017292, -0.04129092,  0.0537323 ,  0.35069175,  1.14215779,
        0.04256262,  0.90083917, -0.36761372,  0.59677665,  1.07017596,
        0.08596559, -0.64437568,  0.29974057, -1.14166665, -0.85999009,
       -1.06901336, -1.09235006, -0.86113193, -0.30925521, -0.07060604,
        0.375843  , -0.88981629, -0.47840824, -1.26792884,  1.09416729,
        1.38911478, -1.24742414,  0.55554005, -0.64339224, -0.7630022 ,
       -0.99492058, -0.61094838, -0.97783233, -0.73247526,  0.99139359,
       -0.87471194,  0.22272655],[ 0.38193546, -0.21335204, -0.52985343, -0.0858075 ,  0.16233539,
       -0.43747492, -0.17741188,  0.38299483,  0.32787446,  0.10173156,
        0.27603298,  0.03995951, -0.36296284, -0.20978719,  0.35273155,
        0.44162125, -0.62614894,  0.40942438, -0.09119552, -0.41613918,
        0.02845879,  0.17502425, -0.05482416,  0.13020101,  0.06380857,
        0.08598389,  0.07587495,  0.48888014, -0.31212749,  0.27929698,
        0.34451141, -0.18972416, -0.05470334, -0.01664849, -0.71082593,
       -0.5043671 ,  0.01499933,  0.01823551,  0.30945247,  0.5526431 ,
       -0.23295445, -0.01389886],[ 0.07461354,  0.54370843,  0.13732129,  0.36582083,  0.25993918,
        0.02760898,  0.35928837,  0.08849642,  0.01851777, -0.59265956,
        0.63878406, -0.35414858, -0.01455661, -0.7574104 , -0.49785057,
       -0.05520671,  0.3277009 ,  0.06308877,  0.80338985,  0.6705335 ,
        0.68116727,  0.90706097,  0.13065675,  0.88532327,  0.33146234,
       -0.66884766,  0.88824997,  0.27755741,  0.61531396, -0.31835628,
       -0.70191657,  0.91127381, -0.34038582,  0.65333206,  0.14389902,
        0.36593276,  0.55516533,  0.27372924,  0.7238396 , -0.39552561,
        0.30730936, -0.1908248 ],[-0.5160395 , -0.08534118,  0.04580661, -0.49664909,  0.0589856 ,
        0.12469523, -0.35773215, -0.27235066, -0.75687487, -0.20133842,
       -0.29935163,  0.31682053,  0.39172384,  0.30542987, -0.02676998,
       -0.43237455,  0.35798129, -0.50359847, -0.31949557,  0.17697063,
       -0.42840036, -0.14122405,  0.26335202, -0.33915725, -0.5289368 ,
        0.01172577, -0.43689283, -0.38750136,  0.16812248,  0.034891  ,
       -0.4610759 ,  0.01937044,  0.34219546, -0.63700663,  0.51439468,
        0.47476368, -0.26993179,  0.49669829, -0.48425986, -0.43848291,
       -0.01687967, -0.6253281 ],[ 1.67885237,  0.31736742,  0.80138659,  1.2863909 , -1.10598975,
       -0.28004061,  0.31139596, -0.03692182,  1.50358145,  0.31420124,
       -0.54010978, -1.61587406, -1.69491223,  0.27537861,  0.25581692,
        1.0716815 , -1.16649679,  1.37388185,  0.58134592,  0.18616761,
        1.56040442,  1.20026318,  0.93032989,  0.51076729,  1.75300428,
       -0.4206045 ,  1.61637708,  1.43249314, -0.28800146, -1.11576868,
        0.24428102,  0.59222682, -1.52473279,  1.26279264, -1.35864862,
       -0.17327422,  1.66441161, -0.17762812,  1.85466182,  0.16061104,
        0.70319225,  1.62236832],[-0.40162503, -0.86579468,  0.02016361, -0.52689752,  0.86926746,
       -0.11468269,  1.15828071,  0.7038864 , -0.94676432,  0.23154683,
       -0.27796044, -0.06714929, -0.159352  ,  0.12948867, -0.26372764,
       -1.12073497,  0.51629688,  1.08260518,  0.01379542,  0.49208956,
        0.38157701,  0.31084524,  0.09668543,  0.84582225,  0.74746559,
       -0.98692017,  0.44691958, -0.38014765,  0.58440156, -0.35987312,
       -0.34052474,  0.82975191,  0.43395174, -0.28484556,  0.14554158,
       -0.37449225,  0.28236563, -0.33312729, -0.07184176,  0.24076663,
       -0.86558374, -0.42809179],[-0.66116999, -0.99151323, -1.3102048 , -1.19981797,  0.50321765,
       -0.01985636, -0.46123438,  0.61296724, -0.26256721,  0.46419371,
       -0.08583333,  1.50738054,  0.94867812,  0.16191521,  0.43769395,
       -0.02191773, -0.3422836 , -0.51304589, -1.00204604, -1.72959679,
       -1.65159719, -1.91877381, -1.41636672, -1.17424312, -1.43822698,
        1.57873008, -1.98984892, -0.63787602, -0.80092777,  1.30011676,
        1.48109344, -1.81628021,  1.56106353, -1.18324286, -0.03503662,
       -1.30057397, -1.99687426, -0.57822405, -1.7713354 ,  1.25867872,
       -1.08164335, -0.45658692]]

In [24]:
# Pairwise Minkowski distances with p=2 (i.e. Euclidean) between every pair of
# cluster centres, saved as a square table indexed by centre position.
centroids = pd.DataFrame(data=centers)
cluster_centroids_array = centroids.to_numpy()
cluster_distance_matrix = distance_matrix(
    x=cluster_centroids_array,
    y=cluster_centroids_array,
    p=2,
)
cluster_distance_df = pd.DataFrame(data=cluster_distance_matrix)
cluster_distance_df.to_csv(path_or_buf='../data/processed/cluster_distances.csv')