In [0]:
import random
from pyspark.ml.clustering import KMeans
from pyspark.ml.evaluation import ClusteringEvaluator
import matplotlib.pyplot as plt
from pyspark.sql import functions as F
from pyspark.sql.functions import col
import mlflow
import mlflow.spark
import pyspark
import time



In [0]:
# --- Source data ------------------------------------------------------------
# Suffix of the feature table to read. It has its own name so that the
# experiment `version` defined further down does not overwrite it (previously
# `version` was first a str suffix and then rebound to an int in this cell).
feature_version = '2_civ7'
model_name = 'campaign_clustering_turn_100'

source_table = f'sandbox.cg_inverness.{model_name}_data_v2_features_v{feature_version}'

# --- Clustering / experiment settings ---------------------------------------
# NOTE(review): k_max and random_seed_count are not used in the cells visible
# here; presumably they drive a later K-means sweep — confirm.
k_max = 7
random_seed_count = 3

# `version` keeps its final value of 1, exactly as before, for any later cell
# that reads it. It is only used here to build the experiment name.
version = 1
experiment_name = f'/Users/jak.marshall@2k.com/Kmeans_turn_100_PCA_v2_{version}'

print(f'source_table:      {source_table}')
print(f'experiment_name:   {experiment_name}')


source_table:      sandbox.cg_inverness.campaign_clustering_turn_100_data_v2_features_v2_civ7
experiment_name:   /Users/jak.marshall@2k.com/Kmeans_turn_100_PCA_v2_1


In [0]:
# Load the feature table named by `source_table`.
df = spark.table(source_table)

# Bring a five-row sample to the driver as a pandas frame for a quick look;
# as the last expression of the cell it is rendered as the cell output.
preview_rows = df.limit(5)
preview_rows.toPandas()

Unnamed: 0,features,CAMPAIGN_ID
0,"(0.5885875074807524, 0.965700179519515, 0.3144...",414a180b6904bbcf54c54a2bdccbf6ad
1,"(0.0, 0.3219000598398383, 0.6288079734362841, ...",11f0d5e476e68256f43c2621e400978f
2,"(0.8828812612211285, 0.3219000598398383, 0.419...",4ed7e07c53537bde06a7ebcd2c172191
3,"(0.2942937537403762, 0.4828500897597575, 0.628...",45c3dedc39afb9eda415e3643b28a5b5
4,"(0.8828812612211285, 0.4828500897597575, 0.209...",4353676d55133ebc418ae4f9c761e8ad


In [0]:
from pyspark.sql.functions import size

# Total number of rows in the feature table.
num_rows = df.count()

# Width of the feature vector, read from the ML attribute metadata attached
# to the `features` column in the schema rather than from the rows themselves.
features_field = df.select('features').schema["features"]
num_columns = features_field.metadata["ml_attr"]["num_attrs"]

print("Number of rows:", num_rows)
print("Number of columns:", num_columns)

Number of rows: 529282
Number of columns: 56


In [0]:
from pyspark.ml.feature import PCA

# PCA estimator keeping 21 components of the `features` vector column.
pca = PCA(k=21, inputCol="features", outputCol="pca_features")

# Fit on the vector column only; no other column is needed for the fit.
model = pca.fit(df.select('features'))

# Sum of the explained-variance entries of the fitted model, shown as the
# cell output.
sum(model.explainedVariance)

np.float64(0.9033711508011085)

In [0]:
# Point the fitted PCA model at the "output" column, then apply it to the
# full DataFrame so every original column is kept next to the projection.
transformed_features = model.setOutputCol("output").transform(df)

In [0]:
# Render the transformed DataFrame (columns: features, CAMPAIGN_ID, output)
# for visual inspection of the PCA projection.
# NOTE(review): the whole DataFrame is handed to display(); consider passing
# a .limit(...) sample if only a preview is needed.
display(transformed_features)

features,CAMPAIGN_ID,output
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 18, 19, 21, 24, 33, 38, 45, 48, 52), values -> List(0.5885875074807524, 0.965700179519515, 0.31440398671814207, 1.6022970765200626, 1.8200478829342412, 5.5277310393881685, 1.302246221614133, 1.658493113181259, 2.588085580408564, 2.231726823489336, 1.3857259613269053, 1.496000183900265, 1.917831588389217, 0.6543800371290599, 1.2025101442095685, 2.0855550977392654, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",414a180b6904bbcf54c54a2bdccbf6ad,"Map(vectorType -> dense, length -> 21, values -> List(-4.628801639618975, -4.226169662353111, 0.9617293509130748, 1.0021172060995274, -1.7440987466120115, 4.426444778025937, -0.10193919818515326, -0.5097866063389163, -0.24065919286454274, -0.7472672446119425, -0.17374395350017977, 0.9526608133548193, -0.20480703048334034, -0.36575145221982686, -0.504384250358586, 1.1701185391911892, 1.473329371433017, 0.6289210262684284, -0.91956263548686, -0.12922437736192077, -0.2968059921089431))"
"Map(vectorType -> sparse, length -> 56, indices -> List(1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 18, 19, 23, 34, 38, 45, 47, 52), values -> List(0.3219000598398383, 0.6288079734362841, 1.4247025729648832, 0.3493841918132695, 4.606442532823474, 2.9442088488667357, 2.3277096325351003, 2.588085580408564, 2.510692676425503, 2.103334048442624, 2.0262280971813715, 0.6973933048688061, 0.5712187682598089, 0.27697164154964754, 0.3950234422698009, 0.6543800371290599, 1.2025101442095685, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",11f0d5e476e68256f43c2621e400978f,"Map(vectorType -> dense, length -> 21, values -> List(-4.6850543781962415, -4.499950198787214, 1.3142136434376506, 4.1796136571639955E-4, 0.9164517959972958, 2.263663808867489, -0.00857647015483234, -0.9779584472929409, -1.0904317466244617, -0.9395594324673288, -0.43533099570126255, 1.3121144306830506, 0.39612868543078117, 0.046662182733302165, 0.28143963959659224, 2.7857803902410385, 0.6094056257279351, -0.04423153421891168, -1.4080342182037122, -0.10827605722685724, 0.07347216219885994))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 17, 18, 19, 21, 23, 33, 40, 45, 47, 53), values -> List(0.8828812612211285, 0.3219000598398383, 0.41920531562418945, 1.2115891686986682, 0.6215788528770958, 4.606442532823474, 2.887589447926991, 2.9387334110755643, 5.6075187575518886, 2.324715441134725, 4.8747859710964345, 2.5943294328396997, 3.4433794427897304, 0.011900391005412684, 0.9606863850570947, 1.6359500928226498, 0.4509413040785882, 2.0855550977392654, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",4ed7e07c53537bde06a7ebcd2c172191,"Map(vectorType -> dense, length -> 21, values -> List(-6.943745629867604, -7.449101701768911, 0.07254897675457558, -0.2841903030311495, -0.541597293923995, 3.11289198960197, 1.1254066605505528, 1.338425445568376, 0.19826486850746325, -0.48048532170161645, -0.7340604801982427, -0.42261287578374224, -0.01692500534263796, 0.8150538039777552, -0.31499829536943164, 1.4676461193328538, 0.46855224424088304, -1.3098782710935815, -1.7532026751161538, -1.2622789825294014, -0.006764620935425764))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23, 33, 44, 45, 49, 52), values -> List(0.2942937537403762, 0.4828500897597575, 0.6288079734362841, 1.229348619054186, 2.0841173302349683, 5.5277310393881685, 2.661111844168011, 2.269516891721723, 4.744823564082367, 1.952760970553169, 1.039294470995179, 1.7800508517294291, 2.528050730149422, 0.48463683915741534, 0.14280469206495222, 0.27697164154964754, 0.16459310094575036, 0.5764118310342569, 0.6543800371290599, 0.4509413040785882, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",45c3dedc39afb9eda415e3643b28a5b5,"Map(vectorType -> dense, length -> 21, values -> List(-5.164776566411206, -5.2226536737636975, 1.2471520138014798, 1.2723522089365793, 0.950085687660685, 4.546072775902652, 2.0634382927182586, -0.30066501440430293, -0.44972830381089013, -0.7900689629448154, -0.8883303621509843, 0.32786409530109445, 0.08362179090400648, 0.5677840381440834, -0.41832157129219283, 1.1748661405779304, 1.3321394855818427, 0.009481357847199778, -0.7262464236195315, 0.4607213183189663, 0.3229640398781409))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 23, 33, 38, 45, 49, 52), values -> List(0.8828812612211285, 0.4828500897597575, 0.20960265781209472, 0.889945790037621, 1.5844166838043618, 6.449019545952863, 1.981679032891072, 1.6293967427745704, 5.391844959184508, 2.4641983676028087, 1.1382748968042435, 3.5222282810816363, 1.5255478544005134, 0.48463683915741534, 0.09520312804330147, 0.13848582077482377, 0.09875586056745023, 0.19213727701141894, 0.32719001856452995, 0.6012550721047842, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",4353676d55133ebc418ae4f9c761e8ad,"Map(vectorType -> dense, length -> 21, values -> List(-5.152690825407146, -5.537162389281079, 1.1562306060177427, 0.8404383432213287, 1.1752761103387803, 5.34564562897017, 1.2841655668862528, -0.1656589476723135, -0.2377079135133225, 0.3375166309354564, -0.299311125955139, 0.7501491372469806, 0.4895785719973128, 0.9848523978847914, -0.2558909920188006, 2.2874751596913683, 2.038993529971073, 0.2276557422721565, -1.289800271956508, 1.4823910353188179, -0.7394633290928176))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 30, 34, 38, 45, 47, 54), values -> List(0.5885875074807524, 0.4828500897597575, 0.9956126246074499, 0.6472333018455425, 0.7068935973896383, 4.606442532823474, 1.5853432263128575, 2.356806002941789, 2.588085580408564, 2.417704058780114, 1.8806280903722286, 0.8521520034874926, 1.4383736912919127, 0.49981642222733275, 0.5539432830992951, 0.4937793028372511, 0.3842745540228379, 0.6012550721047842, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",11f0e79869b9622563337381c8780a92,"Map(vectorType -> dense, length -> 21, values -> List(-4.4215146043959095, -3.547983856910944, 0.9298970984768493, 0.03300535960670252, 0.9600862302661204, 3.468668547990512, -0.12996123852638297, -0.29074141398698583, -0.8036132203351569, -0.32677940508111514, -0.5600316323134785, 0.8953782885987265, 0.2608622950937252, -0.2715325393784836, 0.5729328416561412, 1.4767561557679139, -0.10829581627100412, -0.5955262698728301, -1.2488412180794641, -0.7448181657800839, -0.14351487723686956))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 19, 23, 34, 38, 45, 47, 52), values -> List(1.765762522442257, 1.2876002393593533, 1.3624172757786157, 1.4029965780859168, 1.7794218141187448, 5.5277310393881685, 1.8684402310115822, 1.3966257795210604, 3.882128370612846, 3.0221300734751426, 1.2372553226133083, 1.836860985295262, 2.26652824082362, 0.523617204238158, 1.3848582077482376, 0.9217213652962021, 0.3842745540228379, 1.2025101442095685, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",11f0e77b6297b5c0edeae881c878d69b,"Map(vectorType -> dense, length -> 21, values -> List(-6.02453149798959, -3.629973920002903, 1.2509881088248493, 1.0718496958604045, 1.1567444474537205, 4.786190228147353, 0.6110729678554542, 0.18340727941592677, -1.2363106507090225, 0.07992710610939291, -1.0914259062953722, 1.3519351748589434, 0.02689702611056108, -0.9651962419674038, -0.34923403057878144, 2.251329035816517, 0.26764144222828623, 0.24689729599580257, -1.0010361594672141, 0.43348573073824953, -0.15298795133179002))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 25, 33, 38, 45, 49), values -> List(0.2942937537403762, 0.6438001196796767, 0.41920531562418945, 2.717195904394244, 4.580589258947225, 5.5277310393881685, 3.7368804620231644, 4.480841042630068, 4.529149765714987, 2.045749588198558, 4.528354480764708, 4.147139750305798, 4.53305648164724, 0.9692736783148307, 0.2975097751353171, 0.13848582077482377, 0.2962675817023507, 0.19213727701141894, 0.32719001856452995, 0.7515688401309804, 1.0, 1.0, 1.0, 1.0, 1.0))",134b28d6b9cc71b33703a9e1a758589f,"Map(vectorType -> dense, length -> 21, values -> List(-7.82736195570825, -8.524041995047837, 3.6439822762990044, 2.742371938417112, 0.5548275427886195, 3.666925573406779, 1.062556975506721, -0.0729502373133116, -1.0587816722757968, -0.4832147328864987, -0.19685842223056893, -1.7352926951604577, -1.2127998626450507, 0.49411733705738253, -0.03336580696761124, 0.8307684199892402, 0.8225179995056353, -0.4208066818403899, -1.0102867908639943, -0.32271717216927776, -1.1528592750096551))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 21, 24, 34, 39, 45, 47, 52), values -> List(0.5885875074807524, 1.2876002393593533, 1.2576159468725683, 4.579964919461903, 1.1375299268339008, 3.685154026258779, 2.604492443228266, 1.1638548162675502, 3.0194331771433243, 1.7202894264396966, 3.786001287196723, 1.041519115373602, 1.13326412041181, 0.9692736783148307, 1.2376406645629192, 0.27697164154964754, 1.8434427305924042, 1.1528236620685137, 1.3087600742581198, 1.5031376802619607, 2.0855550977392654, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",11f0e601cf90d416f8db144847904085,"Map(vectorType -> dense, length -> 21, values -> List(-6.939987532929945, -3.2837974457711487, 2.2063267595911156, 1.8941302805366285, -2.0905517681829697, 1.3340335973502973, -0.13453531133377056, 0.013347649849066962, 0.020607627902382908, -1.7940002820702925, -1.5084672725725214, 0.9558997368367175, 1.589728850896059, 0.04205965423094371, 0.17050476173352974, 2.13282554785072, 0.08576582344531629, -0.39918745392191823, -1.48966213042521, -1.1789253017951238, 0.3718778725109355))"
"Map(vectorType -> sparse, length -> 56, indices -> List(0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 14, 15, 16, 17, 18, 19, 23, 33, 42, 45, 50, 53), values -> List(0.5885875074807524, 0.6438001196796767, 0.6812086378893079, 0.734057281361408, 0.7515822730866845, 4.606442532823474, 1.5853432263128575, 1.4839148907411266, 3.882128370612846, 3.0221300734751426, 2.276549793608487, 1.5906837398433196, 2.397289485486521, 0.22610742910284098, 0.5539432830992951, 0.23043034132405052, 0.7685491080456758, 0.9815700556935898, 0.6012550721047842, 1.0, 1.0, 1.0, 1.0, 1.0, 1.0))",4f7fc945a4933c4aadb42c617c022d97,"Map(vectorType -> dense, length -> 21, values -> List(-5.220593701779702, -4.656359301904154, 0.1903908812909184, 0.20120324535628092, 1.136076897849775, 3.5732039638622695, 0.7534844699960789, 0.5072313157877327, -0.12292179581427638, -0.29512795208047854, -0.32184494937930386, 1.1239280729119059, 0.28027737267426533, -0.09614884933308587, -0.255703489670569, 0.31785667831182, 0.9652762733756651, -0.6852154728614154, -1.1590407092394657, -0.29058138309321274, -0.12623131958663614))"


In [0]:
# Name of the table that receives the PCA-projected features.
model_name = 'campaign_clustering_turn_100'
versionpca = '2_civ7'

target_table = f'sandbox.cg_inverness.{model_name}_data_v2_features_v2_PCA_v{versionpca}'

print(f'target_table: {target_table}')

# Keep only the projected vector and the campaign key.
pca_output = transformed_features.select('output', 'CAMPAIGN_ID')

# Delta write that replaces both the data and the schema of any existing table.
delta_writer = (
    pca_output.write
    .format("delta")
    .mode("overwrite")
    .option("overwriteSchema", "true")
)
delta_writer.saveAsTable(target_table)

target_table: sandbox.cg_inverness.campaign_clustering_turn_100_data_v2_features_v2_PCA_v2_civ7
