# Creating the user groups
This notebook is a helper that creates the user subgroups and saves them in the groups folder.

In [1]:
import ast
import os

import pandas as pd

## Loading the data

In [7]:
DATA_FOLDER = 'data/'
TOP_FOLDER = DATA_FOLDER + 'lastfm-dataset-360k/'
TIMELINE_FOLDER = DATA_FOLDER + 'lastfm-dataset-1k/'

# Using just the artists.
# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in pandas 2.0.
# `on_bad_lines='warn'` is its replacement: malformed rows are skipped and
# reported with a warning instead of aborting the load.
# The two .tsv files have no header row, so columns are named in a later cell.
top_user = pd.read_csv(TOP_FOLDER + 'usersha1-profile.tsv', sep='\t', on_bad_lines='warn', header=None)
top_data = pd.read_csv(TOP_FOLDER + 'usersha1-artmbid-artname-plays.tsv', sep='\t', on_bad_lines='warn', header=None)

# This CSV ships with a header row (header=0).
spotify_data = pd.read_csv(DATA_FOLDER + 'full_spotify_info.csv', on_bad_lines='warn', header=0)

In [8]:
# The raw files have no header, so pandas numbered the columns 0..n-1;
# map each position to a readable name.
top_user.rename(
    columns=dict(enumerate(['ID', 'Gender', 'Age', 'Country', 'Registered'])),
    inplace=True,
)
top_data.rename(
    columns=dict(enumerate(['ID', 'Artist_ID', 'Artist', 'Plays'])),
    inplace=True,
)

In [9]:
# Attach each user's profile to every (user, artist, plays) row; inner join on the user ID.
top_merged = pd.merge(top_user, top_data, on='ID')

In [10]:
# Quick look at the first rows of the merged table.
top_merged.head(n=5)

Unnamed: 0,ID,Gender,Age,Country,Registered,Artist_ID,Artist,Plays
0,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007",3bd73256-3905-4f3a-97e2-8b341527f805,betty blowtorch,2137
1,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007",f2fb0ff0-5679-42ec-a55c-15109ce6e320,die Ärzte,1099
2,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007",b3ae82c2-e60b-4551-a76d-6620f1b456aa,melissa etheridge,897
3,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007",3d6bbeb7-f90e-4d10-b440-e153c0d10b53,elvenking,717
4,00000c289a1829a808ac09c00daf10bc3c4e223b,f,22.0,Germany,"Feb 1, 2007",bbd2ffd7-17f4-4506-8572-c1ea58c3f9a8,juliette & the licks,706


**Function that finds the groups**

In [13]:
def get_user_groups(df, shared_artists_threshold=5, verbose=True):
    """Greedily build groups of users who all listen to a common set of artists.

    Every user in ``df`` seeds one group. The group is then grown one user at a
    time with ``maximize_set`` until no remaining user shares strictly more than
    ``shared_artists_threshold`` artists with the whole group. Groups that end
    up with the same users and the same shared artists are kept only once.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``'ID'`` (user) and ``'Artist'``.
    shared_artists_threshold : int, default 5
        A user may join a group only if the group would still share strictly
        more than this many artists afterwards.
    verbose : bool, default True
        Print the user IDs of each new group as soon as it is found, so that
        partial results stay visible if the (slow) search is interrupted.

    Returns
    -------
    list of (list, list)
        One ``(sorted user IDs, sorted shared artists)`` tuple per distinct
        group, in discovery order.
    """
    # One row per user; the 'Artist' cell holds the set of that user's artists.
    user_artists = df[['ID', 'Artist']].groupby('ID').agg(set)

    all_max_groups = []   # distinct groups, in discovery order
    seen_groups = set()   # hashable fingerprints of the groups already kept

    for user1_ID, seed_artists in user_artists['Artist'].items():
        # Start with the user alone and try to add other users to the group.
        max_group = ([user1_ID], seed_artists)
        flag = True
        while flag:
            max_group, flag = maximize_set(max_group, user_artists, shared_artists_threshold)

        # Sorting gives a canonical form, so the same group reached from
        # different seed users is recognised as a duplicate.
        users = sorted(max_group[0])
        artists = sorted(max_group[1])
        fingerprint = (tuple(users), tuple(artists))
        if fingerprint in seen_groups:
            continue
        seen_groups.add(fingerprint)

        if verbose:
            print(users)  # We print the group to then create the files
            print("")
        all_max_groups.append((users, artists))

    return all_max_groups


def maximize_set(max_group, user_artists, shared_artists_threshold=5):
    """Add to the group the single user who preserves the most shared artists.

    Parameters
    ----------
    max_group : tuple (list, set)
        Current group as ``(user IDs, artists shared by all of them)``. The
        list of user IDs is extended in place when a user is added.
    user_artists : pandas.DataFrame
        Indexed by user ID, with an ``'Artist'`` column holding each user's
        set of artists.
    shared_artists_threshold : int, default 5
        A candidate is accepted only if the intersection with the group has
        strictly more than this many artists.

    Returns
    -------
    tuple
        ``((user IDs, shared artists), True)`` when a user was added,
        otherwise ``((user IDs, shared artists), False)`` with the group
        unchanged.
    """
    members, shared_artists = max_group
    member_ids = set(members)  # constant-time "already in the group?" check

    best_common = set()
    best_user = None
    # Iterate over the column directly instead of using iterrows(), which
    # builds a Series object for every row on every pass.
    for user2_ID, user2_artists in user_artists['Artist'].items():
        if user2_ID in member_ids:
            continue
        common = shared_artists.intersection(user2_artists)
        # Keep the largest intersection seen so far (first one wins on ties),
        # provided it is above the threshold.
        if len(common) > len(best_common) and len(common) > shared_artists_threshold:
            best_common = common
            best_user = user2_ID

    if not best_common:
        return (members, shared_artists), False

    members.append(best_user)
    return (members, best_common), True

In [14]:
# This run is interrupted by hand: it takes too much time and we don't need that many supergroups.
get_user_groups(top_merged.loc[top_merged['Country'].eq('Germany')])

['00000c289a1829a808ac09c00daf10bc3c4e223b', '55465b640e972d1131a14caede9c33f9148b5510', '73294504a5d844e9d7797914836a4de52f8fb098']

['00004d2ac9316e22dc007ab2243d6fcb239e707d', '206182320eaead0f384d07a26e021ee9d567d825']

['0002fa6e25794597126fa44529105cedb31a75aa', '27bb56c0e15fcd47f0ed0d4141a4c88fff42d382', '2f81b5901185b9ba55716255624659888a1b823a', '3107fa2479a9ec7f9a63ff59ceb80953a2b92932', '33f5094729686c4ca66d6b3f5b0fc7118c02f777', '4d1bcf7a7b0c67996f17a19bb7e515887f18ee9a', '5883a88e382cef004ba060ae6c3faf8c5b055510', '5fddd8398e464e447ec455cfe57ebe5f76bfe8e7', '9bf0ae09f012e39006996497196a3383a1b917d6', 'a9b595f3198ff67bf80be61aafb6ccc8284a4a45', 'b147842dce80fdad99eb165abf412ab9b22d2cf2', 'b1b4295177f564cc9e0ebcb69b073ed00b7ee1a6', 'b535952cbc9fb28c1b5db3289d4ec0bd9a66f3fc', 'b57dffdf8c26053bee968b8916c59a440f2c30da', 'c32de9222953bfd05c35e527da6be3a94e9c9cf9', 'd7fb0085f01d06937cfdf15277b96600229284c0']

['0004ce56b92c99a52fdcb7a20a5f71223e543a20', '361ed2023727f3e562a0ef38

KeyboardInterrupt: 

In [62]:
# This run is interrupted by hand because it takes too much time.
get_user_groups(top_merged.loc[top_merged['Country'].eq('United Kingdom')])

['0000c176103e538d5c9828e695fed4f7ae42dd01', '01b573f278688a642120764c503edc7b77173a1d', '119e0150c73b18247eccb4b9d103cabdf688ba08', '1561b5b3f63166e712e46d4ae3670ad9e76f203f', '1c19f4361b7f5d68db5a72450b2dec5c6ebbfbf2', '1c6f9fbbe07b2e01911af4aab6dd8183c77b27cd', '4b364898a43a75987dc771587c305ccc1530bdea', '4c365e5ab40f5bd7f2c45d0b5592e694d2fcdcfc', '582527a0d5e7b791d57ccaf1f2f2ef4a324eb4a8', '5ad63432c9a01b274c23ffbeb36b137549c67398', '60f0079ffc2b60955351c5d0ad0e5e37388449a3', '6bd1933a3581a66da1683365c69df10a515fe2bc', '77dec888169db9ba5ebc360df1c63ea8123ab419', '8c14070c590c784806e539204ce1ce9f9a6b62e5', 'bc389799489ee315495baf7eaf84100f693e4bae', 'f540eb6c7aaf56a71953e0f6d16b5a301d386697']
['00035a0368fd249d286f683e816fbdc97cbfa7d9', '01c359055370f3c72f7e5fa7e54d13fe27e9d6e9', '0574b898b57f9da19cb18c9433b531cdb8666b57', '097e571d04ec68795d3f69d3356c32c2ae9b028e', '0a4e7a2e5ad4cfe53e3c32c7287e721108c957b3', '0d2c3a8d15b7a9951a20bd94317b11e9dc20fa7a', '0edfd0b3630ef4eed597a78db2f19

['003148d4d476682966ecca8cc679e4cf5752142e', '18355344d956442e13c071d72a933dc5cd99a0df', '62349b43e155371b45ff8fbd06efa32f275974b7', '7c1f735c6b20b3efb96f252419e0ea5adb7b4137', '83c629980f18e60412ebad2538768561ee69d305', 'a779f1e7a283fc67245b67aa9f67f5eb0b05da61', 'bb19448a19d7053e13ba99948f60bd8425c6f14e', 'd67cc0fd7ef6ab1300d03d89743d5e4880212afa', 'd9e0f8d44dc80a755b04c4bb76a65d65fb875527']
['0035a444da457a04de467c2eb5b2a3313e963160', '7f2d4d13498eb1faebb19471df3e0e93000618c9', 'a322a8d189fc975e271ce07accaf32a0bb17f9bd', 'da8a1bb5507e3890bd4ea52add5c8d317c81fa11']
['00371cc2a235a9cad6a2367f6c98577e5a7b5324', '08829d0f1f8303082edce74fd40c773606be1a8c', '28fc81583b2e61492df889d3c84f5c6a14f388db', '4c85e1fc148944a36185e41903135485eebf6378', '5d56dfd1c09e44b9fdfcbf8ae5b0df3299a288c7', '6b82159d1719d15f1ad5b91ca2ae5153ea369189', '6db1ab40da4281fdf30710c990a0d8f0f3482057', '8b7a7f749dc993c12c61f39c117d7e1c0b3d877c', '990a7f25ca40dd66cbb48d5600ad944668fd3c14', '9acd8a9f430bdf6f9c5af5377de1

KeyboardInterrupt: 

In [191]:
# Group search restricted to users whose country is Poland.
get_user_groups(top_merged.loc[top_merged['Country'].eq('Poland')])

['0000ef373bbd0d89ce796abae961f2705e8c1faf', '154344dddd65386ce1c5474a56662ce78916314b', '28fa96a4ab0a8a18650f446a3588025a0f99aa56']
['001273e79cefff7989586ac0dbac1eb49cc57cb6', '06b9128b2fe59f1523364740c43da5a0b456c213', '150bed600366391205e8341bdbb118581a87c227', '34c2564cb70d08ad10f244491ddda2d2a5ada84c', '6f5acad3295438c2f5b82ea5409e47e9551dfcca']
['00151290a1cee1dbc1d5dd2b0dbf3efd032a0705', '2f768ec343092f399d334f631c29685fcdc3c793']
['001e3099a5f878f74805ffd1f76a51c92192b5ae', '131d85e6c81fd7360f9131be63bbb8f1de2eb719', '150bed600366391205e8341bdbb118581a87c227', '1a3db1e4e80aedbbc383767639c6e4fadcc3575a', '1d36988874d7fa41a399e5625629ea8da08faaf2', '293ccdde77dfcf584d22eead50f51214c02299f0', '3f516388bc554613b75db90891ba9496f6f8672d', '5fa48f81af3e8a3f62651136c847bda78cd3d6ac', '7881d66a528dc27068af1f5dfcae0709c511c5b2', 'e05f00a4a8b3eed150d3b1f9429acc20f7c6c098', 'e9c5de5cff6b65783727200ed3f9f8a4e858227f', 'f23ee5a10895f6e756874aaadf41fa6ce2bb16e6', 'f6146d91e3de13ed64241604ff5

['0034b9afca46d9757a253de06e0bed59b3090fcc', '0096458ad541f5d541ae777ff1769940f605d0da', '2fd495904e449c3b8520c3a3711b770c93c2d295', '9560ab2ca4484fb2a0ac57fab9f8adcf9789ee82', 'bcbb2627dce7b133cd3ddc320867943b0e4c4b8f']
['003917e3f72fcb6e9ee172e7099902a623aea050', '079bb9402e53c3d1abc1f20260af3ac06f61109e', '35bf1debde98116474ced587ceb674bdb6e0fd9e', '3a5389fabcbc6ddd49e3d14fbbe7d065d34c71a1', '75956c87eb680a4b3f38d5db70aaec570f559a10', '83409f45f9e0e8387e04fb88839f7a90a7b91224', 'eccc69f6085a1184159047a0cf9f54c4cd23714f']
['003c2e48097a6a790360973e4070f377c5657e10', '1a8c81cc88751fcad398fe72455a0d897bb7437f', '3d1d2348734262a2d74a99c4f48d67f3bcb6b6a4', 'b44fd791688ba6ebd3d0aaf7c6595383c66bbfbf']
['003ca6ac4684fa950d0cd6610688a555b1f361c4', '9352b0b38ff95a9d2a73e701978ba8a9abd65578', 'b498353324ee10c8c555c2181b8875dc7a2e9cf0']
['003ea38ba04de2e0bbc92377d3a36c2cf567a23a', '0851b9326c7f3d6aef359ded73e1dc92f821ace1', '0a071229b4557df4b88a5630bc2b77fd32ff7a2a', '3ffb58c94dab1735496ad4cbe0

KeyboardInterrupt: 

In [201]:
# Group search restricted to users whose country is the Russian Federation.
get_user_groups(top_merged.loc[top_merged['Country'].eq('Russian Federation')])

['00033e1695a2eb4e5fe2322260cf9f112072342f', '4becf2c5d3b6708e9d44223b09b9a1d8d4f9e0fb']
['0008bc10d01b73000360d4781cbf8984b169563c', '4537157b592365d11fe44bb3cd4eda5918af2d9e', '5c0cae293e603fcb977bf2ed41727b99d572ff07', '86006dbdef89878e0dfc075eecd6549d71f6312c', '9130674fc5692d46f070475efa367e33f5502fc3', 'b7d2e5a57ef9c32796115a02fb22fa95b57529be']
['0009839f678c6d9b679b7537daeb02bdad31005d']
['000af18e0a8c33fe3ad809c3d10cfbae84d1bdd9', '1f4f3f9c005fc2e1de3cc0b35da421b52a8b2eda', '43ca5896660d023c7f7f5637205804025f231718', '90845602ff48b3b03f57acdfd2638266e97379b7']
['000fa11f810cfe0fb5972f9ebe38246cff34f3a9', '6fa85806fe4e99dd8592850517b2865731af6b86']
['000fff7e107172b2fdee439636daccf8287b34a7', 'b897cadcdab6be6a8042ab3ea7ce8c42c806c13b', 'ebba39ef651ee0ee8d81b8a126895c12dfe599a5']
['00108be8919f5f8e1249af2207579f9861c1e1d6', '368a98380718a5b484a0c7037b1521790bd02764', '861a875549c651d8a3baa88b972177a3248a0962']
['00135b381367024e7efdd4ee13f981f9ebf31a65', '7aa68e8311ab785a81e5352

['00668bed7d7c4fb5e5e280acd6165bae3b510222', '0197669ff76dd9ce4491d8010f9dff01ffc8decb', '077ab2e02e6a4584fde1afb79493a4fbd054f977', '07d2cc8eaa68eb532990885f32050bd186b06961', '0fb9cf409ca244ec8a3253d9b7aebe5271a98dd3', '1c1dd2aee37f967dbaca30ff1a37f9c02042c0a9', '2d86103a3df100fd08d0245bd15be47b84ef1327', '3b2cd8a4e235ba4eb268e11bf4d967587aa19943', '3d1aa9035437af81398017ee6fe3fb0106f3602f', '40ff3dc52daa915e6ac4b0d6ac4a7ee4151d7f36', '5630737a5c1fd5a1adb5d80f15b46809ff774ba4', '66b67f4b676fc113a0dceca90e26ad4582ea69b3', '9b17edcb6827e3392f6e1f8c822dae13efad8bd0', 'ae202c777c79a3147fa8c8d2ba46c1e122189c74', 'b048e8ba55594c9c72f3e5e0d805a40c4debc579', 'c408198d67c9e36b65435c63eff878948ee7e8cc', 'c74946a8a3e7b61c20cdd3075d2b5ff8bc35af2c', 'cbf927aa3ea77020057b98b33ae75682286f4849', 'd1d6a5626d2596888aff58eced8ff2b748f7966c', 'd6e94152b3a2c3a7da559ea5e3d0889b2f91c4ac', 'e7b67128c1be904160481fddbee6420a6ea3356d']
['0068fb636eea5339fce042713c5871f5697b84a7', '2dc294d2f6fb1d2511b705f8ea6ec

['00c488fb55458cc4e135559a44f7f856dde2842b']
['00c4a5af2cf0e3ae8814e9010b4ad74dafc51c08', '09e109cb9a5de04f9230a531926124d4b7e95c0c', '2f157f3bde8c3dc97e95ef280101c775fe661d96', '31d4668771e02081774ba2597e74df6609b5fc86', '6d21b891e260edc4ddb97af38c75d76b32bf2e53', '8c416fecb50c068716c98285aa928c993f6d52ee']
['00c9a104d247d4ad4791e466e27007bfd75d5fc4', '2af0cf06176195116a324148f1dcbab430487d94', '2e1a3899d577ae7199fe13821a2b4f8e3b670c40', '33050efbf75ebd36f401e458cc5f41920d973028', '39c793a1e4e980dbcdf8c599f460bf2438ca3534', '56e3a167375ad682f1e08b778e4742af2bba01e2', 'ba551015ad8c3076421e02052dca26580f922271', 'c8d55d320d3294cddd135834635e64930586f5ed', 'e8298437391709c4fd3fbe27bd649993205c3e7b']
['00ca2c74138555821c4c2fd6e875b650b58113b9', '3621593dabb3cede2c8452957e52727210706e7b', '47dcba48f8cfd74d043f70b6ee5bbb6dc91e8284', '69203ea357ffa4bf69de106cbf0f5d78ac07cf7f']
['00cb2efeaa80437d9fd708b633b650d7c877f7b5', '0288a992a06d822a2b737f91c729ee81492789c1', '0a1b17935d40e4724647660ae0

KeyboardInterrupt: 

Here are the groups we decided to keep. We keep 3 groups per country and try to maximize the number of users in each group.

In [16]:
german_groups = {
                'group0':['000cb6427411006fe9a6193d3c4f59efed53fbef', '010619d06b9514243302b80e787c05c88747954a', '0721768adaacc26bda0c9e39de72572d6fd1af82', '1f2a9d5809d80a5d964d2d8dc686a8660481161a', '2225f430a166bad58b922542ffc78a1b9997d06b', '23261aa26dbcda5befef74cbc9536b6021211a1c', '2c9c748a768cb1fb5807fa4efd4b67a46f8d220f', '3ee5b1d15bdb33f67e2f3d7ddabba9945e4655a0', '48f411a532576f286b8134679906d82dc6e60de2', '58c5a95acad0b1afe8e8660020cbd3ef4adc845f', '5cb1f952ab773b563c057fe3281f0593a16f0690', '60a515b1288878bd1a4d87fa55a98f549816b0bb', '81d8115444360f35b2daa3a857414f9af087e87c', '8ec402896e81dc8f7492ea9db3be987f0db9bd02', '8f397428c2c883a67e4562347ad77375d6b9e165', '9b06094b0484da4f61647fc29fa02612247215a6', 'a027e37a7d9eafa9cbf4fe146627209d8895d53a', 'a7a6297d98246e911d5a13f8b42d42df9ad1af98', 'b70af018e3925646b2115301dcc5e6efbe6f7eac', 'bcf2c4d43eca6d9776f8f8dbc2990b4d4c36632f', 'bec630f088128e1f5d3bdd2771404a1af088ad97', 'caff5452752e76e8295e772c85021c5d2b209010', 'cbb96c03c3a116de6dd0c96f091794f63acda167', 'd08c610fd0dd425f432ea8cb49ede5abd8bf8722', 'd6f593e3077ddfd32555b3c44ec2758a4b5decbe', 'f2fc94213049c87046d06f8ec7440c9d5f1e65c7'],
                'group1':['0025719f0cfae17ad6498cdf7cb8dfa71f34169c', '0272e0eb39ee745c740c31003701c90481f1faab', '04ae799addf59daf5bdc25ffd9238f11d1c1ecb8', '04bc3e12e55952fb503520563dcab060f9fd6ff8', '10466bef5bcff9736920c28579847d9548faaa88', '1255ae6687729904665ece08a51057cfa48a7055', '143fc974cff01e7f0c9abcd5ce8a135e301f4316', '1445cfa9ac8d3c76ff75948d2918fe73216a3bae', '174991d5e1ae47b71e9cac4b1da35564eb117d91', '187aecd1e9fbf8aaa5d424fb9ea5973d8c14bd62', '1cbbbe339db5089bb3c5164f8c906dd751f6d610', '1dbf08998a6a74c03b3b7e3fb0bdf057d2ff5a9c', '1f0e0c02ad0f61e9620e3e8165138732ae25e0d4', '285fbe3a8461e7ac69d7a31800c0984e3b2cfe2a', '292f258a4ee1f572713975c325d3f1133f8b8458', '2b6878078de19cb15a572e332fb74952dd5e6ebc', '2fdb4c56afd90f1ada7ab2d3ff5e497993d74de5', '332b89d095e9120e2fcc164030eded51c8d08ff6', '359c0c54fe5676925468f6cb1cc4b9f24918b1cc', '3744481a361577016eff82de1858fa65622ef597', '378bdfeb1bef7787d4783d58d9dba2f9b4dfdfaa', '3c464a8d1528717958786975bab1b06755d1e816', '43d394f00b461991728fb5f0f336dd6ccf9dc39c', '44a06f66c01ebadccd8afbedf149de8eba6f37c0', '44ffc8e3eaf0bba4e95a1bd2c632037b5a37f416', '4a85df34d891de61d71b9b59dd969021ff14e6d0', '4fc4b6ee2a4e21c70f144788bce8790c4c94b8f3', '542151a903ee5908a10c12055371d161185ad544', '55b884001ccf2aed39889366218dc2d7f41235f6', '5a04687647f96915c9797719466c314b8d82de2d', '5aab1fc2ad9066a7c00341163cc7b4dae4f1859a', '5d5e607496c8a68e342ccd9a2151977e5da4145a', '60779a8930749968e83e137cd8cfab12c9f130bc', '665e851e7e28c23bd4060b17be46a9e85a4c5a17', '691c172c89793f9e0eed8f69bc72ad7ab5fdd2d7', '6d894d55da4b06f4dd7ce684f5648cd997338b40', '77a25f5dc3db74a922decec68abb36fe44b799bc', '79a980fc3798687b66fd8e5976de615e0b53e5e7', '7d397de9d3ed4bf29e94ad2ca1bf325a425c55a5', '7edc5b4ace167b34359e5ba30082db417abc08c5', '84589f0f86d0e171979a985aa0c1f5756628bd10', '8482751a10b6822370b02a0c942d720e1adbe961', '86c1bfcc3b6603b27922833ac5738b3793fccb19', '88af5762a22f75cf6fbcd93e6acd535f025d8ac3', 
'95eb5c62ca73443d959c5699dfb41bde8b0c72c8', '9d69ae51a7be855e4049f2904dc0c571c2d99a1f', 'a224171c1a0e7aa83a5f9593019b4e6d2b982043', 'aa4e2e7dddf35215e9b8ede9ce59d26c7d4d2e7d', 'aad38ab6a09202d04c13af26acdc8be02178c57f', 'aae7bdf3d22fce35d05416ecaeee48367c92dd18', 'b337e162577aaaaed22dc7c6090db9a3490bfe70', 'b7642ca84c53d6f76cb23204b8a0b2701aacb2aa', 'b7b1f9264345be1976d5c9778fff96c7a98246f0', 'b8885b1e8af96b492322d86422eead68ddce9097', 'b8f3f62aa48d8199955baf6b09844b49be9fbd3f', 'b9f7d872c79730aafbfe3cc6d250ab996db805bd', 'bccb683a77376b3034455f8349afd6ac68927089', 'bdfd18d99f13199588fd6849b0ff0fc58f624c12', 'be092b27b834f4b9d2a7e281bd5aea5cf96be1ec', 'be39a2cdd82209ec341d8b81e5f81977f08f9435', 'be7a182c5fa6746489b6d83161a230cbbb222e39', 'ca5fca37985cecb3308f537a29b9235a6d6be202', 'cc21eaebb7c1b1192feca3aa85f82e045dab63fb', 'dab425a36680e8f40fa844e788ff1ab9f0ce9193', 'db198df1d5fd923f2363b2ecb5dec1ef9780806e', 'dc6fa188f4ab7a21e9bd3ca5288cb8fb45ecbd86', 'dd308ed62cd53d44e1e7b646bfb946d1f76bd0a7', 'dd715310465d2b3fdb7bb45be088fe52b8416827', 'e0a6c638fb2456bee94f8117b481e9853e2f84ef', 'e1d38b074d1ee50d7fea07c8ab07e20a274d8e88', 'e6c3460350a941a510e0f7529b14ec3cc49f0fdf', 'e6f6ae6c6693ac22fc487e71ae3548f564d41b39', 'e8691cdc1db0b7ea13a3206904b1e4e832e71584', 'e8a894b081d44410f25a75b23b2c94fb510b8522', 'f357b50c29565c0d826e5f8a5b948f96b2b9959c', 'f3f75f71b255a6fc2a88e14ad168e4a3e1f36caa', 'f65b01703aa4e3a6581da6789a22214be4b57df9', 'fa5244dc9bc9ab9529396ad7c345f6df09cc7ed8', 'fac6e0a20a04b1504bae818515d58e79584ee33f', 'ff2b5d534f44c6000b08b791985d2c4f96114d50'
                ],
                'group2':['0027bdefcde28fe58e126906c08c58d843b77c69', '0ce986bcbc983c6ae73e317157fe76d9cacb96e0', '17304e89848cf3dac6017d3184a6b05f8f4ceda8', '182786420504460b94fa39e19b1e09ca19bcf93f', '18e3887f682621509786f8f6646f3cdf4e932465', '1a35322b5fb1e1026c626596294084f2d984579c', '1ebfd10c0c75b40ff35cf9c3d4c5128cf3afd87e', '3a792a7f2dc74d701cc706315790dd5dc882e9b7', '3c0e2aff637ecd919560c5f26b203902f20adfe0', '3f1a7a36d1d6bc8d7a45a156511d508c23ecb2da', '40c5ef92ef99c53e1c2841d88925b9391fe18e49', '416323791c46c7eee51325b1850bd5a1850963d0', '464725eb4eec9b3adfe65a332586c1d7689fcf7c', '517915056877c8fbe2864ee381735dad35756c7d', '5d5e46848071c01eb419988fa56bbcd3afdb17bc', '623843c45e7bc0af486a5b3cc05c5345a9d7bc77', '6d0550dd4e44e37418ef08e1b26b330645063aba', '7ae9603574caba81283abc3bcf004906c92e111d', '81cfe2a1a28b0cce06a7da464a333ae68c948851', '97388598646291e7d3be16bc10a087cd7f17169c', '9c84ea627978c968c84e978b8b02b61ffa637c88', 'a4293ba8532be409c02b55bbbd8b8961c790d343', 'aaa944ba91b4652485463fab3d3dad90a74b6904', 'b189cddbc216012eda5e5c426a683ea3e29b5040', 'b2132ddfb2d5aa53acd401afea6f25c2d0830a8c', 'bd6e3331998bdb7f324c8b33c2a15cb4c0b07af9', 'c281f3ac96eb197b6cead6e1565d08d5c46b9faf', 'c83fafe1e59f989d2b1119531a83663fcf03d071', 'c94a94ecc06a09853e5f4a1452dd0d577204b95e', 'd7ee0d8ecf834f54ac020848c83bfe4e6afcfb50', 'd9f8e8522c6e5d8e15c5fa3fca00e82f8668ac0b', 'df4c4bf7ffd2c33aecd0e80d77ef6407f8436128', 'f9d58154a1ec8ba952d8fe938d90e1334e6bed5c', 'ff6fbb1db9615f8c7011920313d6beb8f585ce58'
                          ]}

In [17]:
english_groups = {'group0': ['0000c176103e538d5c9828e695fed4f7ae42dd01', '01b573f278688a642120764c503edc7b77173a1d', '119e0150c73b18247eccb4b9d103cabdf688ba08', '1561b5b3f63166e712e46d4ae3670ad9e76f203f', '1c19f4361b7f5d68db5a72450b2dec5c6ebbfbf2', '1c6f9fbbe07b2e01911af4aab6dd8183c77b27cd', '4b364898a43a75987dc771587c305ccc1530bdea', '4c365e5ab40f5bd7f2c45d0b5592e694d2fcdcfc', '582527a0d5e7b791d57ccaf1f2f2ef4a324eb4a8', '5ad63432c9a01b274c23ffbeb36b137549c67398', '60f0079ffc2b60955351c5d0ad0e5e37388449a3', '6bd1933a3581a66da1683365c69df10a515fe2bc', '77dec888169db9ba5ebc360df1c63ea8123ab419', '8c14070c590c784806e539204ce1ce9f9a6b62e5', 'bc389799489ee315495baf7eaf84100f693e4bae', 'f540eb6c7aaf56a71953e0f6d16b5a301d386697'],
                 'group1':['00035a0368fd249d286f683e816fbdc97cbfa7d9', '01c359055370f3c72f7e5fa7e54d13fe27e9d6e9', '0574b898b57f9da19cb18c9433b531cdb8666b57', '097e571d04ec68795d3f69d3356c32c2ae9b028e', '0a4e7a2e5ad4cfe53e3c32c7287e721108c957b3', '0d2c3a8d15b7a9951a20bd94317b11e9dc20fa7a', '0edfd0b3630ef4eed597a78db2f1984cadd75f60', '233fff208cc06bb601c42c88cb36d67c78117470', '32b350dce99b040059c919bfc81759c1ca1709d5', '34d9cd40009f572bf6168adf0b4ca1d8c26b49fc', '41cd2a2c06a61abdadef690c93f6bbdf423f032d', '4799671b78b08f8f5c7f14aba87140ba999f78dc', '4c7cf6a427a365a97d0d92715da0aeccafed1ee4', '4cd5a92633b90b45665e580fd0bb252fd984c0ab', '4fa437c0b11b64f2c384022b850a6329828cadfd', '56b28f24046146932b512ea3caad56530c6b31b9', '5863ec24407aa8114cbd6802add800e209b208a0', '6c3ada69128880fd65bd8eb5944c7d9e0859e276', '6fcee3958df8c4c3cd54c6cf6bb63bfcf13acfb0', '7010692ee31be544c80908a071a3a4362849a8ae', '96b06619b5ce0817749e36a195266417e7887d56', '9ac1bc781f8c7bea923d936e475934b899de4567', '9c3b5fe60670b6f0f5e98d0a0220f3f22cd8d961', 'a10ce391787f4de723668c478dda291a5a35e08b', 'a413973fdbc5e3feb6b28af7ec7cac4e5c24d99b', 'a47cc3e8c88fdd05c529ef0282e35e7c739e94af', 'a64e95d6a3972686fea494c846f7c0b490955ac2', 'b1871534b22e2fd0f3025e97e8095c9f87ba749c', 'b7344bc519f2edfafb6e7eaf9d2f6145d48800a1', 'c04ee1ec3760bc15a2f24afbfb04b0fac9f1c1bc', 'c8387e0ed74e8eca3b03f4f484fc342000307cb7', 'd288fdf336b537479b69bef4edeb7108c2a18ead', 'd6ba3adc2d6b81ddbaa605d249eb431c6c909f6d', 'd6df7754c8443c46fd631170c9d315fca42af3d6', 'e446f8458afc831f127230b54b9f63a60c4237fb', 'f07629b8c57736882f6920a9a0ef5690a33734b4', 'f2eb7c0078bb8814a66974a7bfb0e6774507478e', 'faaeebdb373211f4428930941792e1d04d52fae0'],
                'group2':['0008ac556f2cf0ea3b53f53d84223f1c62ee3ffe', '0696ed3a3697913c39dabaf64f6248b44ee39671', '14290c1dbf8e47fa625e7de9f39adfa807bd0fdc', '1e05325b6e22ffc644ed64c9b51de93d997de816', '419dd029c20c7679ea2a2367061fa5ec8e32f5e5', '4b65f1d54cbb98872dee37867caf26f42739ac91', '5de9f23a47271c32da462a254464948207e178c1', '700430382830be41d2c0ae251f2ff5636dffb9cb', '86348184d556b7a03c500e11ded4dec20fe5c127', 'a21f58c1b8a08dcbc5dad4899fb418da6a501987', 'a7d3b7e16e7890feae6c6fbcf23ed20b29d5e63f', 'a980c53ce9a9246af8d7af20983b18a7a83e15fa', 'b24ff43c5b86b654a2a869117260dc5c655c0467', 'cbea6b1669efcdfc70438875e5b0582df6b55792', 'd592e877c3df0b2a11bea83063f56771b9d87094', 'd733b2c9a25584058f13e4ef9698400d54d94c59', 'fc5eded5593a6c64bcabaef7fab51d7aec13463c']
                 }

In [18]:
polish_groups = {'group0':['00206024f16fcc8aeea38a32e06cd452a0fd07fa', '0a43b416c82e30a48d338fb741fe8bdf15593797', '0ddae7cfd74b93edb916622b4036cea5451a68a7', '0de0889da04b4d62ddd62a16bb25c2e57443fcce', '0e80702c19a9100a19c314ffa6daba357848270c', '0f0001f36b4a9fe2c7f136831513ec0710fb3249', '106c27145dc3fc2cb9e72928c37de3c1c163dfb7', '15412eddcd332690f41150f3775b58224579a352', '17aaffd78b585afc69982d8383a599d08ca51f48', '1b8113b4b17348afd646f750a23754b21327c666', '1ba3fb68fcaee3130d019afafa2c8df2c98f14da', '1bc2e42c1d3161b256d149c710a390e5e497d7c9', '24b442971128ea409d8a659e51cae753c7dad033', '286f55938bbc2d1c23db6cc6512980a21d99edd4', '2ba87634f5993428538f48c29430ab01c3e327b1', '2bc18bee3836d4e13eb2d9ad6cfe3f5219be58a0', '30fd2a3768beb1ce6be3d916bbadafd21ce525d6', '32c5c4c1e9291044f68f964bbd396c412002b40a', '32f4159e8a17e0c5fb5edbc153a0fcf651907aee', '365173daa77f07ca92146786ebea9d4ea3f0fd8a', '380979028392365e4f69c04f8629754df036fb31', '415c59cb0089fbabf639700211ebf0861ca48324', '47d52b79797709b3cffccca88e8a554feb74355e', '48ec6b6f0ae4c86220f3f8628e935d98edf013d3', '4c24770fb92c25a16e2d7433ac34f257baf7e205', '4d2a2731caacd0fc52c7d68b882c444b87d58423', '50aba276af25f9e9ab7c2300be41f6aca2151519', '529b466fe98442a3c87bf810dbedfa3f1898c12a', '5a04077566e1d51728d4e50f22787772912d2dc1', '5aa8a38e1c2510c141a3fdd237b2e00c99d07548', '5f1d3c22c3b71e9a33abce849e56f1f60302a504', '62d38293c4b2da3e797630f281bafce753d22ea4', '66178b1df479d62d2c5799cccaa8b61a290c1073', '667e0f39099805cd3a64c35e76c077ace790b6c1', '6781a5dc741b83e9ea48380c1eddf74b40d087f9', '684a91389e9bc941464a26988c803b649e0eeb90', '688051844515cf025498e9566a19b78cb6b87da9', '6996fb242dfd61b39ca32fcc49b8afed7d51b98d', '6b0fd69733696f8a934a48006eac57b200459884', '6b3308020cf6088704e1704ed414d1b4cc49117c', '6e33437c529457b70ffa25a4f016634d516c96ef', '705940bdc2de6244f7824e61cf81c820dd2b79ae', '764c0c1f591c5e624362333aca780df47618b548', '7b28d05c6f276b1541c06e463135aefe037ef009', 
'84b4e8fa7cf4ae598a376b927949b03f2e36a7f5', '853c63dc512351cacab30bbc2dae96eef55fdaf2', '88ed72118c0cb3a0f1b249baf34b659f0c271b5e', '8b8756db6e7b7da2bfc57a2e665499cad6622690', '8d0cfa08f41fdc423033a1c11072ff2300b3f58e', '99c20aef32cedcb4b703f32bc7bd38d255914ecb', '9af184159ced82029544182a84f71759cadfcff3', '9b187fccf9cd1d80ad5d39c2aff2cce3dab2c487', 'ac6f851d407860967806c7727139375491acc7d5', 'ae53985a0c684fed31801006cc5a7b9d113881f8', 'afc73b7a3d6974386d08d2d73e17886ccbb664f4', 'b24177d1c9959ccbbadc6b31b7621cb745f4b351', 'b550f938f60d3510deefc03ba947c1903d872314', 'b9f918435ebe044aa79fff5ffd2fedac7ae49498', 'c3d30192c1c378ee1bb3b62f7afb42afbf8cd442', 'c907d294d85d334cdb55f619d40950ee0b66b545', 'ca1079065f45218bc5a2114c2568f74371888679', 'cbd1df2e1aab044f98ce1bcbde1de7e62f8f2dc0', 'ccd845e7534f486b59a0d42bfc074f28439d5675', 'ce6f933224a52b8e79db0e3b9700d4ce1fab0e97', 'cedfe15b2217547856b4086010a691f17ace6a7d', 'd13a0f1a0c69bb1ec965f8e8e8b20d07f0f53c9f', 'd1e7ec54d791192735e493cff39b696073468bd3', 'd1fb51366c689de2dec2a3a002dca41b3611165d', 'db0f70c19e33f7ead00207aadce819ce8897818a', 'dba12a596d970661699de657c430bca74fc36e86', 'dc9be800ce24e753df8da8a2380c86c41ec218df', 'dda6132baee939456a7fc9f02c953bd74f5f095b', 'e0e0a608bacd3fbac0637a51e4d65c5a4a71483f', 'e27634e29c9ec8cdc24f1dbb087f52784eaffbf9', 'ec0fb304f2b80aeb0875fe7f874998609cdf710a', 'f137fd0df9203c7b5d70bf36dc0158bfeeaecf5c', 'f2aa5bcffd2a564763d6ee009a8a97a65b273039', 'f444be329124583acdcfe08802001748593eb971', 'f5ddb330be6140bd37097304794e7249aa85e1ad', 'fdfa59881e35c5e7a61a7cddc069cd21a09b0113', 'ffe96e4439e3036d7216af1b5b8a11146b2b60f3'],
                 'group1':['0026e191a02571bacb601574b47b0899d0720cb4', '0386d94c88a018dc0413b7d08e2f144a777bf42e', '050a967a7647bc5ea1a7c185e9588a0b999259a2', '0eeef187c5bb0fb2f3b09ef4e204fd25af62708a', '22c050327add338556086c3761ef49c71f1dd381', '31013d9f22ba55bb73fd86cd23c1c9b2cc01e9cd', '5da6f0a143469621001c50e62d6253ae5375f968', '5e19f64f02a64b60f43891deaff8e62932e4f5b7', '66267608199f76287a5d9c9e354205fc6cff0167', '6d89d66952e85ebfaf6ce7d011519582712204e9', '7dac0972ad178847c825c0d777499635be16e68c', '8bccab20a8905b5804015f22e7e7bb45c2984171', '941e8c5bcd5eb13ea7e5a171eadc70b5904b3b0a', '951b1e7b552e5cd805d05d7b2af81e086079dc6b', '9ee6b0e7e63449543589fca32d9bb9c9221002bb', 'a0bcb765f143da18c0214f311f23a5c63dfa0b5a', 'ab3103b98b5445bbd0d523b2d2b8b5bdc26a9d38', 'acb99cd7f3fcf87f749902981eb17c0e5a0afdbf', 'afc73b7a3d6974386d08d2d73e17886ccbb664f4', 'b4833ff2f228356e8d0bc0ec750d041408cfa172', 'c0edd7425493ec8f0d76b58944f557b34763882d', 'ce4184bb0d3851f3d377083ef5f110ef0f4c29a0', 'dc01497f22a427e43630b1ab1ec063df4be7f528', 'dfce56be7180eff74b6e80c6fefa079c33165805', 'e75696cbba2c7944ebfba159e6c34c3ec6f0e5c8', 'eb2678e88411c255e65c01a831e9b1eee51590cf', 'f57fd84d7d399b3c516de0ce6027518a57f2cc85'],
                'group2':['003425eddc040f2c2c0fe6d306b1e9497ffac24c', '04fa91eec11ce0ce9c35f54bfe5869c9ae6e2564', '0a071229b4557df4b88a5630bc2b77fd32ff7a2a', '0c398c7935891837c69cb0ca41075276ba5835a1', '0df232c73c6ed7141a3482d2c525cabe480a6a18', '12b2f26f3cd937c4f65918b0813b6826d31a45b5', '16ebeaa5a6d329c7d6a1fad20066116fe7f7c8b9', '1c29036973b68a3f1f34946fea90436992c4292e', '204b67b19928843a0453a75bf33e052676603719', '293a13f9b44a7177f444bdc371f2c04122f31c92', '2b0782b449401e0f5a21c5990dfdb1c7a24dff3d', '2c37122bc549a3dbc9f278c0126c24bee136f5d3', '2c4ac3b53de2b99d4ef7cc7159f1bac6e250eddc', '2c937dd24fb1224bb6bd440e3f572d687d3981cc', '3022b664630c482e304adc363c1665a34fe8f90f', '30f6dd063fccdb8d37c740658bc1369563897d97', '318acfb3f292bebabd9e48ca6895157517a30238', '31c17164d65fe2304cb148c7d041915ce193c7f3', '341876b06a3eb8dbc4c482e9f915b37baceb2353', '378ca7fccf326639abbb3d725c7a45f7483dbe88', '3ffb58c94dab1735496ad4cbe0b0ea74506065f5', '7a04b393826b2a4e7f1cfecc197e41b57a9203d8', '7f94a24b2d3284b1311f8a697da99a4c09034b1d', '81576f53948e066bec90011011eaaafaafa0f8a5', '84ec6dc7c26e29024c1a9c53b825502ab7f878d5', '925776f0ea563ca728cb58e0c7660759328eedfb', '962cbd2199737b30e1d1640a50fe132a87e3c7e8', '9f545d2156afc301162bf7c1e3157bd88e93dea2', '9fede38d5bbe7e8a9e3478e76bade6b65eb5cbda', 'a05c5c60e588521b17dec2d8945c66d8df4694e9', 'a7181e6cd490dac790ff5a0966d0c39f6385ed5b', 'af0ec2216f41069a0b114e59f2523ad0008416f9', 'afbf1759c0e81e2ef14f0989cbb7e4443f873d21', 'b461845a64b299e20930346e383384f739b4798f', 'ba051643771d55d870416e40891963dabd8ea142', 'bc9321c58de33216cf6df9bd7403dca7946563d6', 'bcb11acddf0b811fd4b26097e95b7216dc48edce', 'c5f2e2d80f7ad1d1ff9be11543b9577f1cbf2809', 'cb8091a84c44526e75226109b1c3bb53d27b081e', 'cd4481582a86d905e31522ed39396cdcd9f3b0f8', 'db394640538e8f107a497e964197a68d102037b3', 'e9f69307d147d8b02433b3c14ebe08f5aaedae63', 'ecd4a43902be6a8d3d50de72bdd00ea7dcc10023', 'f31c30521de527ee34b5808116ca16f7e502ac75'],
                }

In [19]:
russian_groups = {'group0': ['003dc1eaf59995415b645c1f3373632a7fb3696a', '0108aaf1a211987343da59242267fc511a34425a', '09d812350249531a070dba4e99507016dc9af1a8', '131d41c9d564ea32e3662aaf68e46d3adc23cf07', '151ba3f977667d4548538b67884f998ae4c4d769', '168751713e459aa48abdd4f3539d3bb2debbc106', '1ed58ddba838b79bf584feff69694fded3f7373a', '207d189354da012810f2fc7f992d55fec4d498ae', '3cbb0043ddd38b9dbfad95f3dbe14f9fab1a9a84', '4ffe74bef44fff70fa5769c1430baf32a12f8d64', '5347e43432dabfbde6c53a4af2fc1044a4f3877a', '587abedf6e9a82169e54fb387a7cf98fce4581d7', '6d262ef51b2ef9493fbf33ea69079aca7a2e5904', '71f36cfd987087edb948be298e40c468683319ce', '82ad0549a58116add245ec9fc258aa965bff361f', '918124ddfcd6d36a0430ad3db29f52ad9dfe36e7', 'b0f35af8984b09d3e74b8b2aab530032c0acd87e', 'b7cef9e4623fb1f2afb630f022d679163fc07328', 'c4665a0ead9d2455dc12ecee1344ed378b8c8d96', 'f96d08b5d3aaddc894f7f80a5a1cee78d8eb4d5f'],
                  'group1': ['004a3a3d91d0890a3b25db93a6d10679701a552c', '01b19e65926af66fe3bb543e24e0d5f8445f58cc', '16de61317714763d6d64811aebe253c67dce4c99', '1c307b947fd14eca2c0cdc787e4ee18acf942040', '1d5d79d3282727df712667670c52039f7eeff64e', '41fd10b529e091f3a566375f2f10f81b604fdd02', '4ac002b922a6b450a1172df3ca9d48d882b4c0ad', '530debecdb5740d93d3737d75ec062cba070960e', '61de73f577ef03b973adba64d531e2e31b5be9fc', '656faf6dc53e1af2c234549a59f3308eaf9074a2', '8197f239f154bac1210c7e82b952de0bfaca9133', '8d192e3596f83251ed4479e83ad27e1f5e230abc', '903746f2eb9079e8e10b1110f59bc2387a6d3b85', 'a8c83fa33601eb64bf37d166763192aa8d83f404', 'cba18ce0d06f6c224cd8224511894942b61605e2', 'cde0b647ca4c6803913ca30af9fe6a2ce612f465'],
                  'group2': ['005c0e40de5d9376831bcd6d8e3b066459606d6b', '01a0db38b06a0c126f3cebede9c500cc696242a6', '0e2c6d31e37f3c4791afb14be826ac5643e68d6f', '268341e50c8adfce1e9b570fdc9140e0ff93e787', '302bffc1734fd2f81a51aeaefdb3a628751a625c', '337cb892d757ea21830b7318d1a03328ea806dc5', '36828ea60ace73ed3158fb46ca0322e96b0079e8', '647545deba5a145db120ccb1bf5cd644aab8a3b1', '6b5afbf9cf32fd3ceec30b64a9cde640378bed9d', '7938ee510f90c871c1556e5605707d75d1f139ca', '90b75e77a9a337d0e64cc7bf442185f23fb0d81f', '9a84a773eca62fad653d2ccd91e0376fd8f310c5', 'a9d5e0abc65edf96167016aa6ffa6419a651b2c3', 'b8085e8c33216f0de653e2d64c1acf908481112f', 'bdf9d239be05c002cefb6d8fe48096b838e99a70', 'be02e69092e92b82c4febe751ab5be9e9caafc46', 'ca81ee1545c9cbf01a8066b5e339df4bb8ea8c31', 'ce49a303553eda3e07066f27fcd5776346f10170', 'e465db8ce0a6b0423188f56f952d5850c14ad587', 'ecfc3f5a770dd8c6726dbba4d7e04a7b02371fb6', 'f7cd4a8f373eeda80d9a59534333070c4f01efab'],
                 }

In [20]:
# Number of users in each kept Russian group.
for group, members in russian_groups.items():
    print(len(members))

20
16
21


In [21]:
# Number of users in each kept Polish group.
for group, members in polish_groups.items():
    print(len(members))

81
27
44


In [22]:
# Number of users in each kept English group.
for group, members in english_groups.items():
    print(len(members))

16
38
17


In [23]:
# Number of users in each kept German group.
for group, members in german_groups.items():
    print(len(members))

26
80
34


Groups with many users are more interesting as sampling from them can create lots of different possibilities.

In [24]:
# Output folder for the per-group CSV files. It is created up front because
# `DataFrame.to_csv` does not create missing directories and would fail on a
# fresh checkout.
GROUP_FOLDER = 'data/groups/'
os.makedirs(GROUP_FOLDER, exist_ok=True)

In [25]:

# Write one CSV per German group, numbered by the group's position in the dict.
for i, group in enumerate(german_groups):
    # Keep every (user, artist) row of the group's members, with a fresh 0..n-1 index.
    group_i = top_merged[top_merged['ID'].isin(german_groups[group])].reset_index(drop=True)
    group_i.to_csv(f'{GROUP_FOLDER}german_group_{i}.csv')

In [26]:
# Write one CSV per English group, numbered by the group's position in the dict.
for i, group in enumerate(english_groups):
    # Keep every (user, artist) row of the group's members, with a fresh 0..n-1 index.
    group_i = top_merged[top_merged['ID'].isin(english_groups[group])].reset_index(drop=True)
    group_i.to_csv(f'{GROUP_FOLDER}english_group_{i}.csv')

In [27]:
# Write one CSV per Russian group, numbered by the group's position in the dict.
for i, group in enumerate(russian_groups):
    # Keep every (user, artist) row of the group's members, with a fresh 0..n-1 index.
    group_i = top_merged[top_merged['ID'].isin(russian_groups[group])].reset_index(drop=True)
    group_i.to_csv(f'{GROUP_FOLDER}russian_group_{i}.csv')

In [28]:
# Write one CSV per Polish group, numbered by the group's position in the dict.
for i, group in enumerate(polish_groups):
    # Keep every (user, artist) row of the group's members, with a fresh 0..n-1 index.
    group_i = top_merged[top_merged['ID'].isin(polish_groups[group])].reset_index(drop=True)
    group_i.to_csv(f'{GROUP_FOLDER}polish_group_{i}.csv')