In [1]:
import pandas as pd
import json
import os
from os import listdir
from os.path import isfile, join
import re
import multiprocessing as mp

In [2]:
# Directory that is scanned for the raw tweet files.
datapath = os.path.join('C:/', 'data', 'poltweets', 'tweets')

# Keep only regular files whose name ends in ".ndjson". A suffix test is used
# instead of an unanchored regex so names such as "x.ndjson.bak" are excluded.
# os.listdir returns entries in arbitrary order, so the list is sorted to make
# positional slices of `datafiles` reproducible between runs.
datafiles = sorted(
    join(datapath, f)
    for f in listdir(datapath)
    if f.endswith('.ndjson') and isfile(join(datapath, f))
)

# NOTE(review): presumably the target path for a flattened export; it is not
# used by any cell visible in this notebook — confirm.
filepath = os.path.join('C:/', 'data', 'poltweets', "tweets_flattened_20200115.gz")


In [10]:
# Number of .ndjson files that were found in datapath.
len(datafiles)

1939

In [36]:
def read_datafiles(datafiles):
    """Parse newline-delimited JSON files into a single DataFrame.

    Parameters
    ----------
    datafiles : iterable of str
        Paths of UTF-8 encoded files holding one JSON document per line.

    Returns
    -------
    pandas.DataFrame
        One row per JSON document, in file order and then line order.
        An empty frame is returned when no records are found.

    Raises
    ------
    OSError
        If a file cannot be opened.
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records_all = []
    for datafile in datafiles:
        # The context manager closes the handle even when parsing fails.
        with open(datafile, encoding='utf-8') as handle:
            # extend() grows the list in place (the previous list + list
            # re-copied every earlier record); blank lines, e.g. a trailing
            # newline at end of file, are skipped instead of failing.
            records_all.extend(
                json.loads(line) for line in handle if line.strip()
            )
    df = pd.DataFrame.from_records(records_all)
    return df

def split_job(datafiles, split=4):
    """Read `datafiles` in parallel and return one combined DataFrame.

    The file list is cut into `split` contiguous folds (the last fold also
    takes the remainder), each fold is handed to `read_datafiles` in a worker
    process, and the per-fold frames are concatenated in fold order.

    Parameters
    ----------
    datafiles : list of str
        Paths of the files to read.
    split : int, default 4
        Number of folds; also the upper bound on worker processes.

    Returns
    -------
    pandas.DataFrame
        All records from all files with a fresh 0..n-1 index. An empty frame
        is returned when `datafiles` is empty.

    Raises
    ------
    ValueError
        If `split` is smaller than 1.

    Notes
    -----
    NOTE(review): under the "spawn" start method (the default on Windows)
    worker processes must be able to import `read_datafiles`; a function
    defined interactively in a notebook may not be reachable from the
    workers — confirm, or move `read_datafiles` into an importable module.
    """
    if split < 1:
        raise ValueError("split must be at least 1")
    if not datafiles:
        return pd.DataFrame()

    length = len(datafiles) // split  # size of every fold except the last
    folds = [datafiles[i * length:(i + 1) * length] for i in range(split - 1)]
    folds.append(datafiles[(split - 1) * length:])
    # With fewer files than folds the leading folds are empty; skip them so
    # no worker is started for nothing.
    folds = [fold for fold in folds if fold]

    with mp.Pool(len(folds)) as pool:
        results = [pool.apply_async(read_datafiles, args=(fold,)) for fold in folds]
        # Collect inside the `with` block: leaving it terminates the pool.
        dfs = [result.get() for result in results]

    # The previous loop called combined_df.append(...) and dropped the return
    # value, so an empty frame was always returned. Concatenate once instead.
    return pd.concat(dfs, ignore_index=True)

In [50]:
# Smoke-test the loader on two small, non-adjacent slices of the file list.
fold1 = datafiles[0:15]
fold2 = datafiles[1000:1015]

folds = [fold1, fold2]

dfs = [read_datafiles(fold) for fold in folds]
# Concatenate in one call: DataFrame.append was deprecated in pandas 1.4 and
# removed in 2.0, and appending in a loop re-copies the accumulated frame on
# every iteration.
combined_df = pd.concat(dfs, ignore_index=True)

combined_df.head()

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,...,quoted_status_id_str,quoted_status_permalink,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status
0,Thu Oct 22 13:01:05 +0000 2020,1319262500743975000,1319262500743974914,RT @NHOERUP: Tak for at holde ved @TorstenGejl...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,1.3192175537728591e+18,"{'url': 'https://t.co/5JGfa4Iicd', 'expanded':...",4,0,False,False,da,,,
1,Thu Oct 22 13:00:50 +0000 2020,1319262436105588700,1319262436105588737,RT @plejefamilierne: Det kan ikke lade sig gør...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,4,0,False,False,da,,,
2,Thu Oct 22 13:00:32 +0000 2020,1319262363145687000,1319262363145687041,Det skriger til himlen.\nSocialministeren bad ...,False,"[0, 274]","{'hashtags': [{'text': 'dkpol', 'indices': [25...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,7,53,False,False,da,,,
3,Thu Oct 22 12:32:30 +0000 2020,1319255308875739100,1319255308875739138,RT @alternativet_: Statsministeren sagde i nyt...,False,"[0, 139]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,4,0,False,False,da,,,
4,Thu Oct 22 11:40:38 +0000 2020,1319242255115391000,1319242255115390982,RT @alternativet_: Vi kan ikke være bekendt at...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,5,0,False,False,da,,,


In [55]:
# Inspect the rows labelled 27700 through 27705 (label slices with .loc
# include both endpoints).
combined_df.loc[27700:27705,:]

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,...,quoted_status_id_str,quoted_status_permalink,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status
27700,Thu Nov 26 09:31:03 +0000 2020,1331893219441991700,1331893219441991682,Forstår ganske enkelt ikke hvorfor @radikale s...,False,"[0, 197]","{'hashtags': [{'text': 'dkpol', 'indices': [19...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,10,68,False,False,da,,,
27701,Tue Dec 01 22:02:52 +0000 2020,1333894359188721700,1333894359188721665,Det er forargeligt med snyd og fusk. Men: Hvor...,False,"[0, 260]","{'hashtags': [{'text': 'dkpol', 'indices': [22...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,2,10,False,False,da,,False,
27702,Tue Dec 01 19:12:12 +0000 2020,1333851411197661200,1333851411197661189,RT @SorenEggeRasmus: Solbakken er nu stemplet ...,False,"[0, 139]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",,,...,,,5,0,False,False,da,,,
27703,Tue Dec 01 19:03:14 +0000 2020,1333849153311543300,1333849153311543299,@Smitty04491966 @R4nd4hl @Enhedslisten @Gitte_...,False,"[65, 216]","{'hashtags': [{'text': 'FL20', 'indices': [166...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.333791e+18,1.3337909243216814e+18,...,,,0,4,False,False,da,,,
27704,Tue Dec 01 19:01:58 +0000 2020,1333848833395220500,1333848833395220484,@R4nd4hl @Enhedslisten @Gitte_Maria @TomasVogt...,False,"[49, 306]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.333776e+18,1.3337760964504576e+18,...,,,1,20,False,False,da,,,
27705,Tue Dec 01 12:59:35 +0000 2020,1333757637515686000,1333757637515685889,@pontoppa Fra finansloven/'krigskassen',False,"[10, 39]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""https://mobile.twitter.com"" rel=""nofo...",1.333744e+18,1.333744439802597e+18,...,,,0,2,False,False,no,,,


In [31]:
# Build a DataFrame from an already-parsed list of records.
# NOTE(review): the only visible definition of `records_all` is a local
# variable inside read_datafiles; this cell presumably relied on a kernel
# variable from a cell that is no longer present — confirm before re-running.
df = pd.DataFrame.from_records(records_all)

In [33]:
# Inspect the rows whose index labels fall in 1000..1010 (inclusive slice).
df.loc[1000:1010, :]

Unnamed: 0,created_at,id,id_str,full_text,truncated,display_text_range,entities,source,in_reply_to_status_id,in_reply_to_status_id_str,...,quoted_status_id_str,quoted_status_permalink,retweet_count,favorite_count,favorited,retweeted,lang,extended_entities,possibly_sensitive,quoted_status
1000,Sun Feb 25 18:42:27 +0000 2018,967832177533694000,967832177533693952,RT @alternativet_: I får lige denne her igen: ...,False,"[0, 135]","{'hashtags': [{'text': 'dkpol', 'indices': [11...","<a href=""http://twitter.com/#!/download/ipad"" ...",,,...,,,10,0,False,False,da,,False,
1001,Sun Feb 25 18:42:10 +0000 2018,967832106276618200,967832106276618240,RT @LindaVilladsen: Det her er min mand. Det h...,False,"[0, 140]","{'hashtags': [{'text': 'dkpol', 'indices': [97...","<a href=""http://twitter.com/#!/download/ipad"" ...",,,...,9.676779034744792e+17,"{'url': 'https://t.co/XMki4anzxp', 'expanded':...",36,0,False,False,da,,,
1002,Thu Feb 22 19:46:51 +0000 2018,966761221281153000,966761221281153026,RT @uffeelbaek: Hvis vi skal undgå klimakatast...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,20,0,False,False,da,,,
1003,Mon Feb 19 13:37:26 +0000 2018,965581089451511800,965581089451511809,RT @alternativet_: Vores medlemmer har stemt: ...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,18,0,False,False,da,,,
1004,Thu Feb 15 16:09:20 +0000 2018,964169768558387200,964169768558387200,RT @alternativet_: Spændende tanker fra Tyskla...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,13,0,False,False,da,,,
1005,Thu Feb 15 16:09:06 +0000 2018,964169709544443900,964169709544443905,"RT @uffeelbaek: Hvis den frihed, som kapitalis...",False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,43,0,False,False,da,,,
1006,Sun Feb 11 21:15:52 +0000 2018,962797357741084700,962797357741084677,RT @aggerhomien: Jeg går i en karakterfri klas...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,34,0,False,False,da,,,
1007,Fri Feb 09 10:45:24 +0000 2018,961913917613240300,961913917613240320,RT @JPYazdani: VI GJORDE DET! \n\nFørste Borge...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,78,0,False,False,da,,,
1008,Thu Feb 08 14:56:58 +0000 2018,961614839138410500,961614839138410496,RT @DMCamilla: Hurra! Så kom der på rekordtid ...,False,"[0, 140]","{'hashtags': [{'text': 'uddannelsesloft', 'ind...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,12,0,False,False,da,,,
1009,Thu Feb 08 14:56:16 +0000 2018,961614664252739600,961614664252739584,RT @uffeelbaek: Det er SÅ demokratisk håbefuld...,False,"[0, 140]","{'hashtags': [], 'symbols': [], 'user_mentions...","<a href=""http://twitter.com/download/iphone"" r...",,,...,,,28,0,False,False,da,,,


In [24]:
# Materialise `records` into a list so that items can be indexed.
# NOTE(review): `records` has no top-level definition in the visible cells
# (it only appears as a local inside read_datafiles) — hidden kernel state.
record_list = list(records)

In [26]:
# Show the second parsed record to inspect the raw nested structure.
record_list[1]

{'created_at': 'Thu Oct 22 13:00:50 +0000 2020',
 'id': 1319262436105588700,
 'id_str': '1319262436105588737',
 'full_text': 'RT @plejefamilierne: Det kan ikke lade sig gøre at tænke økonomi ved bortadoptioner. Domstole skal sikre barnet - men er lovgivningen god n…',
 'truncated': False,
 'display_text_range': [0, 140],
 'entities': {'hashtags': [],
  'symbols': [],
  'user_mentions': [{'screen_name': 'plejefamilierne',
    'name': 'Fagforeningen Plejefamiliernes Landsforening',
    'id': 723080847725891600,
    'id_str': '723080847725891584',
    'indices': [3, 19]}],
  'urls': []},
 'source': '<a href="http://twitter.com/download/iphone" rel="nofollow">Twitter for iPhone</a>',
 'in_reply_to_status_id': None,
 'in_reply_to_status_id_str': None,
 'in_reply_to_user_id': None,
 'in_reply_to_user_id_str': None,
 'in_reply_to_screen_name': None,
 'user': {'id': 2806864609,
  'id_str': '2806864609',
  'name': 'Torsten Gejl',
  'screen_name': 'TorstenGejl',
  'location': 'Aarhus/Djursland',

In [15]:
# Cut the file list into `split` contiguous folds.
split = 4
length = len(datafiles) // split  # size of every fold except the last
folds = []
for i in range(split - 1):
    folds.append(datafiles[i * length:(i + 1) * length])
# The final fold runs to the end of the list, so it also takes the remainder
# when the number of files is not a multiple of `split`.
folds.append(datafiles[(split - 1) * length:])

In [11]:
# Load the files at positions 1800..1899 of `datafiles` into one DataFrame.
tweets_df = read_datafiles(datafiles[1800:1900])

In [12]:
# (rows, columns) of the loaded frame.
tweets_df.shape

(861, 31)

In [14]:
# Look at the nested `entities` values for the rows labelled 800..850.
tweets_df.loc[800:850, "entities"]

800    {'hashtags': [], 'symbols': [], 'user_mentions...
801    {'hashtags': [], 'symbols': [], 'user_mentions...
802    {'hashtags': [], 'symbols': [], 'user_mentions...
803    {'hashtags': [], 'symbols': [], 'user_mentions...
804    {'hashtags': [], 'symbols': [], 'user_mentions...
805    {'hashtags': [], 'symbols': [], 'user_mentions...
806    {'hashtags': [], 'symbols': [], 'user_mentions...
807    {'hashtags': [], 'symbols': [], 'user_mentions...
808    {'hashtags': [{'text': 'dkpol', 'indices': [26...
809    {'hashtags': [], 'symbols': [], 'user_mentions...
810    {'hashtags': [{'text': 'dkpol', 'indices': [11...
811    {'hashtags': [{'text': 'dkpol', 'indices': [25...
812    {'hashtags': [{'text': 'dkpol', 'indices': [23...
813    {'hashtags': [{'text': 'dkpol', 'indices': [15...
814    {'hashtags': [], 'symbols': [], 'user_mentions...
815    {'hashtags': [{'text': 'dkpol', 'indices': [25...
816    {'hashtags': [], 'symbols': [], 'user_mentions...
817    {'hashtags': [], 'symbol

In [205]:
# NOTE(review): this cell raises AttributeError ('float' object has no
# attribute 'values'). json_normalize flattens dict-like records, whereas the
# selected column holds scalar values here. Presumably a dict-valued column
# such as 'entities' or 'user' was intended — confirm, then fix or delete
# this cell so the notebook survives Restart & Run All.
pd.json_normalize(tweets_df.loc[0:5, 'in_reply_to_status_id'])

AttributeError: 'float' object has no attribute 'values'

In [14]:
# NOTE(review): `chunk_data` is not defined in any visible cell; this call
# fails on a fresh kernel unless the function is defined or imported elsewhere.
chunked_data = chunk_data(datapath)

In [53]:
# Columns kept for the exported sample, one name per line for easier diffs.
# NOTE(review): in the raw record displayed earlier in this notebook, 'id' is
# rounded ('id': 1319262436105588700 vs 'id_str': '1319262436105588737');
# consider selecting 'id_str' if exact tweet ids matter.
# NOTE(review): 'name', 'hashtags' and 'urls' only appear as nested keys
# (under 'user' / 'entities') in that raw record — confirm that tweets_df has
# been flattened before these are selected.
select_columns = [
    'created_at',
    'id',
    'full_text',
    'truncated',
    'display_text_range',
    'source',
    'in_reply_to_status_id',
    'in_reply_to_user_id',
    'in_reply_to_screen_name',
    'user',
    'contributors',
    'is_quote_status',
    'quoted_status_id',
    'quoted_status_permalink',
    'retweet_count',
    'favorite_count',
    'retweeted',
    'quoted_status',
    'name',
    'hashtags',
    'urls',
]

In [54]:
# Restrict the frame to the columns listed in select_columns.
# NOTE(review): presumably every name in select_columns must exist as a
# column of tweets_df for this selection to succeed — verify.
tweets_sel_df = tweets_df.loc[:, select_columns]

In [55]:
# Keep a 5% random sample drawn without replacement; the fixed random_state
# makes the draw repeatable.
# NOTE(review): the result overwrites tweets_sel_df, so running this cell a
# second time samples 5% of the previous sample rather than of the full frame.
tweets_sel_df = tweets_sel_df.sample(frac=0.05, replace=False, random_state=42)

In [58]:
# Write the sampled selection to CSV.
# NOTE(review): this is a relative path, unlike the absolute 'C:/' paths used
# for the input data — confirm the intended output directory.
tweets_sel_df.to_csv('../../data/tweets/tweets_combined_sample_20201201.csv')