# Preprocessing of Twitter Data
Author: Florian Lugstein (flugstein@cs.sbg.ac.at)  
Date: 2020-07-29

In [1]:
import json
import time
import copy
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/flugstein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Choose Dataset

In [2]:
options = ['vegan', 'neos', 'fpoe', 'schalke', 'bvb']
data_dates = {'vegan': '20200407', 'neos': '20200311', 'fpoe': '20200311', 'schalke': '20200409', 'bvb': '20200409'}

data_name = input('Choose dataset to preprocess: ')

while data_name not in options:
    print('Choose one of ({})'.format(', '.join(options)))
    data_name = input('Choose the dataset to preprocess: ')
    
data_date = data_dates[data_name]

Choose dataset to preprocess: neos


## Read Dataset JSON File

In [3]:
tweets_path = 'tweets/hpt_{}_tweets_utf8_sorted.json'.format(data_name)

In [4]:
used_columns = [
    'id_str',
    'truncated',
    'text',
    'entities.hashtags',
    'entities.urls',
    'entities.user_mentions',
    'entities.polls',
    'entities.media',
    'extended_tweet.full_text',
    'extended_tweet.entities.hashtags',
    'extended_tweet.entities.urls',
    'extended_tweet.entities.user_mentions',
    'extended_tweet.entities.polls',
    'extended_tweet.entities.media',
    'retweeted_status.id_str',
    'retweeted_status.text',
    'retweeted_status.entities.hashtags',
    'retweeted_status.entities.urls',
    'retweeted_status.entities.user_mentions',
    'retweeted_status.entities.media',
    'retweeted_status.extended_tweet.full_text',
    'retweeted_status.extended_tweet.entities.hashtags',
    'retweeted_status.extended_tweet.entities.urls',
    'retweeted_status.extended_tweet.entities.user_mentions',
    'retweeted_status.extended_tweet.entities.media',
    'quoted_status.id_str',
    'quoted_status.text',
    'quoted_status.entities.hashtags',
    'quoted_status.entities.urls',
    'quoted_status.entities.user_mentions',
    'quoted_status.entities.media',
    'quoted_status.extended_tweet.full_text',
    'quoted_status.extended_tweet.entities.hashtags',
    'quoted_status.extended_tweet.entities.urls',
    'quoted_status.extended_tweet.entities.user_mentions',
    'quoted_status.extended_tweet.entities.media',
    'in_reply_to_status_id_str',
    'quote_count',
    'retweet_count',
    'created_at',
    'user.id_str',
    'user.followers_count',
    'user.friends_count',
    'user.verified',
    'user.default_profile',
    'user.default_profile_image',
    'user.url',
    'user.listed_count',
    'user.statuses_count',
    'user.favourites_count',
    'user.created_at'
]

In [5]:
# For small data

starttime = time.time()

with open(tweets_path, 'r', encoding='utf8') as f:
    df = json_normalize(list(map(json.loads, f.readlines())))
    # df = df[df.columns.intersection(used_columns)] # only store used columns
    df = df.fillna(0) # replace NaNs with zeros
    
print(time.time() - starttime)

71.91014003753662


In [None]:
# For big data

startime = time.time()
chunksize = 100000

with open(tweets_path, 'r', encoding='utf8') as f:
    lines = []
    frames = []
    for i, line in enumerate(f):         
        lines.append(json.loads(line))
        if i != 0 and i % chunksize == 0:
            frame = json_normalize(lines)
            frame = frame[frame.columns.intersection(used_columns)] # only store used columns
            frame = frame.fillna(0) # replace NaNs with zeros
            frames.append(frame)
            lines = []
    frame = json_normalize(lines)
    frame = frame[frame.columns.intersection(used_columns)] # only store used columns
    frame = frame.fillna(0) # replace NaNs with zeros
    frames.append(frame)

print(len(frames))
print(time.time() - starttime)
df = pd.concat(frames, ignore_index=True, sort=False)
print(time.time() - starttime)

In [6]:
# View dataframe
print('{} tweets with {} features in dataset'.format(df.shape[0], df.shape[1]))
print(df.shape)
display(df)

# Documentation: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/intro-to-tweet-json
# Examples: https://gwu-libraries.github.io/sfm-ui/posts/2016-11-10-twitter-interaction


55843 tweets with 386 features in dataset
(55843, 386)


Unnamed: 0,contributors,coordinates,coordinates.coordinates,coordinates.type,created_at,display_text_range,entities.hashtags,entities.media,entities.polls,entities.symbols,...,user.profile_use_background_image,user.protected,user.screen_name,user.statuses_count,user.time_zone,user.translator_type,user.updated,user.url,user.utc_offset,user.verified
0,0,0.0,0,0,Fri Aug 23 00:03:41 +0000 2019,0,[],0,0,[],...,True,False,bern_sch,1998,0,none,0,0,0,False
1,0,0.0,0,0,Fri Aug 23 00:09:02 +0000 2019,"[25, 140]",[],0,0,[],...,True,False,schu_rob,48,0,none,0,0,0,False
2,0,0.0,0,0,Fri Aug 23 00:32:06 +0000 2019,0,[],0,0,[],...,False,False,harthart777,3251,0,none,0,0,0,False
3,0,0.0,0,0,Fri Aug 23 00:37:44 +0000 2019,"[38, 140]",[],0,0,[],...,True,False,didierradio,7886,0,none,[description],http://www.borsh.eu,0,False
4,0,0.0,0,0,Fri Aug 23 00:41:32 +0000 2019,"[117, 140]",[],0,0,[],...,True,False,slysmartfox,28054,0,none,"[description, name, location]",0,0,False
5,0,0.0,0,0,Fri Aug 23 00:46:02 +0000 2019,"[50, 140]",[],0,0,[],...,True,False,Freizeitrobin,178784,0,none,[name],0,0,False
6,0,0.0,0,0,Fri Aug 23 02:04:12 +0000 2019,"[56, 82]",[],0,0,[],...,True,False,WielandAlge,7138,0,none,[description],http://www.mad.ag,0,False
7,0,0.0,0,0,Fri Aug 23 03:50:14 +0000 2019,0,[],0,0,[],...,True,False,emyhier,5839,0,none,0,0,0,False
8,0,0.0,0,0,Fri Aug 23 03:55:12 +0000 2019,"[33, 86]",[],0,0,[],...,True,False,fluglehrer,109672,0,none,[description],0,0,False
9,0,0.0,0,0,Fri Aug 23 04:28:16 +0000 2019,0,[],0,0,[],...,True,False,emyhier,5860,0,none,0,0,0,False


## Read Transition Table

In [7]:
user_trans_path = 'trans_tables/transitiontable_reduced_{}_{}.txt'.format(data_name, data_date)

user_trans_table = {}
with open(user_trans_path, 'r') as f:
    for line in f:
        split_line = line.split(' ')
        user_trans_table[split_line[0]] = np.int64(split_line[1])


In [8]:
print('We have ' + str(len(user_trans_table)) + ' user ids in the transition table.')

We have 8400 user ids in the transition table.


## Create Output Dataframe

In [9]:
# Create output dataframe
tf = pd.DataFrame()

out_path = 'tweets_preproc_{}_{}.csv'.format(data_name, data_date)

## Tweet Features

### Ids

In [10]:
# Ids starting at 1

tf['id'] = np.int64(df.index + 1)

def id2index(id_):
    return id_ - 1

old_to_new_id = {old_id:np.int64(idx+1) for idx, old_id in df['id_str'].items()}  # dict to convert old to new ids


### Helper Functions

In [11]:
# Helper functions


# Convert old twitter api id to new id for 'retweet_of' or 'reply_of' column
def conv_id(old_id):
    if old_id == 0:
        return 0  # default value if tweet is not a retweet/quote or reply
    
    no_parent = 0  # default value if parent of retweet/quote or reply tweet is not in dataset
    
    return old_to_new_id.get(old_id, no_parent)


# Get the id of the parent tweet for a retweet/quote
# Row of df
def retweet_of(row):
    # retweets of quotes have both 'quoted_status' and 'retweeted_status' fields
    # 'quoted_status' contains the parent tweet, therefore it's checked first
    
    if row['quoted_status.id_str'] != 0:
        return conv_id(row['quoted_status.id_str'])
    
    if row['retweeted_status.id_str'] != 0:
        return conv_id(row['retweeted_status.id_str'])
    
    return 0


# Remove source tweets and their retweets/quotes and replies 3 days before the last day of the dataset, to fix retweet statistics, since most retweets occur 3 days after the original tweet
# Returns true if tweet should be removed
# Row of tf
def retweet_fix(row):
    cutoff_days = pd.offsets.Day(3)
    if row['source'] == True | row['reply'] == True:
        if pd.datetime.date(row['date'] + cutoff_days) > row['last_day']:
            return True
    elif row['retweet_of'] != 0:
        if pd.datetime.date(tf.loc[id2index(row['retweet_of'])]['date'] + cutoff_days) > row['last_day']:
            return True
    return False


# Remove retweets/quotes of source tweets outside the dataset
# Iterates the retweet/quote tree from bottom to top, and marks the tweet for deletion if an ancestor is outside the dataset
# Row of tf
def remove_retweets_of_outside(row):
    while row['retweet'] == True:
        if row['retweet_of'] == 0:
            return True
        row = tf.loc[id2index(row['retweet_of'])]  # check next retweet/quote in chain
    return False


# Remove replies of source tweets outside the dataset
# Iterates the reply tree from bottom to top, and marks the tweet for deletion if an ancestor is outside the dataset
# Row of tf
def remove_replies_of_outside(row):
    while row['reply'] == True:
        if row['reply_of'] == 0:
            return True
        row = tf.loc[id2index(row['reply_of'])]  # check next reply in chain
    return False


# Get retweet/quote count for id using retweet_counts: series of (id, n_retweeted)
def get_retweet_count(id_):
    if id_ not in retweet_counts:
        return 0
    return retweet_counts.loc[id_]


# Returns the correct entity depending on if the tweet is extended or not
# entity_name: 'text', 'hashtags', 'urls', 'user_mentions', 'polls', 'media'
def get_entity(row, entity_name):   
    if row['extended_tweet.full_text'] != 0:
        if entity_name == 'text':
            return row['extended_tweet.full_text']
        return row['extended_tweet.entities.' + entity_name]
    else:
        if entity_name == 'text':
            return row['text']
        return row['entities.' + entity_name]


# Count hashtags/urls/mentions/media for one tweet
# Row of df
def count_entity(row, entity_name):
    entity = get_entity(row, entity_name)
    
    if entity == 0:
        return 0
    
    return len(entity)

    
# Count media of certain type for one tweet
# Row of df
def count_media(row, type_):
    media = get_entity(row, 'media')
    
    if media == 0:
        return 0
    
    count = 0
    for m in media:
        if m['type'] == type_:
            count += 1
    
    return count


# Count text length of one tweet
# Row of df
def count_text(row):  
    return len(get_entity(row, 'text'))


### Time/Date Columns

In [12]:
starttime = time.time()

In [13]:
# Time/date columns of a tweet

tf['timestamp'] = pd.to_datetime(df['created_at'])  # timestamp with second accuracy

tf['time'] = tf['timestamp'].dt.time  # only time, regardless of date

tf['hour'] = tf['timestamp'].dt.hour  # only hour, regardless of date

tf['date'] = tf['timestamp'].dt.date  # only date

tf['weekday_enc'] = tf['timestamp'].dt.dayofweek  # weekday encoded: (Monday == 0, Tuesday == 1, ..., Sunday == 6)

tf['weekday'] = tf['timestamp'].dt.day_name()  # weekday as string (Monday, Tuesday, ..., Sunday)

tf['first_day'] = tf['date'][0]  # first day of any tweet in the dataset

tf['last_day'] = tf['date'][len(tf) - 1]  # last day of any tweet in the dataset (without retweet fix)


In [14]:
print(time.time() - starttime)

8.764108419418335


### Tweet Types

In [15]:
starttime = time.time()

In [16]:
# Tweet types

# Source
# not retweet, not quote, not reply

tf['source'] = (df['retweeted_status.id_str'] == 0) & (df['quoted_status.id_str'] == 0) & (df['in_reply_to_status_id_str'] == 0)


# Retweet
# 'retweet' => tweet is a retweet/quote
# 'retweeted_count' => number of retweets/quotes of this tweet in the dataset
# 'retweeted'==True => there is a retweet/quote of this tweet in the dataset

tf['retweet'] = (df['retweeted_status.id_str'] != 0) | (df['quoted_status.id_str'] != 0)

tf['retweet_of'] = df.apply(retweet_of, axis=1)


# Retweet with comment (quote in the twitter API)

tf['retweet_with_comment'] = (df['quoted_status.id_str'] != 0)


# Reply

tf['reply'] = df['in_reply_to_status_id_str'] != 0

tf['reply_of'] = df['in_reply_to_status_id_str'].apply(conv_id)


# Remove tweets and compute statistics

# Remove retweets/quotes of source tweets outside the dataset
to_remove = tf.apply(remove_retweets_of_outside, axis=1)  # series of (id, to_remove), which tweets need to be removed
rem_count_retweets = len(to_remove[to_remove == True])
print('{} retweets of source tweets outside the dataset removed'.format(rem_count_retweets))
      
# Remove replies of source tweets outside the dataset
to_remove |= tf.apply(remove_replies_of_outside, axis=1)
rem_count_replies = len(to_remove[to_remove == True]) - rem_count_retweets
print('{} replies of source tweets outside the dataset removed'.format(rem_count_replies))

# Remove pure source, reply tweets and their retweets/quotes 3 days before the last day of the dataset, to fix retweet/quote statistics, since most retweets/quotes occur 3 days after the original tweet
to_remove |= tf.apply(retweet_fix, axis=1)
rem_count_3_day_fix = len(to_remove[to_remove == True]) - rem_count_retweets - rem_count_replies
print('{} tweets removed for 3 day fix'.format(rem_count_3_day_fix))

# Remove marked tweets
tf.drop(to_remove[to_remove == True].index, inplace=True)
del to_remove, rem_count_retweets, rem_count_replies, rem_count_3_day_fix

# Compute retweet statistics
retweet_counts = tf[tf['retweet_of'] != 0]['retweet_of'].value_counts()  # series of (id, n_retweeted), how many times did tweet with id get retweeted within the dataset
tf['retweeted_count'] = tf['id'].apply(get_retweet_count)
del retweet_counts


7837 retweets of source tweets outside the dataset removed
10666 replies of source tweets outside the dataset removed
697 tweets removed for 3 day fix


In [17]:
print(time.time() - starttime)

42.30142116546631


### Tweet Content

In [18]:
starttime = time.time()

In [19]:
# Tweet content

# Hashtags

tf['hashtag_count'] = df.apply(count_entity, args=('hashtags',), axis=1)

tf['hashtag'] = tf['hashtag_count'] != 0


# URLs

tf['url_count'] = df.apply(count_entity, args=('urls',), axis=1)

tf['url'] = tf['url_count'] != 0


# Mentions

tf['mention_count'] = df.apply(count_entity, args=('user_mentions',), axis=1)

tf['mention'] = tf['mention_count'] != 0


# Polls

tf['poll'] = (df['entities.polls'] != 0) | (df['extended_tweet.entities.polls'] != 0)


# Media

tf['media_count'] = df.apply(count_entity, args=('media',), axis=1)

tf['media'] = tf['media_count'] != 0

tf['photo_count'] = df.apply(count_media, args=('photo',), axis=1)

tf['video_count'] = df.apply(count_media, args=('video',), axis=1)

tf['animated_gif_count'] = df.apply(count_media, args=('animated_gif',), axis=1)


# Text length

tf['text_length'] = df.apply(count_text, axis=1)

tf['text_length_median'] = tf['text_length'].median()


In [20]:
print(time.time() - starttime)

19.44887375831604


## User Features

### Helper Functions

In [21]:
# Helper functions


# Convert old twitter api to new user id using transition table
def conv_user_id(id_):   
    not_found_value = 0
    return user_trans_table.get(id_, not_found_value)


# How active is the user in terms of produced tweets (source tweets and retweets)
# Row of tf
def user_activity(row):
    if row['user.account_age'] == 0:
        return None
    last_tweet_statuses_count = tf.loc[id2index(row['user.last_tweet'])]['user.statuses_count']
    return last_tweet_statuses_count / row['user.account_age']


# How active is the user in terms of produced tweets and likes
# Row of tf
def user_tweets_likes_activity(row):
    if row['user.account_age'] == 0:
        return None
    return (row['user.statuses_count'] + row['user.favourites_count']) / row['user.account_age']


# In which quantile is the property
def quantile(prop_value, q1, q2, q3):
    if prop_value < q1:
        return 0
    elif prop_value < q2:
        return 1
    elif prop_value < q3:
        return 2
    else:
        return 3


### User ID

In [22]:
# User ID

tf['user.id'] = df['user.id_str'].apply(conv_user_id)
n_not_found_ids = len(tf[tf['user.id'] == 0])
if (n_not_found_ids != 0):
    print('Warning: ' + str(n_not_found_ids) + ' user ids could not be found in the transition table.')




### Followers

In [23]:
# Followers

tf['user.followers_count'] = df['user.followers_count']
q1 = tf['user.followers_count'].quantile(q=0.25)
q2 = tf['user.followers_count'].quantile(q=0.5)
q3 = tf['user.followers_count'].quantile(q=0.75)
tf['user.followers_count_quantile'] = tf['user.followers_count'].apply(quantile, args=(q1, q2, q3))

tf['user.followees_count'] = df['user.friends_count']
q1 = tf['user.followees_count'].quantile(q=0.25)
q2 = tf['user.followees_count'].quantile(q=0.5)
q3 = tf['user.followees_count'].quantile(q=0.75)
tf['user.followees_count_quantile'] = tf['user.followees_count'].apply(quantile, args=(q1, q2, q3))

del q1, q2, q3

### Profile Info

In [24]:
# Profile info

tf['user.verified'] = df['user.verified']
tf['user.default_profile'] = df['user.default_profile']
tf['user.default_profile_image'] = df['user.default_profile_image']
tf['user.url'] = df['user.url'] != 0

tf['user.listed_count'] = df['user.listed_count']
tf['user.listed_count_mean'] = tf['user.listed_count'].mean()


### Activity

In [25]:
starttime = time.time()

In [26]:
lutf = tf[['user.id', 'id']].drop_duplicates(subset='user.id', keep='last').set_index('user.id', drop=True)  # table of last tweet of every user
tf['user.last_tweet'] = tf['user.id'].apply(lambda user_id: lutf.loc[user_id]['id'])
del lutf

In [27]:
# Activity

tf['user.statuses_count'] = df['user.statuses_count']

tf['user.favourites_count'] = df['user.favourites_count']

tf['user.account_age'] = (tf['last_day'] - pd.to_datetime(df['user.created_at']).dt.date).dt.days
tf['user.account_age'] = tf['user.account_age'].astype(int)

tf['user.activity'] = tf.apply(user_activity, axis=1)

tf['user.mean_activity'] = tf.drop_duplicates(subset='user.id', keep='last')['user.activity'].mean()

tf['user.tweets_likes_activity'] = tf.apply(user_tweets_likes_activity, axis=1)


In [28]:
print(time.time() - starttime)

30.666560649871826


## Sperate Text Data

In [29]:
# Seperate text dataframe
txtf = pd.DataFrame()

txt_out_path = 'tweets_preproc_{}_{}_tweet_text.csv'.format(data_name, data_date)

In [30]:
tknzr = TweetTokenizer(preserve_case=False)
stops = Counter(stopwords.words('german') + stopwords.words('english'))

def anon_text(row):
    text = get_entity(row, 'text')
    
    # anon mentions and replace shortened urls with expanded versions
    mentions_and_urls = get_entity(row, 'user_mentions') + get_entity(row, 'urls')
    mentions_and_urls.sort(key=lambda e: e['indices'][0])
    
    offset = 0
    for mu in mentions_and_urls:
        if 'screen_name' in mu:
            # mention
            mention = mu
            user_id = str(conv_user_id(mention['id_str']))
            indices = mention['indices']

            text = text[:indices[0]+offset] + '@user' + user_id + text[indices[1]+offset:]
            offset += len('@user' + user_id) - (indices[1] - indices[0])
        else:
            # url
            url = mu
            expanded_url = url['expanded_url']
            indices = url['indices']

            text = text[:indices[0]+offset] + expanded_url + text[indices[1]+offset:]
            offset += len(expanded_url) - (indices[1] - indices[0])

    # remove stop words
    text = [word for word in tknzr.tokenize(text) if word not in stops]
    
    # remove link to tweet, which is present if it contains media
    if count_entity(row, 'media') > 0:
        text = text[:-1]
    
    return text


In [31]:
starttime = time.time()

In [32]:
txtf['id'] = tf['id']

txtf['text'] = df.apply(anon_text, axis=1)

txtf['text_sorted'] = txtf['text'].apply(sorted)

In [33]:
print(time.time() - starttime)

12.43366551399231


## Save dataframe as CSV file

In [34]:
display(tf)

Unnamed: 0,id,timestamp,time,hour,date,weekday_enc,weekday,first_day,last_day,source,...,user.url,user.listed_count,user.listed_count_mean,user.last_tweet,user.statuses_count,user.favourites_count,user.account_age,user.activity,user.mean_activity,user.tweets_likes_activity
2,3,2019-08-23 00:32:06+00:00,00:32:06,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,False,0,77.977731,10503,3251,0,1255,2.735458,11.396612,2.590438
10,11,2019-08-23 04:52:26+00:00,04:52:26,4,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,0,77.977731,41888,7,14,46,0.543478,11.396612,0.456522
13,14,2019-08-23 05:08:56+00:00,05:08:56,5,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,9,77.977731,14,1040,1137,1742,0.597015,11.396612,1.249713
44,45,2019-08-23 06:45:53+00:00,06:45:53,6,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,True,75,77.977731,51491,63417,32790,3742,17.460983,11.396612,25.710048
70,71,2019-08-23 07:55:30+00:00,07:55:30,7,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,False,0,77.977731,54250,1679,1984,993,3.762336,11.396612,3.688822
73,74,2019-08-23 07:59:09+00:00,07:59:09,7,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,9,77.977731,74,1186,763,1349,0.879170,11.396612,1.444774
82,83,2019-08-23 08:12:06+00:00,08:12:06,8,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,4,77.977731,44616,76820,15,567,145.042328,11.396612,135.511464
88,89,2019-08-23 08:20:21+00:00,08:20:21,8,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,853,77.977731,51679,128815,1944,3757,34.937450,11.396612,34.804099
90,91,2019-08-23 08:21:43+00:00,08:21:43,8,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,31,77.977731,53092,4368,731,2403,1.910529,11.396612,2.121931
92,93,2019-08-23 08:25:45+00:00,08:25:45,8,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,4,77.977731,55512,17448,30060,732,17.685792,11.396612,64.901639


In [35]:
display(txtf)

Unnamed: 0,id,text,text_sorted
2,3,"[fellner, !, live, :, sommergespräch, beate, m...","[!, :, beate, fellner, https://www.youtube.com..."
10,11,"[freu, schon, heute, ., @user6599683]","[., @user6599683, freu, heute, schon]"
13,14,[https://tvthek.orf.at/profile/Dok-1/13844820/...,"[#hysterie, #verbote, #verbotswahn, ,, ?, @use..."
44,45,"[rt, @user6796415, :, @user5709541, ,, #fpö, ,...","[#fpö, ,, ,, ,, ,, ,, ., ., :, @user0, @user42..."
70,71,"[övp, kopiert, immer, öfters, fpö, forderungen...","[!, !, ?, forderungen, fpö, freut, grünen, htt..."
73,74,"[vorteil, fpö, wahlkampf, :, einzige, ,, vorhe...","[,, ,, ,, ., ., ., :, :, einzige, fpö, grün-ne..."
82,83,"[fpö-brückl, :, „, neos, positionieren, steuer...","[:, fpö-brückl, https://brandaktuell.at/2019/0..."
88,89,"[., @user894087, gastronom, kultursprecher, @u...","["", "", ., ., ., :, @user8437691, @user894087, ..."
90,91,"[angehende, student, #philippvelich, wünscht, ...","[#klimaschutz, #philippvelich, #talkimhangar7,..."
92,93,"[hochschulreife, dürfte, heutzutage, zufallspr...","[dürfte, heutzutage, hochschulreife, vergeben,..."


In [22]:
# Save dataframe as CSV file
tf.to_csv(out_path, index=False)
txtf.to_csv(txt_out_path, index=False)