# Preprocessing of Twitter Data
Author: Florian Lugstein (flugstein@cs.sbg.ac.at)  
Date: 2020-07-29

In [1]:
import json
import time
import copy
import numpy as np
import pandas as pd
from pandas.io.json import json_normalize
from collections import Counter
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import TweetTokenizer
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/flugstein/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

## Choose Dataset

In [2]:
# Datasets that can be preprocessed and the crawl date belonging to each of them
options = ['vegan', 'neos', 'fpoe', 'schalke', 'bvb']
data_dates = {'vegan': '20200407', 'neos': '20200311', 'fpoe': '20200311', 'schalke': '20200409', 'bvb': '20200409'}

# Keep asking until a known dataset name is entered; the hint is only printed after a wrong answer
prompt = 'Choose dataset to preprocess: '
while True:
    data_name = input(prompt)
    if data_name in options:
        break
    print('Choose one of ({})'.format(', '.join(options)))
    prompt = 'Choose the dataset to preprocess: '

data_date = data_dates[data_name]

Choose dataset to preprocess: vegan


## Read Dataset JSON File

In [3]:
tweets_path = 'tweets/hpt_{}_tweets_utf8_sorted.json'.format(data_name)

In [4]:
# Flattened (dot-separated) column names that the big-data loader keeps via
# `frame.columns.intersection(used_columns)`; every other column is dropped there.
used_columns = [
    # fields of the tweet itself
    'id_str',
    'truncated',
    'text',
    'entities.hashtags',
    'entities.urls',
    'entities.user_mentions',
    'entities.polls',
    'entities.media',
    # extended_tweet variant of the tweet's text and entities
    'extended_tweet.full_text',
    'extended_tweet.entities.hashtags',
    'extended_tweet.entities.urls',
    'extended_tweet.entities.user_mentions',
    'extended_tweet.entities.polls',
    'extended_tweet.entities.media',
    # retweeted_status fields (plain and extended_tweet variants)
    'retweeted_status.id_str',
    'retweeted_status.text',
    'retweeted_status.entities.hashtags',
    'retweeted_status.entities.urls',
    'retweeted_status.entities.user_mentions',
    'retweeted_status.entities.media',
    'retweeted_status.extended_tweet.full_text',
    'retweeted_status.extended_tweet.entities.hashtags',
    'retweeted_status.extended_tweet.entities.urls',
    'retweeted_status.extended_tweet.entities.user_mentions',
    'retweeted_status.extended_tweet.entities.media',
    # quoted_status fields (plain and extended_tweet variants)
    'quoted_status.id_str',
    'quoted_status.text',
    'quoted_status.entities.hashtags',
    'quoted_status.entities.urls',
    'quoted_status.entities.user_mentions',
    'quoted_status.entities.media',
    'quoted_status.extended_tweet.full_text',
    'quoted_status.extended_tweet.entities.hashtags',
    'quoted_status.extended_tweet.entities.urls',
    'quoted_status.extended_tweet.entities.user_mentions',
    'quoted_status.extended_tweet.entities.media',
    # reply reference, counters and creation time of the tweet
    'in_reply_to_status_id_str',
    'quote_count',
    'retweet_count',
    'created_at',
    # fields of the tweeting user
    'user.id_str',
    'user.followers_count',
    'user.friends_count',
    'user.verified',
    'user.default_profile',
    'user.default_profile_image',
    'user.url',
    'user.listed_count',
    'user.statuses_count',
    'user.favourites_count',
    'user.created_at'
]

In [5]:
# For small data

starttime = time.time()

# Parse one JSON document per line of the tweets file
with open(tweets_path, 'r', encoding='utf8') as f:
    records = [json.loads(line) for line in f]

# Flatten the nested tweet objects into dot-separated columns
df = json_normalize(records)
# df = df[df.columns.intersection(used_columns)] # only store used columns
df = df.fillna(0)  # missing values are represented by 0 in the rest of the notebook

print(time.time() - starttime)

22.954254388809204


In [None]:
# For big data

starttime = time.time()  # was misspelled 'startime', so the prints below read an undefined/stale name
chunksize = 100000


def _normalize_chunk(records):
    """Flatten a list of parsed tweets into a dataframe restricted to `used_columns`.

    Missing values are replaced by 0, which the rest of the notebook uses as
    the "not present" marker.
    """
    frame = json_normalize(records)
    frame = frame[frame.columns.intersection(used_columns)]  # only store used columns
    return frame.fillna(0)  # replace NaNs with zeros


frames = []
lines = []
with open(tweets_path, 'r', encoding='utf8') as f:
    for line in f:
        lines.append(json.loads(line))
        # Flush as soon as exactly `chunksize` records are buffered
        if len(lines) == chunksize:
            frames.append(_normalize_chunk(lines))
            lines = []

# Flush the remainder; an empty file still yields one (empty) frame so that concat works
if lines or not frames:
    frames.append(_normalize_chunk(lines))

print(len(frames))
print(time.time() - starttime)
df = pd.concat(frames, ignore_index=True, sort=False)
print(time.time() - starttime)

In [6]:
# View dataframe
# Report the size of the loaded dataframe, then render it
n_tweets, n_features = df.shape
print('{} tweets with {} features in dataset'.format(n_tweets, n_features))
print((n_tweets, n_features))
display(df)

# Documentation: https://developer.twitter.com/en/docs/tweets/data-dictionary/overview/intro-to-tweet-json
# Examples: https://gwu-libraries.github.io/sfm-ui/posts/2016-11-10-twitter-interaction


22325 tweets with 390 features in dataset
(22325, 390)


Unnamed: 0,contributors,coordinates,coordinates.coordinates,coordinates.type,created_at,display_text_range,entities.hashtags,entities.media,entities.polls,entities.symbols,...,user.profile_use_background_image,user.protected,user.screen_name,user.statuses_count,user.time_zone,user.translator_type,user.updated,user.url,user.utc_offset,user.verified
0,0,0.0,0,0,Fri Aug 23 00:08:41 +0000 2019,"[6, 18]",[],0,0,[],...,True,False,Peregrinetrade,1907,0,none,0,0,0,False
1,0,0.0,0,0,Fri Aug 23 00:26:51 +0000 2019,0,[],0,0,[],...,True,False,xoryps,39239,0,none,0,0,0,False
2,0,0.0,0,0,Fri Aug 23 00:41:15 +0000 2019,"[11, 24]",[],0,0,[],...,True,False,Johnnyrimes,1403,0,none,[description],0,0,False
3,0,0.0,0,0,Fri Aug 23 00:42:43 +0000 2019,0,[],0,0,[],...,True,False,PeKo40678598,210,0,none,[name],0,0,False
4,0,0.0,0,0,Fri Aug 23 00:48:33 +0000 2019,0,[],0,0,[],...,True,False,Donturunfromme,65844,0,none,0,0,0,False
5,0,0.0,0,0,Fri Aug 23 00:55:48 +0000 2019,0,[],0,0,[],...,True,False,Knifnek,34726,0,none,0,0,0,False
6,0,0.0,0,0,Fri Aug 23 00:57:58 +0000 2019,0,[],0,0,[],...,True,False,derllad,28664,0,none,"[description, name]",0,0,False
7,0,0.0,0,0,Fri Aug 23 01:06:22 +0000 2019,0,[],0,0,[],...,False,False,GuardNightmane,116569,0,none,[description],0,0,False
8,0,0.0,0,0,Fri Aug 23 02:11:12 +0000 2019,"[0, 55]","[{'text': 'vegan', 'indices': [49, 55]}]","[{'id': 1164721721049399296, 'id_str': '116472...",0,[],...,True,False,Veg_Ezy_,36790,0,none,"[description, name, location]",http://www.vegezy.com,0,False
9,0,0.0,0,0,Fri Aug 23 02:14:00 +0000 2019,0,[],0,0,[],...,True,False,NaneJuergensen,277875,0,none,0,http://www.keine-tricks-nur-jesus.de,0,False


## Read Transition Table

In [7]:
user_trans_path = 'trans_tables/transitiontable_reduced_{}_{}.txt'.format(data_name, data_date)

# Each line holds two space-separated fields: the first becomes the key,
# the second is stored as a 64-bit integer value
with open(user_trans_path, 'r') as f:
    user_trans_table = {
        fields[0]: np.int64(fields[1])
        for fields in (line.split(' ') for line in f)
    }


In [8]:
print('We have ' + str(len(user_trans_table)) + ' user ids in the transition table.')

We have 12088 user ids in the transition table.


## Create Output Dataframe

In [9]:
# Create output dataframe
tf = pd.DataFrame()

out_path = 'tweets_preproc_{}_{}.csv'.format(data_name, data_date)

## Tweet Features

### Ids

In [10]:
# Ids starting at 1

tf['id'] = np.int64(df.index + 1)


def id2index(id_):
    """Translate a 1-based tweet id into the 0-based index label it was derived from."""
    return id_ - 1


# dict to convert old ids ('id_str') to new ids (index label + 1)
old_to_new_id = {}
for position, old_id in df['id_str'].items():
    old_to_new_id[old_id] = np.int64(position + 1)


### Helper Functions

In [11]:
# Helper functions


# Convert old id to new id for 'reply_of' or 'retweet_of' or 'quoted_of' column
def conv_id(old_id):
    """Translate an original tweet id into the notebook's new id.

    Returns 0 both when `old_id` is 0 (tweet is not a reply/retweet/quote)
    and when the parent tweet is not contained in `old_to_new_id`.
    """
    if old_id != 0:
        return old_to_new_id.get(old_id, 0)
    return 0


# Remove source tweets and their retweets x days before the last day of the dataset, to fix retweet statistics, since most retweets occur x days after the original tweet
# Returns true if tweet should be removed
# Row of tf
def retweet_fix(row, cutoff_days=3):
    """Decide whether a row of `tf` has to be removed for the retweet fix.

    A pure source tweet is removed when its date plus `cutoff_days` lies after
    the row's 'last_day'; a retweet is removed when the same holds for the
    date of the tweet it points to via 'retweet_of'.

    Parameters
    ----------
    row : mapping with the keys 'pure_source', 'date', 'last_day', 'retweet_of'
    cutoff_days : int, default 3
        Length of the cut-off window in days.

    Returns
    -------
    bool
        True if the tweet should be removed.
    """
    def past_cutoff(date_):
        # pd.Timestamp/pd.Timedelta replace the removed `pd.datetime` alias;
        # .date() brings the result back to a plain date for the comparison
        return (pd.Timestamp(date_) + pd.Timedelta(days=cutoff_days)).date() > row['last_day']

    if row['pure_source'] == True:
        return bool(past_cutoff(row['date']))
    if row['retweet_of'] != 0:
        # look up the date of the original tweet in the output dataframe
        return bool(past_cutoff(tf.loc[id2index(row['retweet_of'])]['date']))
    return False


# Get the id of the original tweet for a retweet/quote
# Row of df
def retweet_of(row):
    """Return the new id of the tweet that a retweet/quote refers to, or 0.

    'quoted_status' is checked before 'retweeted_status' because retweets of
    quotes carry both fields and 'quoted_status' holds the original source tweet.
    """
    for parent_column in ('quoted_status.id_str', 'retweeted_status.id_str'):
        parent_id = row[parent_column]
        if parent_id != 0:
            return conv_id(parent_id)
    return 0


# Get retweet count for id using retweet_counts: series of (id, n_retweeted)
def get_retweet_count(id_):
    """Look up `id_` in the global `retweet_counts` series; ids that are absent count as 0."""
    return retweet_counts.loc[id_] if id_ in retweet_counts else 0


# status_type: 'retweeted_status', 'quoted_status', ''
def get_entity_by_status_type(row, entity_name, status_type):
    """Read the text or an entity list of one status embedded in `row`.

    `status_type` selects the column prefix ('retweeted_status',
    'quoted_status' or '' for the tweet itself). When the selected status has
    an 'extended_tweet.full_text' different from 0, the extended_tweet columns
    are used, otherwise the plain ones.
    """
    prefix = status_type + '.' if status_type != '' else ''
    is_extended = row[prefix + 'extended_tweet.full_text'] != 0

    if entity_name == 'text':
        column = 'extended_tweet.full_text' if is_extended else 'text'
    else:
        entities_root = 'extended_tweet.entities.' if is_extended else 'entities.'
        column = entities_root + entity_name

    return row[prefix + column]

    
# Returns the correct entity depending on tweet type and does merging for quotes
# entity_name: 'text', 'hashtags', 'urls', 'user_mentions', 'polls', 'media'
def get_entity(row, entity_name):
    """Return the text or entity list that represents the tweet in `row`.

    - retweet of a quote: retweeted status merged with the quoted status
    - quote: the tweet itself merged with the quoted status
    - retweet: the retweeted status
    - otherwise (pure source or reply): the tweet itself

    Merged texts are joined by a single space; the 'indices' of the quoted
    status' entities are shifted accordingly.
    """
    def merge_with_quote(commenting_status):
        # `commenting_status` is the status that carries the comment on the quoted tweet
        comment_part = get_entity_by_status_type(row, entity_name, commenting_status)
        quoted_part = get_entity_by_status_type(row, entity_name, 'quoted_status')

        if entity_name == 'text':
            return comment_part + ' ' + quoted_part  # merge text

        # 0 marks a missing entity; nothing to merge then
        if quoted_part == 0:
            return comment_part
        if comment_part == 0:
            return quoted_part

        # the quoted text is appended after the comment text plus one space
        shift = len(get_entity_by_status_type(row, 'text', commenting_status)) + len(' ')

        shifted = copy.deepcopy(quoted_part)  # do not modify the dataframe's own objects
        for item in shifted:
            if 'indices' in item:
                item['indices'][0] += shift
                item['indices'][1] += shift

        return comment_part + shifted  # merge lists

    is_quote = row['quoted_status.id_str'] != 0
    is_retweet = row['retweeted_status.id_str'] != 0

    if is_quote and is_retweet:
        return merge_with_quote('retweeted_status')
    if is_quote:
        return merge_with_quote('')
    if is_retweet:
        return get_entity_by_status_type(row, entity_name, 'retweeted_status')
    return get_entity_by_status_type(row, entity_name, '')


# Count hashtags/urls/mentions/media for one tweet
# Row of df
def count_entity(row, entity_name):
    """Number of items in the given entity of a `df` row; a missing entity (0) counts as 0."""
    entity = get_entity(row, entity_name)
    return 0 if entity == 0 else len(entity)

    
# Count media of certain type for one tweet
# Row of df
def count_media(row, type_):
    """Number of media items of a `df` row whose 'type' equals `type_`; missing media (0) counts as 0."""
    media = get_entity(row, 'media')
    if media == 0:
        return 0
    return sum(1 for item in media if item['type'] == type_)


# Count text length of one tweet
# Row of df
def count_text(row):
    """Length of the (possibly merged) text of a `df` row."""
    text = get_entity(row, 'text')
    return len(text)


In [100]:
# Inspect one example row: ids first, then each text variant preceded by an empty line
sample_row = df.iloc[3757]
print(sample_row['id_str'])
print(sample_row['user.screen_name'])
for text_column in ('text', 'retweeted_status.text', 'quoted_status.text'):
    print('')
    print(sample_row[text_column])

1167067244830109696
squadrat

RT @Kiki1Bail: Der @neos_eu Balken ist ja interessant 🤔
👍Gefällt mir. 
Aber diesmal: 💚 https://t.co/I0QnyCLJSe https://t.co/hplsWofpoF

Der @neos_eu Balken ist ja interessant 🤔
👍Gefällt mir. 
Aber diesmal: 💚 https://t.co/I0QnyCLJSe https://t.co/hplsWofpoF

Nationalratswahl: Wie wählen Homosexuelle? Eine Studie zeigt Wahlmotive und Unterschiede zu Heterosexuellen auf. https://t.co/KxTikzdgHS


In [120]:
get_entity(df.iloc[3757], 'text')

'Der @neos_eu Balken ist ja interessant 🤔\n👍Gefällt mir. \nAber diesmal: 💚 https://t.co/I0QnyCLJSe https://t.co/hplsWofpoF Nationalratswahl: Wie wählen Homosexuelle? Eine Studie zeigt Wahlmotive und Unterschiede zu Heterosexuellen auf. https://t.co/KxTikzdgHS'

### Time/Date Columns

In [12]:
starttime = time.time()

In [13]:
# Time/date columns of a tweet

tf['timestamp'] = pd.to_datetime(df['created_at'])  # timestamp with second accuracy
timestamps = tf['timestamp']

tf['time'] = timestamps.dt.time  # only time, regardless of date
tf['hour'] = timestamps.dt.hour  # only hour, regardless of date
tf['date'] = timestamps.dt.date  # only date

# weekday encoded (Monday == 0, Tuesday == 1, ..., Sunday == 6) and as string (Monday, ..., Sunday)
tf['weekday_enc'] = timestamps.dt.dayofweek
tf['weekday'] = timestamps.dt.day_name()

# dates of the rows labelled 0 and len(tf) - 1, broadcast to every row
tf['first_day'] = tf['date'].loc[0]  # first day of any tweet in the dataset
tf['last_day'] = tf['date'].loc[len(tf) - 1]  # last day of any tweet in the dataset (without retweet fix)


In [14]:
print(time.time() - starttime)

3.836796760559082


### Tweet Types

In [15]:
starttime = time.time()

In [16]:
# Tweet types

# Pure source
# not retweet, not quote, not reply

# Per-tweet type flags taken from the raw dataframe (0 marks a missing field)
is_retweet = df['retweeted_status.id_str'] != 0
is_quote = df['quoted_status.id_str'] != 0
is_reply = df['in_reply_to_status_id_str'] != 0

# Pure source: not retweet, not quote, not reply
tf['pure_source'] = ~is_retweet & ~is_quote & ~is_reply


# Retweet (quotes are counted as retweets)
# 'retweet'==True and 'retweet_of'==0 => tweet is a retweet, but parent is not in dataset
# 'retweeted_count_int' => number of retweets of this tweet in the dataset
# 'retweeted_count_ext' => number of retweets of this tweet in and outside the dataset
# 'retweeted'==True => there is a retweet of this tweet in the dataset (retweets of retweets also count)
tf['retweet'] = is_retweet | is_quote

tf['retweet_of'] = df.apply(retweet_of, axis=1)

# Retweet fix: drop source tweets and their retweets/quotes that fall into the
# cut-off window before the last day of the dataset, since most retweets/quotes
# occur some days after the original tweet
to_remove = tf.apply(retweet_fix, axis=1)  # series of (id, to_remove)
removed_index = to_remove[to_remove == True].index
tf.drop(removed_index, inplace=True)
print('{} tweets removed for retweet fix'.format(len(removed_index)))
del to_remove, removed_index

# series of (id, n_retweeted): how often each tweet got retweeted within the dataset
retweet_counts = tf.loc[tf['retweet_of'] != 0, 'retweet_of'].value_counts()
tf['retweeted_count_int'] = tf['id'].apply(get_retweet_count)
del retweet_counts

tf['retweeted_count_ext'] = df['retweet_count'] + df['quote_count']  # TODO currently always 0

tf['retweeted'] = tf['retweeted_count_int'] != 0


# Retweet with comment (quote)
# in the twitter api, retweets of quotes have both 'quoted_status' and 'retweeted_status' fields
    # source tweet (quoted_status) -> quote of source (retweeted_status) -> retweet of quote
    # 'quoted_status' contains the original source tweet, 'retweeted_status' the quote tweet
# 'retweet_with_comment'==True and 'retweet_with_comment_of'==0 => tweet is a retweet_with_comment, but parent is not in dataset
tf['retweet_with_comment'] = is_quote


# Reply
# 'reply'==True and 'reply_of'=0 => tweet is a reply, but parent is not in dataset
tf['reply'] = is_reply

tf['reply_of'] = df['in_reply_to_status_id_str'].apply(conv_id)


966 tweets removed for retweet fix


In [17]:
print(time.time() - starttime)

6.760125398635864


### Tweet Content

In [18]:
starttime = time.time()

In [19]:
# Tweet content
# If tweet is not extended, use standard tweet
# If tweet is extended, use extended_tweet
# If tweet is a retweet, use retweeted_status
# If tweet is a retweet and the original tweet is extended, use retweeted_status.extended_tweet


# Hashtags

# Hashtags, URLs, mentions: a count column plus a boolean "has any" column each
for column_prefix, entity_name in (('hashtag', 'hashtags'), ('url', 'urls'), ('mention', 'user_mentions')):
    tf[column_prefix + '_count'] = df.apply(count_entity, args=(entity_name,), axis=1)
    tf[column_prefix] = tf[column_prefix + '_count'] != 0


# Polls
tf['poll'] = (df['entities.polls'] != 0) | (df['extended_tweet.entities.polls'] != 0)


# Media: overall count/flag, then one count column per media type
tf['media_count'] = df.apply(count_entity, args=('media',), axis=1)
tf['media'] = tf['media_count'] != 0

for media_type in ('photo', 'video', 'animated_gif'):
    tf[media_type + '_count'] = df.apply(count_media, args=(media_type,), axis=1)


# Text length and its median over all remaining tweets
tf['text_length'] = df.apply(count_text, axis=1)
tf['text_length_median'] = tf['text_length'].median()


In [20]:
print(time.time() - starttime)

12.909547567367554


## User Features

### Helper Functions

In [21]:
# Helper functions


# Convert old to new user id using transition table
def conv_user_id(id_):
    """Translate an original user id via `user_trans_table`; unknown ids map to 0."""
    return user_trans_table.get(id_, 0)

'''
# Return column value from last tweet of user
def last_tweet_fixup(row, column):
    # return tf[tf['id'] == row['user.last_tweet']].iloc[0][column]
    return tf.loc[id2index(row['user.last_tweet'])][column] # NEW
'''

# How active is the user in terms of produced statuses (tweets/retweets)
def user_activity(row):
    """Statuses per day of account age, using the statuses count stored at the user's last tweet.

    Returns None when the account age is 0.
    """
    account_age = row['user.account_age']
    if account_age == 0:
        return None
    last_tweet_row = tf.loc[id2index(row['user.last_tweet'])]
    return last_tweet_row['user.statuses_count'] / row['user.account_age']


# How active is the user in terms of statuses and likes
def user_tweets_likes_activity(row):
    """(statuses + favourites) per day of account age; None when the account age is 0."""
    if row['user.account_age'] == 0:
        return None
    interactions = row['user.statuses_count'] + row['user.favourites_count']
    return interactions / row['user.account_age']


# In which quantile is the property
def quantile(prop_value, q1, q2, q3):
    """Return the rank (0-3) of the first boundary that `prop_value` is strictly below.

    Boundaries are tested in the order q1, q2, q3; 3 is returned when the
    value is below none of them.
    """
    for rank, boundary in enumerate((q1, q2, q3)):
        if prop_value < boundary:
            return rank
    return 3


### User ID

In [22]:
# User ID

tf['user.id'] = df['user.id_str'].apply(conv_user_id)

# conv_user_id yields 0 for ids missing from the transition table
n_not_found_ids = int((tf['user.id'] == 0).sum())
if n_not_found_ids:
    print('Warning: ' + str(n_not_found_ids) + ' user ids could not be found in the transition table.')




### Followers

In [23]:
# Followers

# Copy follower/followee counts and bucket each by the quartile boundaries of its own column
for source_column, target_column in (
    ('user.followers_count', 'user.followers_count'),
    ('user.friends_count', 'user.followees_count'),
):
    tf[target_column] = df[source_column]
    q1, q2, q3 = (tf[target_column].quantile(q=fraction) for fraction in (0.25, 0.5, 0.75))
    tf[target_column + '_quantile'] = tf[target_column].apply(quantile, args=(q1, q2, q3))

del q1, q2, q3

### Profile Info

In [24]:
# Profile info

# Columns copied unchanged from the raw dataframe
for profile_column in ('user.verified', 'user.default_profile', 'user.default_profile_image'):
    tf[profile_column] = df[profile_column]

# True when the user has a profile url (0 marks a missing url)
tf['user.url'] = df['user.url'] != 0

tf['user.listed_count'] = df['user.listed_count']
tf['user.listed_count_mean'] = tf['user.listed_count'].mean()


### Activity

In [25]:
starttime = time.time()

In [26]:
# Series mapping each user id to the id of that user's last row in tf
last_tweet_by_user = (
    tf[['user.id', 'id']]
    .drop_duplicates(subset='user.id', keep='last')
    .set_index('user.id', drop=True)['id']
)
tf['user.last_tweet'] = tf['user.id'].map(last_tweet_by_user)
del last_tweet_by_user

In [27]:
# Activity

# Copy the user's status and favourite counters from the raw dataframe
tf['user.statuses_count'] = df['user.statuses_count']

tf['user.favourites_count'] = df['user.favourites_count']

# Account age in days: 'last_day' minus the date part of 'user.created_at'
tf['user.account_age'] = (tf['last_day'] - pd.to_datetime(df['user.created_at']).dt.date).dt.days
tf['user.account_age'] = tf['user.account_age'].astype(int)

# user_activity reads 'user.account_age', 'user.last_tweet' and 'user.statuses_count' from tf,
# so these columns have to exist before this call
tf['user.activity'] = tf.apply(user_activity, axis=1)

# Mean of 'user.activity' taken over one row per 'user.id' (the last one), stored on every row
tf['user.mean_activity'] = tf.drop_duplicates(subset='user.id', keep='last')['user.activity'].mean()

tf['user.tweets_likes_activity'] = tf.apply(user_tweets_likes_activity, axis=1)


In [28]:
print(time.time() - starttime)

18.763147830963135


## Separate Text Data

In [29]:
# Separate text dataframe
txtf = pd.DataFrame()

txt_out_path = 'tweets_preproc_{}_{}_tweet_text.csv'.format(data_name, data_date)

In [30]:
tknzr = TweetTokenizer(preserve_case=False)
stops = Counter(stopwords.words('german'))

def anon_text(row):
    """Anonymize and tokenize the text of one `df` row.

    Every user mention is replaced by '@user<new id>' (new id from
    `conv_user_id`), the text is tokenized with `tknzr`, and tokens contained
    in `stops` are removed.

    Returns
    -------
    list of str
        The remaining tokens.
    """
    # uncut text and the corresponding mentions
    text = get_entity(row, 'text')
    mentions = get_entity(row, 'user_mentions')

    # 0 is the placeholder for a missing entity (same check as in count_entity)
    if mentions == 0:
        mentions = []

    # Replace mentions from left to right; `offset` tracks how much the text
    # length changed so far, which is only valid when mentions are processed
    # in ascending start position, hence the sort
    offset = 0
    for mention in sorted(mentions, key=lambda m: m['indices'][0]):
        start, end = mention['indices']
        replacement = '@user' + str(conv_user_id(mention['id_str']))

        text = text[:start + offset] + replacement + text[end + offset:]
        offset += len(replacement) - (end - start)

    # remove stop words
    return [word for word in tknzr.tokenize(text) if word not in stops]


In [31]:
starttime = time.time()

In [32]:
txtf['id'] = tf['id']

txtf['text'] = df.apply(anon_text, axis=1)

In [33]:
print(time.time() - starttime)

6.265855073928833


## Save dataframe as CSV file

In [34]:
display(tf)

Unnamed: 0,id,timestamp,time,hour,date,weekday_enc,weekday,first_day,last_day,pure_source,...,user.url,user.listed_count,user.listed_count_mean,user.last_tweet,user.statuses_count,user.favourites_count,user.account_age,user.activity,user.mean_activity,user.tweets_likes_activity
0,1,2019-08-23 00:08:41+00:00,00:08:41,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,1,170.728171,1,1907,866,641,2.975039,13.777983,4.326053
1,2,2019-08-23 00:26:51+00:00,00:26:51,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,62,170.728171,19976,39239,13591,3490,11.412894,13.777983,15.137536
2,3,2019-08-23 00:41:15+00:00,00:41:15,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,0,170.728171,3,1403,1981,3831,0.366223,13.777983,0.883320
3,4,2019-08-23 00:42:43+00:00,00:42:43,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,0,170.728171,4,210,94,343,0.612245,13.777983,0.886297
4,5,2019-08-23 00:48:33+00:00,00:48:33,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,3,170.728171,5,65844,203,3410,19.309091,13.777983,19.368622
5,6,2019-08-23 00:55:48+00:00,00:55:48,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,9,170.728171,6,34726,12691,3591,9.670287,13.777983,13.204400
6,7,2019-08-23 00:57:58+00:00,00:57:58,0,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,1,170.728171,7,28664,99892,1807,15.862756,13.777983,71.143331
7,8,2019-08-23 01:06:22+00:00,01:06:22,1,2019-08-23,4,Friday,2019-08-23,2019-10-02,False,...,False,17,170.728171,8,116569,98589,1949,59.809646,13.777983,110.394048
8,9,2019-08-23 02:11:12+00:00,02:11:12,2,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,568,170.728171,13526,36790,20341,3698,10.138724,13.777983,15.449162
9,10,2019-08-23 02:14:00+00:00,02:14:00,2,2019-08-23,4,Friday,2019-08-23,2019-10-02,True,...,True,115,170.728171,10,277875,0,3791,73.298602,13.777983,73.298602


In [35]:
display(txtf)

Unnamed: 0,id,text
0,1,"[@user0, vegan, schlop]"
1,2,"[ja, ,, amazonas, brennt, wegen, faschismus, k..."
2,3,"[@user0, vegan, sausage]"
3,4,"[wen, überzeugter, veganer, ,, warum, kuck, ku..."
4,5,"[dammit, i, swear, them, 8, months, i, vegan, ..."
5,6,"[hey, ,, probiers, mal, ., suche, 1.10, ., wg-..."
6,7,"[vegan, rice, krispie, treat, the, last, thing..."
7,8,"[hey, ,, probiers, mal, ., suche, 1.10, ., wg-..."
8,9,"[essential, detox, -, vegezy, https://t.co/6DV..."
9,10,"[kind, vegan, ernährt, :, https://t.co/2gPTZZk..."


In [22]:
# Save dataframe as CSV file
tf.to_csv(out_path, index=False)
txtf.to_csv(txt_out_path, index=False)