# Load Twitter Dataset

This notebook loads the 
[#BTW17 Twitter Dataset](https://zenodo.org/record/835735), extract the tweets and the hashtags and save them in `data/all_tweets` and `data/all_hashtags` files.


In [1]:
from pathlib import Path
import os
import json
# from nltk.corpus import stopwords
# import re
# import pandas as pd
# from wordcloud import WordCloud
# import matplotlib.pyplot as plt
import pickle

# Import one file

In [8]:
path = Path('data/recorded-tweets')
files = os.listdir(path)

In [9]:
len(files)

1310

In [10]:
files[0]

'chunk-001f0bf7-d12d-45c8-8b6e-27b7a88a01f5.json'

In [11]:
# Open one json file
with open(path / files[0]) as f:
    data = json.load(f)

In [12]:
type(data[0])

dict

In [13]:
list(data[0].keys())

['created_at',
 'id',
 'id_str',
 'text',
 'display_text_range',
 'source',
 'truncated',
 'in_reply_to_status_id',
 'in_reply_to_status_id_str',
 'in_reply_to_user_id',
 'in_reply_to_user_id_str',
 'in_reply_to_screen_name',
 'user',
 'geo',
 'coordinates',
 'place',
 'contributors',
 'is_quote_status',
 'retweet_count',
 'favorite_count',
 'entities',
 'extended_entities',
 'favorited',
 'retweeted',
 'possibly_sensitive',
 'filter_level',
 'lang',
 'timestamp_ms']

In [14]:
data[0]

{'created_at': 'Wed Aug 02 21:50:58 +0000 2017',
 'id': 892865330464382976,
 'id_str': '892865330464382976',
 'text': '@Volker_Beck Volker, keine Angst...wir vergessen nix!!! https://t.co/UVwj5XseP9',
 'display_text_range': [13, 55],
 'source': '<a href="http://twitter.com/download/android" rel="nofollow">Twitter for Android</a>',
 'truncated': False,
 'in_reply_to_status_id': 892863573843726342,
 'in_reply_to_status_id_str': '892863573843726342',
 'in_reply_to_user_id': 16337664,
 'in_reply_to_user_id_str': '16337664',
 'in_reply_to_screen_name': 'Volker_Beck',
 'user': {'id': 805053174621073408,
  'id_str': '805053174621073408',
  'name': '🏅Monsieur Oh Lala🏅',
  'screen_name': 'TheBrutalinski',
  'location': 'Neusiedl am See, Österreich',
  'url': None,
  'description': 'In einer irrsinnigen Welt vernünftig sein zu wollen ist schon wieder ein Irrsinn.\xa0 Natürlich alles Satire!!!',
  'protected': False,
  'verified': False,
  'followers_count': 39,
  'friends_count': 200,
  'listed_

In [15]:
print("TEXT:")
print(data[10]["text"])

print("\nNAME:")
print(data[10]['quoted_status']["user"]["name"])

print("\nFULL TEXT:")
print(data[10]['quoted_status']["extended_tweet"]["full_text"])

TEXT:
RT @FDPAussteigerin: Nur #AfD fordert Senkung der Mehrwertsteuer. Die #FDP gerade nicht! https://t.co/9xYytsPVT7

NAME:
Alternative für 🇩🇪

FULL TEXT:
Schön, dass unsere Forderung nun Erwähnung findet:
Senkung der #Mehrwertsteuer ist ein Alleinstellungsmerkmal der #AfD!
#TrauDichDeutschland https://t.co/C21N3uJ5U1


In [16]:
tweets = [0, 11, 12, 13, 153, 253, 366]

for tweet in tweets:
    # print("TEXT:")
    print(data[tweet]["text"])
    # print("NAME:")
    # print(data[tweet]["user"]["name"])
    print("-"*80)

@Volker_Beck Volker, keine Angst...wir vergessen nix!!! https://t.co/UVwj5XseP9
--------------------------------------------------------------------------------
RT @Beatrix_vStorch: CDU/CSU bedeutet: Keine Obergrenze, Familiennachzug in Millionenhöhe und Resettlement. Nur die #AfD! https://t.co/CnAl…
--------------------------------------------------------------------------------
RT @Beatrix_vStorch: Nur gute Erfahrungen-u warum zum Teufel sollen die Mitarbeiter d Verwaltung jetzt Selbstverteidigungskurse belegen? ht…
--------------------------------------------------------------------------------
RT @greenpeacemag: Deutliche Worte von @marcobuelow! Die wichtigsten Fakten zum #Dieselgipfel gibt´s hier: https://t.co/nZ3tG8U3K0 https://…
--------------------------------------------------------------------------------
RT @Beatrix_vStorch: Die Türkei ist eine islamistische Diktatur geworden. Trotzdem sind CDU/CSU/SPD/Grüne/Linke/FDP nicht bereit die Ver… 
----------------------------------

In [17]:
hashtag_list = []

for tweet in range(10):
    hashtags = data[tweet]["entities"]["hashtags"]
    hashtag_text = []
    for ht in range(len(hashtags)):
        hashtag_text.append(data[tweet]["entities"]["hashtags"][ht]["text"])
    print(f'{tweet} -- {hashtag_text}')
    hashtag_list.append(hashtag_text)

0 -- []
1 -- ['Dieselgipfel']
2 -- []
3 -- []
4 -- []
5 -- ['Erdogan']
6 -- ['AfD', 'Dieselgipfel']
7 -- []
8 -- ['Sanktionen', 'Russland']
9 -- []


In [18]:
hashtag_list

[[],
 ['Dieselgipfel'],
 [],
 [],
 [],
 ['Erdogan'],
 ['AfD', 'Dieselgipfel'],
 [],
 ['Sanktionen', 'Russland'],
 []]

In [19]:
# save all id in all_ids
all_ids = []
for tweet in range(len(data)):
    all_ids.append(data[tweet]["user"]["id"])

In [20]:
print(f"Number of tweets: {len(data)}")
print(f"Number of unique IDs: {len(pd.Series(all_ids).unique())}")
# 

Number of tweets: 505
Number of unique IDs: 278


In [21]:
# First 10 IDs with most tweets (only considering one file, loaded above)
pd.Series(all_ids).value_counts()[:10]

3364793391            41
3041102615            31
3448073356            17
884041595452096513    10
826051268812611584     8
210354218              8
738771661915344897     6
871334212556783618     6
784903714045829121     6
892012440443080704     5
Name: count, dtype: int64

# Import all files

In [22]:
def load_one_file(i):
    with open(path / files[i]) as f:
        data = json.load(f)
    return data


def extract_text_hashtag(data):
    tweets = []
    hashtags = []
    for tweet in range(len(data)):
        tweets.append(data[tweet]["text"])
        N_hashtags = len(data[tweet]["entities"]["hashtags"])
        hashtag_list = []
        for ht in range(N_hashtags):
            hashtag_list.append(data[tweet]["entities"]["hashtags"][ht]["text"])
        hashtags.append(hashtag_list)
    return tweets, hashtags


def load_all_files():
    all_tweets = []
    all_hashtags = []
    for i in range(len(files)):
        data_tmp = load_one_file(10)
        tweets, hashtags = extract_text_hashtag(data_tmp)
        all_tweets.append(tweets)
        all_hashtags.append(hashtags)
    return all_tweets, all_hashtags

loadind all the files takes about 2 minutes

In [23]:
all_tweets, all_hashtags = load_all_files()

In [25]:
print(f"Length all_tweets: {len(all_tweets)}")
print(f"Length all_tweets[0]: {len(all_tweets[0])}")
print(f"Length all_hashtags: {len(all_hashtags)}")
print(f"Length all_hashtags[0]: {len(all_hashtags[0])}")

Length all_tweets: 1310
Length all_tweets[0]: 1000
Length all_hashtags: 1310
Length all_hashtags[0]: 1000


# Save files

In [26]:
path = Path('data', 'all_tweets')
with open(path, "wb") as fp:
    pickle.dump(all_tweets, fp)

In [27]:
path = Path('data', 'all_hashtags')
with open(path, "wb") as fp:
    pickle.dump(all_hashtags, fp)