## Data preparation

In [1]:
import json # read json files
from genson import SchemaBuilder # print json schema
import re
import string
from bson import json_util

### Load data

In [2]:
# March
# Each non-empty line of the export is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and the file is streamed line by line instead of being loaded whole
# with readlines(). Blank lines are skipped so a trailing newline cannot make
# json.loads fail.
with open('./tweets_mar_bson.json', 'r', encoding='utf-8') as f:
    mar = [json.loads(line) for line in f if line.strip()]

In [3]:
# April
# Each non-empty line of the export is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and the file is streamed line by line instead of being loaded whole
# with readlines(). Blank lines are skipped so a trailing newline cannot make
# json.loads fail.
with open('./tweets_apr_bson.json', 'r', encoding='utf-8') as f:
    apr = [json.loads(line) for line in f if line.strip()]

In [4]:
# May
# Each non-empty line of the export is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and the file is streamed line by line instead of being loaded whole
# with readlines(). Blank lines are skipped so a trailing newline cannot make
# json.loads fail.
with open('./tweets_may_bson.json', 'r', encoding='utf-8') as f:
    may = [json.loads(line) for line in f if line.strip()]

In [5]:
# June
# Each non-empty line of the export is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and the file is streamed line by line instead of being loaded whole
# with readlines(). Blank lines are skipped so a trailing newline cannot make
# json.loads fail.
with open('./tweets_jun_bson.json', 'r', encoding='utf-8') as f:
    jun = [json.loads(line) for line in f if line.strip()]

In [6]:
# July
# Each non-empty line of the export is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and the file is streamed line by line instead of being loaded whole
# with readlines(). Blank lines are skipped so a trailing newline cannot make
# json.loads fail.
with open('./tweets_july_bson.json', 'r', encoding='utf-8') as f:
    jul = [json.loads(line) for line in f if line.strip()]

In [7]:
# August
# Each non-empty line of the export is parsed as one JSON document.
# The encoding is pinned to UTF-8 so decoding does not depend on the platform
# locale, and the file is streamed line by line instead of being loaded whole
# with readlines(). Blank lines are skipped so a trailing newline cannot make
# json.loads fail.
with open('./tweets_aug_bson.json', 'r', encoding='utf-8') as f:
    aug = [json.loads(line) for line in f if line.strip()]

### Merge all the tweets

In [8]:
# Concatenate the monthly lists, March first and August last, into one list.
original_tweets = [*mar, *apr, *may, *jun, *jul, *aug]
print('In the original data set, we have', len(original_tweets), 'tweets.')

In the original data set, we have 886602 tweets.


In [9]:
# Example tweet: show the first raw record. A raw record is a positional list;
# judging by the sample output and by the format-transformation step, index 0
# is the tweet URL, index 1 a {'$date': ...} mapping, index 2 the tweet text,
# index 3 the tweet id and index 4 the user name.
original_tweets[0]

['https://twitter.com/DannyWijnhoud/status/1244776295109664769',
 {'$date': 1585612770000},
 '@nytimes @jelle_simons Theft in times of #Corona. I would like Vincent van Gogh could have made a painting of it.',
 1244776295109664769,
 'DannyWijnhoud',
 [],
 '',
 [],
 '']

### Exclude irrelevant tweets

In [10]:
# Keep only the records whose text (index 2 of the raw record) contains one of
# the keywords as a substring.
# NOTE(review): the substring test is case-sensitive, so "Beer" or "ALCOHOL"
# do not match -- confirm that this is intended.
tweets = [
    record
    for record in original_tweets
    if "beer" in record[2] or "alcohol" in record[2]
]

print('Now we have', len(tweets), 'tweets.')

Now we have 5071 tweets.


### Data cleaning

- Format transformation

In [11]:
column_names=("url","text","date","tweet_id","user_name","cleaned_text")

# Convert every positional record into a dict keyed by column name.
# "text" and "cleaned_text" both start out as the raw tweet text (index 2);
# "date" is the value stored under '$date' in the mapping at index 1.
json_response = [
    {
        column_names[0]: raw[0],
        column_names[1]: raw[2],
        column_names[2]: raw[1]['$date'],
        column_names[3]: raw[3],
        column_names[4]: raw[4],
        column_names[5]: raw[2],
    }
    for raw in tweets
]

# Example output
json_response[0]

{'url': 'https://twitter.com/AshleyDHeck1/status/1244776214222553088',
 'text': 'Ya know how everyone kept saying "hold my beer", as a way to show things could most definitely get worse in situations? ... Well they named this stupid virus after a beer. 🤦🍷🍷🤷😂 #randomthoughts #coronavirus #wth #COVID19 #corona #lol #imlosingit https://t.co/gZJIKJc8md',
 'date': 1585612750000,
 'tweet_id': 1244776214222553088,
 'user_name': 'AshleyDHeck1',
 'cleaned_text': 'Ya know how everyone kept saying "hold my beer", as a way to show things could most definitely get worse in situations? ... Well they named this stupid virus after a beer. 🤦🍷🍷🤷😂 #randomthoughts #coronavirus #wth #COVID19 #corona #lol #imlosingit https://t.co/gZJIKJc8md'}

- The schema of the json object

In [12]:
# Infer a JSON schema for the transformed records with genson.
bld = SchemaBuilder()
# Seed the builder with a bare "object" schema before feeding it the data.
bld.add_schema({"type": "object", "properties": {}})
# json_response is a list of dicts, so the whole list is added as one object.
bld.add_object(json_response)
# Last expression of the cell: display the resulting schema.
bld.to_schema()

{'$schema': 'http://json-schema.org/schema#',
 'anyOf': [{'type': 'object'},
  {'type': 'array',
   'items': {'type': 'object',
    'properties': {'url': {'type': 'string'},
     'text': {'type': 'string'},
     'date': {'type': 'integer'},
     'tweet_id': {'type': 'integer'},
     'user_name': {'type': 'string'},
     'cleaned_text': {'type': 'string'}},
    'required': ['cleaned_text',
     'date',
     'text',
     'tweet_id',
     'url',
     'user_name']}}]}

- Deduplication

In [13]:
# Deduplicate on tweet_id, keeping the first record seen for each id.
# A dict preserves insertion order, so the surviving records stay in their
# original relative order.
first_seen = {}
for record in json_response:
    first_seen.setdefault(record['tweet_id'], record)

tweet_id = set(first_seen)
tweets = list(first_seen.values())
print(len(tweets))

5071


- Remove URLs

In [14]:
def remove_url(text):
    """Return `text` with every URL-like token deleted.

    A token is anything that starts with "http" and runs up to the next
    whitespace character; at least one non-whitespace character must follow
    the "http" prefix for it to match.
    """
    url_pattern = re.compile(r'http\S+')
    return url_pattern.sub('', text)

In [15]:
# Strip URLs from each tweet's working copy of the text; 'text' keeps the original.
for tweet in tweets:
    tweet['cleaned_text'] = remove_url(tweet['cleaned_text'])

# Example output
tweets[0]['cleaned_text']

'Ya know how everyone kept saying "hold my beer", as a way to show things could most definitely get worse in situations? ... Well they named this stupid virus after a beer. 🤦🍷🍷🤷😂 #randomthoughts #coronavirus #wth #COVID19 #corona #lol #imlosingit '

- Remove punctuation and digits

In [16]:
def remove_punct(text):
    """Return `text` without ASCII punctuation and without ASCII digits.

    Despite the name, digits 0-9 are removed as well as the characters in
    `string.punctuation`. Non-ASCII punctuation and digits are left untouched.
    """
    # A single translation table deletes both character sets in one pass.
    unwanted = string.punctuation + string.digits
    return text.translate(str.maketrans('', '', unwanted))

In [17]:
# Strip punctuation and digits from each tweet's working copy of the text.
for tweet in tweets:
    tweet['cleaned_text'] = remove_punct(tweet['cleaned_text'])

# Example output
tweets[0]['cleaned_text']

'Ya know how everyone kept saying hold my beer as a way to show things could most definitely get worse in situations  Well they named this stupid virus after a beer 🤦🍷🍷🤷😂 randomthoughts coronavirus wth COVID corona lol imlosingit '

- Remove emojis

In [18]:
# Characters treated as emoji or pictographic decoration; compiled once.
# The earlier pattern contained the range U+24C2..U+1F251, which spans most of
# the Basic Multilingual Plane and therefore also deleted ordinary text such as
# CJK ideographs, kana and Hangul. Only the symbol blocks and the individual
# emoji code points are listed here.
_EMOJI_PATTERN = re.compile(
    "["
    "\U00010000-\U0010FFFF"  # every supplementary-plane character (emoticons, pictographs, transport, flags, ...)
    "\u2500-\u2BEF"  # box drawing through miscellaneous symbols and arrows (includes misc symbols and dingbats)
    "\u24C2"  # circled latin capital letter M
    "\u200D"  # zero-width joiner used inside emoji sequences
    "\u231A"  # watch
    "\u23CF"  # eject symbol
    "\u23E9"  # fast-forward symbol
    "\uFE0F"  # variation selector-16 (emoji presentation)
    "\u3030"  # wavy dash
    "\u303D"  # part alternation mark
    "\u3297"  # circled ideograph congratulation
    "\u3299"  # circled ideograph secret
    "]+"
)


def remove_emojis(data):
    """Return `data` with emoji and pictographic symbols removed.

    Every character matched by `_EMOJI_PATTERN` is deleted: all
    supplementary-plane characters, the BMP symbol blocks U+2500..U+2BEF and a
    handful of individual emoji code points. Letters of BMP scripts (Latin,
    CJK, kana, Hangul, ...) are preserved.

    Note that the supplementary-plane range is deliberately broad, so
    non-emoji characters living there (e.g. mathematical alphanumerics) are
    removed as well.

    Parameters
    ----------
    data : str
        Text to clean.

    Returns
    -------
    str
        The text without the matched characters.
    """
    return _EMOJI_PATTERN.sub('', data)

In [19]:
# Strip emoji from each tweet's working copy of the text.
for tweet in tweets:
    tweet['cleaned_text'] = remove_emojis(tweet['cleaned_text'])

# Example output
tweets[0]['cleaned_text']

'Ya know how everyone kept saying hold my beer as a way to show things could most definitely get worse in situations  Well they named this stupid virus after a beer  randomthoughts coronavirus wth COVID corona lol imlosingit '

### Save tweets to the file

In [20]:
# Write the cleaned tweets as one JSON document per line.
# json_util.default is passed as the fallback serialiser for json.dumps.
with open("tweets.json", "w") as data_file:
    data_file.writelines(
        json.dumps(tweet, default=json_util.default) + "\n"
        for tweet in tweets
    )