In [1]:
import numpy as np
import pandas as pd
import json
import os
import multiprocessing as mp
import time
import re

In [2]:
# Load the raw dump; lines=True means one JSON document per line.
df = pd.read_json('data/tweets_DM.json', lines=True)

# Write only the `_source` column back out as JSON lines so the next cell can
# re-read it as its own table.
df['_source'].to_json('data/file.json', orient='records', lines=True)

In [3]:
# Re-read the `_source` dump written by the previous cell (presumably this
# expands each nested record into columns -- confirm against the data), then
# write its `tweet` column out the same way.
dfdf = pd.read_json('data/file.json', lines=True)
dfdf['tweet'].to_json('data/tweet.json', orient='records', lines=True)

In [4]:
# Build the tweet table in one chain: read the `tweet` dump, attach the score
# and crawl date from the raw frame, rename them, and index by tweet_id.
# The join aligns on the row index, so it assumes tweet.json kept the row
# order of `df`.
tweets = (
	pd.read_json('data/tweet.json', lines=True)
	.join(df[['_score', '_crawldate']])
	.rename(columns={'_score': 'score', '_crawldate': 'crawldate'})
	.set_index('tweet_id')
)

In [5]:
# Load the two label tables, each keyed by tweet_id.
emotion = pd.read_csv('data/emotion.csv').set_index('tweet_id')
data_identification = (
	pd.read_csv('data/data_identification.csv').set_index('tweet_id')
)

In [6]:
# Preview the first rows of the tweet table.
tweets.head(n=5)

Unnamed: 0_level_0,hashtags,text,score,crawldate
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",391,2015-05-23 11:42:47
0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",433,2016-01-28 04:52:09
0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",232,2017-12-25 04:39:20
0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,376,2016-01-24 23:53:05
0x2de201,[],"""Trust is not the same as faith. A friend is s...",989,2016-01-08 17:18:59


In [7]:
# Preview the first rows of the emotion labels.
emotion.head(n=5)

Unnamed: 0_level_0,emotion
tweet_id,Unnamed: 1_level_1
0x3140b1,sadness
0x368b73,disgust
0x296183,anticipation
0x2bd6e1,joy
0x2ee1dd,anticipation


In [8]:
# Preview the first rows of the train/test identification table.
data_identification.head(n=5)

Unnamed: 0_level_0,identification
tweet_id,Unnamed: 1_level_1
0x28cc61,test
0x29e452,train
0x2b3819,train
0x2db41f,test
0x2a2acc,train


In [9]:
# Row counts of the three inputs before they are combined.
print('Number of tweets: ', tweets.shape[0])
print('Number of emotion labels: ', emotion.shape[0])
print('Number of data identification labels: ', data_identification.shape[0])

# Column-wise concat aligns the three frames on their tweet_id index; ids
# missing from a frame get NaN in that frame's columns.
all_pd = pd.concat(objs=[tweets, emotion, data_identification], axis='columns')

print('Number of all data: ', all_pd.shape[0])

Number of tweets:  1867535
Number of emotion labels:  1455563
Number of data identification labels:  1867535
Number of all data:  1867535


In [10]:
# Preview the first rows of the combined table.
all_pd.head(n=5)

Unnamed: 0_level_0,hashtags,text,score,crawldate,emotion,identification
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0x376b20,[Snapchat],"People who post ""add me on #Snapchat"" must be ...",391,2015-05-23 11:42:47,anticipation,train
0x2d5350,"[freepress, TrumpLegacy, CNN]","@brianklaas As we see, Trump is dangerous to #...",433,2016-01-28 04:52:09,sadness,train
0x28b412,[bibleverse],"Confident of your obedience, I write to you, k...",232,2017-12-25 04:39:20,,test
0x1cd5b0,[],Now ISSA is stalking Tasha 😂😂😂 <LH>,376,2016-01-24 23:53:05,fear,train
0x2de201,[],"""Trust is not the same as faith. A friend is s...",989,2016-01-08 17:18:59,,test


In [11]:
# change hashtags to single string
from multiprocessing import Pool

def change_hashtags_to_string(hashtags):
	"""Flatten one tweet's hashtags into a single space-separated string.

	The tags are joined with spaces, then CamelCase tags are split by
	inserting a space before every uppercase letter that directly follows a
	lowercase letter, digit or underscore (e.g. 'TrumpLegacy' becomes
	'Trump Legacy'). Runs of capitals such as 'CNN' are left intact.

	Args:
		hashtags: sequence of hashtag strings; may be empty.

	Returns:
		The flattened string, or '' when there are no hashtags.
	"""
	if len(hashtags) == 0:
		return ''
	joined = ' '.join(hashtags)
	# A positive lookbehind requires a real preceding character. The previous
	# negative lookbehind also matched at position 0 and put a stray leading
	# space in front of any string starting with a capital letter.
	return re.sub(r'(?<=[^A-Z\W])(?=[A-Z])', ' ', joined)

def change_hashtags_to_string_parallel(hashtags):
	"""Apply change_hashtags_to_string to every element using a process pool.

	Args:
		hashtags: iterable with one hashtag sequence per tweet.

	Returns:
		A list of flattened hashtag strings, in the same order as the input.
	"""
	# The with-block releases the worker processes even when map() raises;
	# the previous close()/join() pair was skipped on error and leaked them.
	# map() has already collected every result when the block exits.
	with Pool(processes=mp.cpu_count()) as pool:
		return pool.map(change_hashtags_to_string, hashtags)

# Overwrite the list-valued `hashtags` column with its flattened string form.
# NOTE: this mutates all_pd in place, so the cell is not safely re-runnable.
# A second pass would hand strings to change_hashtags_to_string, whose
# ' '.join would then space-separate the individual characters.
all_pd['hashtags'] = change_hashtags_to_string_parallel(all_pd['hashtags'])

# Spot-check a few random rows (no seed is set, so the sample differs per run).
all_pd.sample(5)

Unnamed: 0_level_0,hashtags,text,score,crawldate,emotion,identification
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0x1e96c1,lularoe august hot water,It's the last full month of summer! How are yo...,241,2017-01-28 23:58:53,joy,train
0x2332af,Spirit Chat,"Q2. When the tide of <LH> recedes from life, w...",527,2015-10-10 22:14:42,joy,train
0x2f9f2d,Got7 IGOT7 7for7,For all igot7... I wish you guys were here wit...,791,2017-02-06 01:18:55,,test
0x350db9,masterstrock,@isalilsand Shilpa you changed the game <LH> #...,111,2016-01-11 06:09:59,joy,train
0x36f543,fully,Awesome movie #fully <LH> full package of ente...,515,2016-03-27 03:42:00,joy,train


In [12]:
# remove text punctuation
import string

# Translation table that deletes every ASCII punctuation character; built once
# here instead of once per tweet.
_PUNCTUATION_TABLE = str.maketrans('', '', string.punctuation)

def remove_punctuation(text, hashtags):
	"""Strip punctuation from a tweet and prepend its hashtag string.

	ASCII punctuation (string.punctuation) is removed from `text`, then a
	space is inserted before every uppercase letter that directly follows a
	lowercase letter, digit or underscore, so CamelCase words are split.
	When `hashtags` is non-empty it is prepended, separated by one space.

	Args:
		text: the tweet body.
		hashtags: the tweet's already-flattened hashtag string; may be ''.

	Returns:
		The cleaned text with the hashtag prefix. '' when both inputs are
		empty, and `hashtags` alone when there is no text left to append.
	"""
	if len(text) == 0:
		return hashtags if len(hashtags) else ''

	cleaned = text.translate(_PUNCTUATION_TABLE)
	# A positive lookbehind requires a real preceding character. The previous
	# negative lookbehind also matched at position 0, which put a leading
	# space on capitalised text and a double space after the hashtag prefix.
	cleaned = re.sub(r'(?<=[^A-Z\W])(?=[A-Z])', ' ', cleaned)

	if len(hashtags) == 0:
		return cleaned
	if len(cleaned) == 0:
		# The text was punctuation only: avoid a dangling trailing space.
		return hashtags
	return hashtags + ' ' + cleaned

def remove_punctuation_parallel(text, hashtags):
	"""Apply remove_punctuation to paired texts and hashtags in a process pool.

	Args:
		text: iterable of tweet bodies.
		hashtags: iterable of flattened hashtag strings, paired with `text`
			by position (zip stops at the shorter of the two).

	Returns:
		A list of cleaned, hashtag-prefixed texts in input order.
	"""
	# The with-block releases the worker processes even when starmap() raises;
	# the previous close()/join() pair was skipped on error and leaked them.
	with Pool(processes=mp.cpu_count()) as pool:
		# starmap unpacks each (text, hashtags) pair into the two arguments.
		return pool.starmap(remove_punctuation, zip(text, hashtags))

# Replace `text` with its punctuation-free form, prefixed by the row's hashtag
# string. NOTE: this mutates all_pd in place; re-running the cell would prepend
# the hashtags a second time.
all_pd['text'] = remove_punctuation_parallel(all_pd['text'], all_pd['hashtags'])


In [13]:
# Spot-check a few random rows after cleaning (unseeded, differs per run).
all_pd.sample(n=5)

Unnamed: 0_level_0,hashtags,text,score,crawldate,emotion,identification
tweet_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
0x334a51,,ddlovato you looked beautiful tonight Killed ...,160,2016-06-21 11:54:42,trust,train
0x321cd1,iamblessed,iamblessed Its your month of blessings All ki...,491,2015-06-07 21:56:57,,test
0x21a32d,JOUR2202 A2,JOUR2202 A2 Its hard to stay openminded when...,999,2016-09-12 09:35:13,,test
0x3821ba,Life,Life 72 The moments in your life are only onc...,590,2017-03-15 23:44:27,anticipation,train
0x2d1e98,God,God You Can Enjoy Anything If You Can Enjoy...,857,2016-03-13 08:04:00,anticipation,train


In [14]:
# Partition the combined table by its `identification` flag.
test_pd = all_pd.loc[all_pd['identification'].eq('test')]
data_pd = all_pd.loc[all_pd['identification'].eq('train')]

# Report the size of each split.
print(test_pd.shape[0])
print(data_pd.shape[0])

411972
1455563


In [15]:
# Persist both splits as pickle files.
pd.to_pickle(test_pd, 'data/test_pd.pkl')
pd.to_pickle(data_pd, 'data/data_pd.pkl')