This script converts the combined labels into a dataset that Hugging Face can use.

In [1]:
import sys
sys.path.append('../../../../lib/')

In [2]:
from analysis import *
import pandas as pd
import json
from transformers import AutoTokenizer

In [3]:
# Load the combined annotation table; the path is relative to the notebook's working directory
COMBINED_LABELS_CSV = "Combinedlabels.csv"
df = pd.read_csv(COMBINED_LABELS_CSV)

In [4]:
# Plain-text preview of the first five rows, as a quick check of the loaded columns
first_rows = df.head(5)
print(first_rows)

   tweet_id  Gabe  Guanyi  Mom  Sum  \
0         0     0       0    0    0   
1         1     0       0    0    0   
2         2     0       0    0    0   
3         3     1       1    1    3   
4         4     0       0    0    0   

                                          tweet_text  \
0  RT {{MENTION}}: Starting in one hour. Raising ...   
1  $102.82 this week! This is how I now make a pa...   
2  OPEN today 10 am until 1 pm\n419 CCPkwy Cape C...   
3  A superb 18th century European carved ivory di...   
4                                 Driving aroundd :P   

                                    user_description  
0  Can't start a fire... Can't start a fire witho...  
1                     Follow me and you'll find out!  
2  We are 3rd generation Brocante-Antique Import ...  
3  John Nicholson's Auctioneers are the largest a...  
4  Im 13 years young\nim in love with the most am...  


In [5]:
# Load the OCR lookup. The loop that consumes it indexes it by str(tweet_id)
# and treats each value as a string of text extracted from the tweet's images.
# The encoding is pinned to UTF-8 (the standard JSON encoding) so non-ASCII OCR
# text decodes identically on every platform instead of following the locale.
with open('text_from_images/ids_to_text.json', encoding='utf-8') as f:
	ocr_json = json.load(f)

In [6]:
# Row count before de-duplication
rows_before = len(df)
print(rows_before)

# Keep only the first occurrence of every distinct tweet_text value
df = df.drop_duplicates(subset='tweet_text', keep='first')

# Row count after de-duplication
rows_after = df.shape[0]
print(rows_after)


515
492


In [7]:
# Parallel lists: position i in every list describes the same tweet.
texts = []                        # cleaned tweet text only
texts_userdescriptions = []       # tweet text + user description
texts_userdescriptions_ocrs = []  # tweet text + user description + OCR text
labels = []                       # binary label derived from the 'Sum' column
ids = []                          # tweet_id of each row
max_length = 0  # NOTE(review): never read in the cells visible here; kept in case later code uses it
number_of_tweets = 0


def _escape_quotes(raw):
	"""Return `raw` with every double-quote character doubled (CSV-style escaping)."""
	return raw.replace('"', '""')


for _, row in df.iterrows():
	# A 'Sum' of 2 or more becomes label 1, anything lower becomes label 0.
	# NOTE(review): 'Sum' looks like the number of annotators who flagged the
	# tweet (so >= 2 would be a majority of three) -- confirm against the CSV.
	labels.append(1 if row['Sum'] >= 2 else 0)

	text = analysis.clean_text(_escape_quotes(row['tweet_text']))

	# Best-effort: any failure (for example a description that is not a string,
	# presumably a missing value) falls back to the '[nodes]' placeholder.
	# `except Exception` keeps that fallback while no longer swallowing
	# KeyboardInterrupt / SystemExit the way a bare `except:` does.
	try:
		userdescription = analysis.clean_text(_escape_quotes(row['user_description']))
	except Exception:
		userdescription = '[nodes]'

	# Same best-effort policy for OCR text: a tweet_id missing from ocr_json
	# (or an unusable value) yields the '[noocr]' placeholder.
	try:
		ocr = analysis.clean_text(_escape_quotes(ocr_json[str(row['tweet_id'])]))
	except Exception:
		ocr = '[noocr]'

	# Build the three input variants, joining the parts with ' [sep] '.
	# NOTE(review): this separator is lowercase, unlike BERT's '[SEP]' special
	# token -- confirm it is tokenized the way downstream training expects.
	texts.append(text)
	texts_userdescriptions.append(text + ' [sep] ' + userdescription)
	texts_userdescriptions_ocrs.append(text + ' [sep] ' + userdescription + ' [sep] ' + ocr)
	ids.append(row['tweet_id'])
	number_of_tweets += 1

# Longest string (in characters) per variant; default=0 keeps an empty frame from raising
max_length_text = max((len(text) for text in texts), default=0)
max_length_userdescriptions = max((len(text) for text in texts_userdescriptions), default=0)
max_length_userdescriptions_ocrs = max((len(text) for text in texts_userdescriptions_ocrs), default=0)
print("Max length of text: ", max_length_text)
print("Max length of text and user description: ", max_length_userdescriptions)
print("Max length of text and user description and ocr: ", max_length_userdescriptions_ocrs)
print("Number of tweets: ", number_of_tweets)

Max length of text:  288
Max length of text and user description:  440
Max length of text and user description and ocr:  2102
Number of tweets:  492


In [8]:
# Count how many tokens each text variant produces with the BERT tokenizer
tokenizer = AutoTokenizer.from_pretrained('bert-base-uncased')

# Register the placeholder strings as added tokens. This goes through the
# public `add_tokens` API rather than the private `_add_tokens` hook, which is
# an implementation detail that can change between transformers releases.
tokenizer.add_tokens(["[nodes]", "[noocr]", "{{URL}}", "{{MENTION}}", "{{EMAIL}}"])

# Token count per entry, sorted ascending so the longest inputs sit at the end
tokens_in_text = sorted(len(tokenizer.tokenize(text)) for text in texts)
tokens_in_text_userdescriptions = sorted(
	len(tokenizer.tokenize(text)) for text in texts_userdescriptions
)
tokens_in_text_userdescriptions_ocrs = sorted(
	len(tokenizer.tokenize(text)) for text in texts_userdescriptions_ocrs
)

# Print the 10 largest token counts of each list. The [-10:] slice of an
# ascending sort is the tail, so the old "First 10 elements" labels were wrong.
print("Largest 10 token counts of text: ", tokens_in_text[-10:])
print("Largest 10 token counts of text and user description: ", tokens_in_text_userdescriptions[-10:])
print("Largest 10 token counts of text and user description and ocr: ", tokens_in_text_userdescriptions_ocrs[-10:])


First 10 elements of text:  [79, 79, 80, 81, 84, 85, 85, 89, 89, 95]
First 10 elements of text and user description:  [111, 113, 113, 114, 114, 116, 118, 121, 127, 137]
First 10 elements of text and user description and ocr:  [165, 176, 191, 196, 205, 212, 221, 237, 313, 476]


In [9]:
# Class balance: labels holds only 0s and 1s, so their sum is the number of 1s
one_label_count = sum(labels)
zero_label_count = len(labels) - one_label_count
print("Number of zero labels: ", zero_label_count)
print("Number of one labels: ", one_label_count)

Number of zero labels:  315
Number of one labels:  177


In [10]:
# One {"tweets": [...]} container per (class, text variant) combination.
# Label 0 goes to the "good" containers, any other label to the "bad" ones.
good_tweets_texts = {"tweets": []}
bad_tweets_texts = {"tweets": []}
good_tweets_texts_userdescriptions = {"tweets": []}
bad_tweets_texts_userdescriptions = {"tweets": []}
good_tweets_texts_userdescriptions_ocrs = {"tweets": []}
bad_tweets_texts_userdescriptions_ocrs = {"tweets": []}

# The five lists were filled in lockstep, so zipping them walks one tweet at a time
for tweet_id, label, plain, with_description, with_description_ocr in zip(
	ids, labels, texts, texts_userdescriptions, texts_userdescriptions_ocrs
):
	out_text = {"text": plain, "tweet_id": tweet_id}
	out_text_userdescriptions = {"text": with_description, "tweet_id": tweet_id}
	out_text_userdescriptions_ocrs = {"text": with_description_ocr, "tweet_id": tweet_id}

	if label != 0:
		bad_tweets_texts["tweets"].append(out_text)
		bad_tweets_texts_userdescriptions["tweets"].append(out_text_userdescriptions)
		bad_tweets_texts_userdescriptions_ocrs["tweets"].append(out_text_userdescriptions_ocrs)
	else:
		good_tweets_texts["tweets"].append(out_text)
		good_tweets_texts_userdescriptions["tweets"].append(out_text_userdescriptions)
		good_tweets_texts_userdescriptions_ocrs["tweets"].append(out_text_userdescriptions_ocrs)

print("Number of good tweets: ", len(good_tweets_texts["tweets"]))
print("Number of bad tweets: ", len(bad_tweets_texts["tweets"]))

Number of good tweets:  315
Number of bad tweets:  177


In [11]:
# Serialize every good/bad container to its own JSON file in the working directory
json_outputs = (
	('good_tweets_text.json', good_tweets_texts),
	('bad_tweets_text.json', bad_tweets_texts),
	('good_tweets_text_userdescriptions.json', good_tweets_texts_userdescriptions),
	('bad_tweets_text_userdescriptions.json', bad_tweets_texts_userdescriptions),
	('good_tweets_text_userdescriptions_ocrs.json', good_tweets_texts_userdescriptions_ocrs),
	('bad_tweets_text_userdescriptions_ocrs.json', bad_tweets_texts_userdescriptions_ocrs),
)

for output_name, payload in json_outputs:
	with open(output_name, 'w') as outfile:
		json.dump(payload, outfile)