### Import Libraries

In [1]:
import ndjson
import os
import pandas as pd

### Extract Tranquil Tweets

In [2]:
dfs = []

for file_name in [f for f in os.listdir("tranquil/") if f.endswith(".ndjson")]:
    file_path = os.path.join("tranquil/", file_name)
    df = pd.read_json(file_path, lines=True, nrows=100000)
    dfs.append(df)

In [3]:
combined_df = pd.concat(dfs, ignore_index=True)

In [4]:
combined_df.drop(columns=["id"], inplace=True)

In [5]:
combined_df.rename(columns={"relevance": "label"}, inplace=True)

In [6]:
combined_df.head()

Unnamed: 0,text,label
0,"<NUMBER> sierra for a pioneer carplay radio, h...",0
1,if pizza didn't exist i wouldn't be fat,0
2,<HASHTAG> pelicans lead after <NUMBER> <URL>,0
3,fandoms unite <HASHTAG> missuniverse <HASHTAG>...,0
4,<USER> @jesscomedy i was trying to mess up the...,0


In [7]:
combined_df.tail()

Unnamed: 0,text,label
99995,<USER> whatever floats your boat joebo,0
99996,"<USER> bj mana,amo tu tbm!",0
99997,<USER> @trillian <NUMBER> right because there'...,0
99998,wow even after the <HASHTAG> leafs trade <HASH...,0
99999,<USER> wats good wit yu tho,0


In [8]:
combined_df.describe()

Unnamed: 0,label
count,100000.0
mean,0.0
std,0.0
min,0.0
25%,0.0
50%,0.0
75%,0.0
max,0.0


In [9]:
combined_df.isnull().sum()

text     0
label    0
dtype: int64

In [10]:
combined_df.nunique()

text     88812
label        1
dtype: int64

In [11]:
combined_df.dtypes

text     object
label     int64
dtype: object

In [12]:
df = combined_df

### Removing Tags

In [13]:
tags = set(df["text"].str.findall(r"<\S+>").sum())
tags

{'<HASHTAG>', '<NUMBER>', '<REPEAT>', '<SMILE>', '<URL>', '<USER>'}

In [16]:
for tag in tags:
    df.loc[:, "text"] = df["text"].str.replace(tag, "")

In [17]:
set(df["text"].str.findall(r"<\S+>").sum())

set()

### Removing Links

In [18]:
tco_links = set(df["text"].str.findall(r"\S*t\.co\S*").sum())
len(tco_links)

1395

In [20]:
http_links = set(df["text"].str.findall(r"\S*http\S*").sum())
len(http_links)

1624

In [21]:
links = tco_links | http_links
len(links)

1686

In [22]:
for link in links:
    df.loc[:, "text"] = df["text"].str.replace(link, "")

In [23]:
set(df["text"].str.findall(r"\S*t\.co\S*").sum())

{'"t.co/',
 '(t.co/hwngc',
 '(t.co/mqjtov',
 '(t.co/msjzik',
 '(t.co/v',
 '(t.co/yqqv',
 '(video)t.co/xrsrgsz',
 '[t.co/yorefwng',
 '[tinychat.com]',
 'facebook&gt;&gt;&gt;t.co/',
 'here:t.co/wjne',
 'heywoodt.co/cccqcse…',
 'intonews.blogspot.com',
 'ladygagalyricsvault.com/lady-gaga-news/alice-cooper-pretends-to-throttle-lady-gaga',
 'somethingmeaningfulcreations.blogspot.com/',
 'statsheet.com/mcb/games',
 't.co',
 't.co/',
 't.co/aahgtrjn',
 't.co/afycslpgqt"',
 't.co/agvdwgotph”',
 't.co/ajzdph…',
 't.co/an',
 't.co/andtz…',
 't.co/aoznzulsmr”',
 't.co/ars…',
 't.co/a…',
 't.co/ccfzq',
 't.co/cd',
 't.co/cevigswzrl.',
 't.co/cfgtbht',
 't.co/cltssn…',
 't.co/coc',
 't.co/cwjwe…',
 't.co/deigil',
 't.co/diuyy',
 't.co/dm…',
 't.co/dq…',
 't.co/drhr',
 't.co/dsymr…',
 't.co/dtn',
 't.co/duvsuno…',
 't.co/dvbhlh',
 't.co/dx',
 't.co/d…',
 't.co/e',
 't.co/eabog…',
 't.co/egafi',
 't.co/ehfon',
 't.co/em',
 't.co/emqumyu”',
 't.co/envgfq',
 't.co/eoe',
 't.co/eray',
 't.co/esjbzatdss…

In [24]:
set(df["text"].str.findall(r"\S*http\S*").sum())

set()

### Removing Invalid Whitespace

In [25]:
df.loc[:, "text"] = df["text"].str.replace(r"\s+", " ", regex=True).str.strip()

In [26]:
if df["text"].str.contains(r"\s\s").any():
    print("Error: Consecutive whitespace.")
if df["text"].str.count(r"\s").sum() != df["text"].str.count(" ").sum():
    print("Error: Non-space whitespace.")

### Writing Data

In [28]:
df.to_csv("tranquil/preprocessed_tranquil_tweets.csv", sep="\t", encoding="utf-16", index=False)