In [3]:
import pdfplumber
import re
import pandas as pd

# STEP 1 - Read in Tweets from PDF
# Reads every page of the exported PDF and joins the non-empty page texts
# into one newline-separated string (`raw_data`).
pdf_path = "Twitter-iamthespookster-2019-08-04 (1).pdf"
with pdfplumber.open(pdf_path) as pdf:
    # Extract each page exactly once: text extraction is the expensive step,
    # so the emptiness check below reuses the result rather than calling
    # extract_text() a second time per page.
    page_texts = [page.extract_text() for page in pdf.pages]

# Pages that yielded no text (None or "") are skipped.
raw_data = "\n".join(text for text in page_texts if text)

# STEP 2 - Extract Tweets from Raw Data
# The extracted text is handled as a flat list of lines. A tweet record is
# taken to be the run of lines that ends with a date line followed by a line
# containing "(Source:".
dat = raw_data.split("\n")
dat = dat[4:]  # Remove first four lines (heading/whitespace)

# Find and remove line breaks (empty lines).
# LBs still records the positions of the blank lines, but the filter no longer
# scans that list for every line (which was quadratic in the number of lines).
LBs = [i for i, line in enumerate(dat) if line == ""]
dat = [line for line in dat if line != ""]

# Find the Line Containing the Source of Each Tweet
source_line = [i for i, line in enumerate(dat) if "(Source:" in line]

# Extract Tweet, Date, Source from Content
tweets, dates, sources = [], [], []
start = 0
for i in source_line:
    # Lines from the end of the previous record up to and including this
    # record's source line; always holds at least the source line itself.
    tweet_in = dat[start:i + 1]
    # Everything before the last two lines is the tweet body (may be empty).
    tweets.append(" ".join(tweet_in[:-2]))
    # A record consisting of only its source line has no date line; store an
    # empty date instead of failing with IndexError.
    dates.append(tweet_in[-2] if len(tweet_in) >= 2 else "")
    sources.append(tweet_in[-1])
    start = i + 1

# Lines after the final source line belong to no record and are not included.
data = pd.DataFrame({"tweet": tweets, "date": dates, "source": sources})

# STEP 3 - Extract Data from Tweet
# Adds the columns RT, user_tag, account, ext_link and link to `data`, and
# strips the retweet marker, URLs, @mentions and colons from the tweet text.
url_pattern = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

# Flag retweets, then drop the leading "RT " marker.
data["RT"] = data["tweet"].str.startswith("RT ")
data["tweet"] = data["tweet"].str.replace(r"^RT ", "", regex=True)

# URLs must be collected and removed BEFORE colons are stripped below:
# url_pattern requires "://", so deleting ":" first turns every URL into
# "https//..." and nothing would ever match (empty `link`, URLs left in text).
tweet_text = data["tweet"]
has_link = tweet_text.str.contains("http")
links = tweet_text.apply(lambda x: ",".join(url_pattern.findall(x)))
url_free = tweet_text.apply(lambda x: url_pattern.sub("", x))

# Mentions are detected on the URL-free text so that an "@" inside a URL
# path is not reported as a tagged account.
data["user_tag"] = url_free.str.contains("@")
data["account"] = url_free.apply(lambda x: ",".join(re.findall(r"@\w+", x)))
# Remove @mentions and every remaining colon from the tweet text.
data["tweet"] = url_free.apply(lambda x: re.sub(r"@\w+|:", "", x))

# Assigned last so the column order stays RT, user_tag, account, ext_link, link.
data["ext_link"] = has_link
data["link"] = links

# Show a preview (first rows) and the number of extracted tweets.
preview = data.head()
print(preview)
row_count = len(data.index)
print(row_count)

# Write the result to CSV without the index column; "utf-8-sig" is the
# UTF-8 encoding with a leading byte-order mark.
data.to_csv(
    "tweets.csv",
    index=False,
    encoding="utf-8-sig",
)


                                               tweet                  date  \
0                                                     Mar 9, 2019, 4:16 PM   
1                                                Why  Mar 9, 2019, 4:28 PM   
2   Today in “Manosphere Grift or Terrorist Recru...  Mar 9, 2019, 4:43 PM   
3   How can anyone see this as a good luck charm?...  Mar 9, 2019, 4:46 PM   
4                                  Hamms is good tho  Mar 9, 2019, 5:44 PM   

                                              source     RT  user_tag  \
0  (Source: https://twitter.com/iamthespookster/s...  False     False   
1  (Source: https://twitter.com/iamthespookster/s...  False      True   
2  (Source: https://twitter.com/iamthespookster/s...   True      True   
3  (Source: https://twitter.com/iamthespookster/s...   True      True   
4  (Source: https://twitter.com/iamthespookster/s...  False      True   

            account  ext_link link  
0                       False       
1  @LetsJetTogethe