# Combining data

#### Extracting PDF data (I am the spookster)

In [15]:
import pdfplumber
import re
import pandas as pd

# STEP 1 - Read in Tweets from PDF
# Forward slash keeps the path portable and avoids the invalid "\T" escape
# sequence that the backslash form produced.
pdf_path = "data/Twitter-iamthespookster-2019-08-04 (1).pdf"
with pdfplumber.open(pdf_path) as pdf:
    # Extract each page's text once (the generator is consumed inside the
    # `with` block) and skip pages that yield no text.
    page_texts = (page.extract_text() for page in pdf.pages)
    raw_data = "\n".join(text for text in page_texts if text)

# STEP 2 - Extract Tweets from Raw Data
dat = raw_data.split("\n")
dat = dat[4:]  # Remove first four lines (heading/whitespace)

# Drop empty lines in a single pass (no index list / per-line membership test)
dat = [line for line in dat if line != ""]

# Find the Line Containing the Source of Each Tweet
source_line = [i for i, line in enumerate(dat) if "(Source:" in line]

# Extract Tweet, Date, Source from Content.
# Each record spans the lines after the previous "(Source:" line up to and
# including the next one, laid out as [tweet text..., date, source].
tweets, dates, sources = [], [], []
start = 0
for i in source_line:
    tweet_in = dat[start:i+1]
    tweets.append(" ".join(tweet_in[:-2]))
    # A record consisting only of its source line has no date line; store an
    # empty date instead of raising IndexError on tweet_in[-2].
    dates.append(tweet_in[-2] if len(tweet_in) >= 2 else "")
    sources.append(tweet_in[-1])
    start = i + 1

data = pd.DataFrame({"tweet": tweets, "date": dates, "source": sources})

# STEP 3 - Extract Data from Tweet
# Derives flag/value columns from the tweet text and strips the extracted
# pieces out of "tweet": RT prefix, URLs, @mentions and colons.
url_pattern = re.compile(r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+")

# Retweet flag, then drop the leading "RT " marker
data["RT"] = data["tweet"].str.startswith("RT ")
data["tweet"] = data["tweet"].str.replace(r"^RT ", "", regex=True)

# Mentions: flag and collect them here, but do not strip them yet (see below)
data["user_tag"] = data["tweet"].str.contains("@")
data["account"] = data["tweet"].apply(lambda x: ",".join(re.findall(r"@\w+", x)))

# Links must be extracted BEFORE colons are stripped: removing ":" first turns
# "https://" into "https//", which url_pattern can never match, leaving "link"
# empty and the mangled URL inside the tweet text.
data["ext_link"] = data["tweet"].str.contains("http")
data["link"] = data["tweet"].apply(lambda x: ",".join(url_pattern.findall(x)))
data["tweet"] = data["tweet"].apply(lambda x: url_pattern.sub("", x))

# Now remove @mentions and any remaining colons from the tweet text
data["tweet"] = data["tweet"].apply(lambda x: re.sub(r"@\w+|:", "", x))

print(data.head())
print(data.shape[0])

                                               tweet                  date  \
0                                                     Mar 9, 2019, 4:16 PM   
1                                                Why  Mar 9, 2019, 4:28 PM   
2   Today in “Manosphere Grift or Terrorist Recru...  Mar 9, 2019, 4:43 PM   
3   How can anyone see this as a good luck charm?...  Mar 9, 2019, 4:46 PM   
4                                  Hamms is good tho  Mar 9, 2019, 5:44 PM   

                                              source     RT  user_tag  \
0  (Source: https://twitter.com/iamthespookster/s...  False     False   
1  (Source: https://twitter.com/iamthespookster/s...  False      True   
2  (Source: https://twitter.com/iamthespookster/s...   True      True   
3  (Source: https://twitter.com/iamthespookster/s...   True      True   
4  (Source: https://twitter.com/iamthespookster/s...  False      True   

            account  ext_link link  
0                       False       
1  @LetsJetTogethe

### Extracting Excel data

In [16]:
import pandas as pd

# Read the Excel workbook and label its column "Posts".
# Forward slash keeps the path portable and avoids the invalid "\D" escape.
# NOTE(review): read_excel defaults to header=0, so the sheet's first row is
# consumed as a header and replaced by `names`. If the sheet has no header
# row, pass header=None so the first post is not dropped - TODO confirm.
df2 = pd.read_excel('data/Dataset.xlsx', names=['Posts'])

print(df2.head())

                                               Posts
0  Mommy said not to talk to strangers..but she's...
1  1.I was immune to getting hurt/killed/infected...
2  I mean terrorist attacks happen all the time. ...
3  As a god, it would be my responsibility not to...
4  I am going to grab a knife and shove it in the...


### Combine into one DataFrame

In [18]:
# Stack both text sources into a single "Posts" column and label every row.

# Tweet text as a one-column frame named to match the Excel data
df1_selected = data['tweet'].to_frame(name='Posts')

# Append the Excel posts below the tweets with a fresh 0..n-1 index, then add
# the constant label column.
df = pd.concat([df1_selected, df2], ignore_index=True).assign(
    **{'Mass shooter post': 1}
)

print(df)

                                                  Posts  Mass shooter post
0                                                                        1
1                                                   Why                  1
2      Today in “Manosphere Grift or Terrorist Recru...                  1
3      How can anyone see this as a good luck charm?...                  1
4                                     Hamms is good tho                  1
...                                                 ...                ...
3851  I'm getting over much of the problems I've had...                  1
3852  Bleys, thanks for your uplifting message.\n\nI...                  1
3853  I'm a fan of zombie film's, have been for year...                  1
3854  Hello All...\n\nI came across this place on th...                  1
3855  Thank's for the friendly welcome, I'm going to...                  1

[3856 rows x 2 columns]


# EDA