In [237]:
import findspark
findspark.init()

import pyspark
from pyspark.sql import SparkSession

In [238]:
spark = SparkSession \
    .builder \
    .appName("Twitter Ban Project") \
    .getOrCreate()

# Data
### Load Data

In [253]:
df = spark.read.csv("tweets/tweetIDs-post-ban.csv", header=True)
df.count()

6810

### Remove empty tweets

In [252]:
# Remove tweets with empty text fields
tweetRaw = df.filter(df.text != '')
tweetRaw.count()
# test = tweetRaw.take(5)
# for i in range(5):
#     print(test[i].__getattr__("text"))

4236

## Helper Functions

### formatText

In [250]:
import re

# Take in a string containing a tweet's text and return a list of the words in the tweet
def formatText(text):
    #text = row.__getattr__("text")
    if isinstance(text, type(str)):

        # Remove Everything but letters, numbers, and whitespace
        text = re.sub(r'[^\w\s]', '', text) 
        textList = text.split(' ')
                
        # Remove RT prefix 
        if 'RT' in textList:
            textList.remove('RT')
        
        # Remove empty elements
        while '' in textList:
            textList.remove('') 
    else:
        textList = []
    
    # print(textList)
    return textList 
    

In [242]:
def formatHashtags(htgs):
    # Does nothing yet
    return htgs

### IsViolent

In [243]:
# Determine if a tweet's text is violent
def isViolent(text):
    isViolent = False
    words = formatText(text)
    return isViolent
    

# Analysis
### Map for violent Tweets

In [251]:
rdd = tweetRaw.rdd.map(lambda row:
    (row.__getattr__("hashtags"), formatText(row.__getattr__("text")), isViolent(row.__getattr__("text")))
    )

tweets = rdd.toDF(["hashtags", "text", "isViolent"])
first = tweets.first()
print(first.__getattr__("text"))
tweets.show()

['VicToensing', 'Can', 'you', 'believe', 'that', 'SpeakerPelosi', 'tweeted', 'that', 'realDonaldTrump', 'should', 'end', 'demanding', 'that', 'Iran', 'cease', 'its', 'violence', 'Re']
+--------------------+--------------------+---------+
|            hashtags|                text|isViolent|
+--------------------+--------------------+---------+
|                null|[VicToensing, Can...|    false|
|                null|[CindyKendrick11,...|    false|
|                null|[NapsyLev, Andrew...|    false|
|           Irán Irak|[realDonaldTrump,...|    false|
|       NoWarWithIran|   [MoveOn, TONIGHT]|    false|
|                null|[tiffany_caban, W...|    false|
|                null|[RepMullin, I, ju...|    false|
|                null|[IlhanMN, The, Tr...|    false|
|                null|[TeamWarren, This...|    false|
|                null|[Take, it, from, ...|    false|
|                null|[Judicial, Watch,...|    false|
|                null|[WalshFreedom, jo...|    false|
|     

### Check text format

In [245]:
records = tweets.take(5)
for i in range(5):
    print(records[i].__getattr__("text"))

['RT', 'VicToensing', 'Can', 'you', 'believe', 'that', 'SpeakerPelosi', 'tweeted', 'that', 'realDonaldTrump', 'should', 'end', 'demanding', 'that', 'Iran', 'cease', 'its', 'violence', 'Re']
['RT', 'CindyKendrick11', 'POS']
['NapsyLev', 'AndrewYang', 'httpstcow5Zt3TNUme']
['realDonaldTrump', 'anunció', 'nuevas', 'sanciones', 'económicas', 'poderosas', 'contra', 'Irán']
['RT', 'MoveOn', 'TONIGHT']
