# Political Toxicity: Conventional NLP Pipelines
Using Apache Spark, the project should be able to take in CSV data from social media posts and classify them accurately as toxic or not.

In [15]:
import pandas as pd
import matplotlib.pyplot as matplot

In [16]:
file_path = "data/train/dataset.csv"
pd.read_csv(file_path).head()

Unnamed: 0,ID,text,link,hateful_hard,hateful_soft,hateful_stats,offensive_hard,offensive_soft,offensive_stats,toxic_hard,...,overlap_hot,overlap_ht,overlap_ot,sarcasm,second,third,threat,toxic_only,toxic_rater_effect,undirected
0,0,@realDailyWire It's time for the #Military #JA...,https://www.twitter.com/xxx/status/14321700388...,False,0.0,"[5, 0]",True,0.6,"[2, 3]",False,...,False,False,False,n,0.0,0.0,0.0,False,0.197414,1.0
1,1,@CBCAlerts F@ck it! Bring over the Taliban and...,https://www.twitter.com/xxx/status/14230597186...,True,0.8,"[1, 4]",True,1.0,"[0, 5]",True,...,False,False,False,n,0.0,0.0,0.0,False,0.273154,1.0
2,2,@AmberGDay @AJEnglish Who had reported this? A...,https://www.twitter.com/xxx/status/14322009126...,False,0.2,"[4, 1]",True,1.0,"[0, 5]",False,...,False,False,False,n,0.0,0.0,0.0,False,0.212844,1.0
3,3,… “This includes using all of his oversight au...,https://www.twitter.com/xxx/status/14282266623...,False,0.0,"[5, 0]",False,0.2,"[4, 1]",False,...,False,False,True,n,0.0,1.0,0.0,False,0.173645,1.0
4,4,@darkmikasonfire @CHSommers Ok fascist bot. @t...,https://www.twitter.com/xxx/status/14230076189...,False,0.4,"[3, 2]",True,1.0,"[0, 5]",True,...,False,False,False,y,0.0,0.0,0.0,False,0.162776,1.0


In [17]:
from pyspark.sql import SparkSession, functions as F, types as T
spark = SparkSession.builder.appName('Test').getOrCreate()

file_path = "data/train/dataset.csv"

seed_df = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .option("multiline", True)
         .csv(file_path,sep=',',ignoreLeadingWhiteSpace=True,ignoreTrailingWhiteSpace=True)
)

print(seed_df.columns)
seed_df = seed_df.drop('link')
print(seed_df.columns)

['ID', 'text', 'link', 'hateful_hard', 'hateful_soft', 'hateful_stats', 'offensive_hard', 'offensive_soft', 'offensive_stats', 'toxic_hard', 'toxic_soft', 'toxic_stats', 'composite_hateful_hard', 'composite_hateful_soft', 'composite_hateful_stats', 'composite_offensive_hard', 'composite_offensive_soft', 'composite_offensive_stats', 'composite_toxic_hard', 'composite_toxic_soft', 'composite_toxic_stats', 'accusation', 'behavior', 'benign', 'communication_style', 'composite_hateful_rater_effect', 'composite_offensive_rater_effect', 'composite_toxic_rater_effect', 'context_necessary', 'directed', 'first', 'group', 'group_or_individual', 'hateful_only', 'hateful_rater_effect', 'individual', 'insult', 'misinformation', 'multiple_behaviors', 'namecalling', 'not_first', 'offensive_only', 'offensive_rater_effect', 'overlap', 'overlap_ho', 'overlap_hot', 'overlap_ht', 'overlap_ot', 'sarcasm', 'second', 'third', 'threat', 'toxic_only', 'toxic_rater_effect', 'undirected']
['ID', 'text', 'hateful_

## Data Cleaning and Preparation

For all the tweets we need to remove all tokens that are not useful to the analysis. Then we need to create simple lists of relevant tokens for the unsupervised BERT model to identify.

In [18]:
"""
Maps each given string to an output list of tokens for analysis.
"""
import re
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer



# Initialize lemmatizer and stemmer
lemmatizer = WordNetLemmatizer()

def clean_pipeline_conv(text):
    # lower all the cases in 'Review_text'
    text = text['text'].lower()
    # removing special characters, keeping words and numbers
    no_mentions = re.sub(r"@\w*\s", "", text)
    clean_text = re.sub(r"[^a-zA-Z0-9]", " ", no_mentions)
    # tokenizing sentences into words
    token_list = word_tokenize(clean_text)
    # lemmatizing the list without stopwords - skipping this we don't know what model to use yet
    #lemma_words = [lemmatizer.lemmatize(word) for word in token_list]
    
    return token_list
#df = seed_df.rdd.map(clean_pipeline_conv)

from pyspark.ml.feature import Tokenizer, RegexTokenizer
from pyspark.sql.functions import col, udf
from pyspark.sql.types import IntegerType

from pyspark.sql import SparkSession, functions as F, types as T
spark = SparkSession.builder.appName('Test').getOrCreate()
seed_df = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .option("multiline", True)
         .csv(file_path,sep=',',ignoreLeadingWhiteSpace=True,ignoreTrailingWhiteSpace=True)
)

"""
Above is conventional cleaning. Here is a pipeline for spark.
"""
def clean_pipeline_spark(df):
    tk = Tokenizer(inputCol='text',outputCol='clean_tokens')
    countTokens = udf(lambda words: len(words), IntegerType())
    tokenized = tk.transform(df)
    tokenized.select("text",'clean_tokens').show()
        #.withColumn('tokens', countTokens(col('clean_tokens')))\
        
clean_pipeline_spark(seed_df)

+--------------------+--------------------+
|                text|        clean_tokens|
+--------------------+--------------------+
|@realDailyWire It...|[@realdailywire, ...|
|@CBCAlerts F@ck i...|[@cbcalerts, f@ck...|
|@AmberGDay @AJEng...|[@ambergday, @aje...|
|… “This includes ...|[…, “this, includ...|
|@darkmikasonfire ...|[@darkmikasonfire...|
|@TheTweetOfJohn D...|[@thetweetofjohn,...|
|@7NewsMelbourne A...|[@7newsmelbourne,...|
|@knowles204 @John...|[@knowles204, @jo...|
|@InimitableMc @BB...|[@inimitablemc, @...|
|@KXAN_News Why ar...|[@kxan_news, why,...|
|A drone strike he...|[a, drone, strike...|
|@JaniceDean Both ...|[@janicedean, bot...|
|@HeyYo210 @Michae...|[@heyyo210, @mich...|
|@CNNPolitics Lol ...|[@cnnpolitics, lo...|
|@nypost #Fauci fu...|[@nypost, #fauci,...|
|@nypost THIS ASSH...|[@nypost, this, a...|
|@nypost @williaml...|[@nypost, @willia...|
|@ObangMetho I am ...|[@obangmetho, i, ...|
|@nytimes Left beh...|[@nytimes, left, ...|
|@JoeMomma833 @hit...|[@joemomma

In [20]:
seed_df.select('toxic_hard').distinct().show()

+--------------------+
|          toxic_hard|
+--------------------+
|               FALSE|
|                   0|
|                 0.2|
|                 0.8|
|                TRUE|
|              [1, 4]|
|              [2, 3]|
|              [4, 1]|
|                 0.6|
|https://www.youtu...|
|                   1|
|              [0, 5]|
|              [5, 0]|
|                 0.4|
|              [3, 2]|
|                NULL|
+--------------------+



In [26]:
from pyspark.sql.functions import col, trim, lower, when

def map_toxic_hard(df, col_name="toxic_hard", new_col="toxic_score"):
    return df.withColumn(
        new_col,
        when(trim(lower(col(col_name))) == "true", 1)   # string TRUE
        .when(trim(lower(col(col_name))) == "false", 0) # string FALSE
        .otherwise(0)                                   # everything else
    )

seed_df = map_toxic_hard(seed_df, "toxic_hard", "toxic_score")
seed_df.select("toxic_hard", "toxic_score").show(13000)



+--------------------+-----------+
|          toxic_hard|toxic_score|
+--------------------+-----------+
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|               FALSE|          0|
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|               FALSE|          0|
|               FALSE|          0|
|                TRUE|          1|
|                TRUE|          1|
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|               FALSE|          0|
|                TRUE|          1|
|               FALSE|          0|
|                TRUE|          1|
|               FALS

In [5]:
seed_df.printSchema()

root
 |-- ID: string (nullable = true)
 |-- text: string (nullable = true)
 |-- link: string (nullable = true)
 |-- hateful_hard: string (nullable = true)
 |-- hateful_soft: string (nullable = true)
 |-- hateful_stats: string (nullable = true)
 |-- offensive_hard: string (nullable = true)
 |-- offensive_soft: string (nullable = true)
 |-- offensive_stats: string (nullable = true)
 |-- toxic_hard: string (nullable = true)
 |-- toxic_soft: string (nullable = true)
 |-- toxic_stats: string (nullable = true)
 |-- composite_hateful_hard: string (nullable = true)
 |-- composite_hateful_soft: string (nullable = true)
 |-- composite_hateful_stats: string (nullable = true)
 |-- composite_offensive_hard: string (nullable = true)
 |-- composite_offensive_soft: string (nullable = true)
 |-- composite_offensive_stats: string (nullable = true)
 |-- composite_toxic_hard: string (nullable = true)
 |-- composite_toxic_soft: string (nullable = true)
 |-- composite_toxic_stats: string (nullable = true)
 

In [53]:
# Import the libraries we need
from pyspark.ml import Pipeline
from pyspark.sql import DataFrame
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import HashingTF
from pyspark.ml.classification import LogisticRegression

from pyspark.ml import Transformer
from pyspark.sql import DataFrame
from pyspark.sql.functions import when, col, array, lit

In [54]:


seed_df = seed_df.select('text','toxic_score')
seed_df = seed_df.withColumn("toxic_score", col("toxic_score").cast("double"))
seed_df = seed_df.dropna(subset=['text'])

tokenizer = Tokenizer(inputCol="text", outputCol="words")
hashingTF = HashingTF(inputCol=tokenizer.getOutputCol(), outputCol="features")
lr = LogisticRegression(featuresCol='features',labelCol='toxic_score',
        maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer, hashingTF, lr])


In [None]:
# Check whether the GPU is open
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
1
NVIDIA T500


In [None]:
""" DEBUG
# Print out all parameters for this stage
print("LogisticRegression parameters:")
print(lr.explainParams())
# If you want to check just the labelCol and featuresCol:
print("Label column:", lr.getLabelCol())
print("Features column:", lr.getFeaturesCol())
"""
# Now fit only on the required columns
model = pipeline.fit(seed_df.select('text','toxic_score'))
model

Label column: toxic_score
Features column: features


PipelineModel_c98e73aa732c

## Predict on Test Data
This will load a small test subset and confirm our predictions are correct.

In [50]:
test_path = "data/test/2pt_test.csv"

test_df = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .csv(test_path)
)

In [51]:
test_df.printSchema()

root
 |-- text: string (nullable = true)
 |--  toxic_score_true: double (nullable = true)



In [52]:
predictions = model.transform(test_df)
predictions.drop('text','words','features','rawPrediction').show(truncate=False)

+-----------------+------------------------------------------+----------+
| toxic_score_true|probability                               |prediction|
+-----------------+------------------------------------------+----------+
|0.0              |[0.9986656838422553,0.0013343161577447082]|0.0       |
|1.0              |[8.706487859701775E-5,0.999912935121403]  |1.0       |
+-----------------+------------------------------------------+----------+

