# Political Toxicity: Conventional NLP Pipelines
Using Apache Spark, the project should be able to take in CSV data from social media posts and classify them accurately as toxic or not.

In [15]:
from pyspark.sql import SparkSession, functions as F, types as T
from pyspark.sql import DataFrame
spark = SparkSession.builder.appName('Test').getOrCreate()

In [61]:
from pyspark.sql.functions import col, trim, lower, when
"""
Maps the column from string values to double values.
0.0 in all unknown types or cases
"""
def map_toxic_hard(df, col_name="toxic_hard", new_col="toxic_score"):
    return df.withColumn(
        new_col,
        when(trim(lower(col(col_name))) == "true", 1)   # string TRUE
        .when(trim(lower(col(col_name))) == "false", 0) # string FALSE
        .otherwise(0)                                   # everything else
    )

def preprocessing_steps(df: DataFrame) -> DataFrame:
    temp = map_toxic_hard(df, "toxic_hard", "toxic_score")
    return temp.select('text','toxic_score') \
        .dropna(subset=['text'])

"""
Reads the CSV containing the training data at the given path 
and returns a spark dataframe.
"""
def read_train_data(path:str) -> DataFrame:
    seed_df = (
        spark.read
            .option("header", True)
            .option("inferSchema", True)
            .option("multiline", True)
            .csv(path,sep=',',ignoreLeadingWhiteSpace=True,ignoreTrailingWhiteSpace=True)
    )
    seed_df = preprocessing_steps(seed_df)
    return seed_df

In [62]:
train_data_path = "data/train/dataset.csv"
seed_df = read_train_data(train_data_path)

## Data Cleaning and Preparation

For all the tweets we need to remove all tokens that are not useful to the analysis. Then we need to create simple lists of relevant tokens for the unsupervised BERT model to identify.

In [64]:
seed_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- toxic_score: integer (nullable = false)



In [65]:
# Import the libraries we need
from pyspark.ml import Pipeline
from pyspark.sql import DataFrame
from pyspark.ml.feature import Tokenizer, StopWordsRemover, CountVectorizer, IDF
from pyspark.ml.feature import HashingTF
from pyspark.ml.classification import LogisticRegression

#from pyspark.ml import Transformer
#from pyspark.sql import DataFrame
from pyspark.sql.functions import col

In [66]:
# Pipeline builder

# ETL
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='cleaned_words')

# Hash/Vectorize
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol='features')

# Regression Estimation
lr = LogisticRegression(featuresCol='features',labelCol='toxic_score',
        maxIter=10, regParam=0.001)

# Pipeline
pipeline = Pipeline(stages=[tokenizer, remover, hashingTF, lr])

In [67]:
# Check whether the GPU is open
import torch
print(torch.cuda.is_available())
print(torch.cuda.device_count())
print(torch.cuda.get_device_name(0))

True
1
NVIDIA T500


In [68]:
""" DEBUG
# Print out all parameters for this stage
print("LogisticRegression parameters:")
print(lr.explainParams())
# If you want to check just the labelCol and featuresCol:
print("Label column:", lr.getLabelCol())
print("Features column:", lr.getFeaturesCol())
"""
# Now fit only on the required columns
model = pipeline.fit(seed_df)
model

PipelineModel_a60514c2f9fd

## Predict on Test Data
This will load a small test subset and confirm our predictions are correct.

In [69]:
test_path = "data/test/2pt_test.csv"

test_df = (
    spark.read
         .option("header", True)
         .option("inferSchema", True)
         .csv(test_path)
)

In [70]:
test_df.printSchema()

root
 |-- text: string (nullable = true)
 |-- toxic_score_true: double (nullable = true)



In [71]:
predictions = model.transform(test_df)
predictions.select('toxic_score_true','prediction','probability').show(truncate=False)

+----------------+----------+------------------------------------------+
|toxic_score_true|prediction|probability                               |
+----------------+----------+------------------------------------------+
|0.0             |0.0       |[0.9985773268381002,0.0014226731618998123]|
|1.0             |1.0       |[9.038140940731217E-5,0.9999096185905927] |
+----------------+----------+------------------------------------------+



## Split Validation
Repeat the pipeline building and analysis through split validation to demonstrate the accuracy of the model.

In [72]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
SEED = 22

In [73]:
tokenizer = Tokenizer(inputCol="text", outputCol="words")
remover = StopWordsRemover(inputCol=tokenizer.getOutputCol(), outputCol='cleaned_words')
hashingTF = HashingTF(inputCol=remover.getOutputCol(), outputCol='features')
lr = LogisticRegression(featuresCol='features',labelCol='toxic_score',
        maxIter=10, regParam=0.001)
pipeline = Pipeline(stages=[tokenizer,remover,hashingTF,lr])

In [76]:
grid = ParamGridBuilder().addGrid(lr.maxIter, [0, 1]).build()
evaluator = BinaryClassificationEvaluator(labelCol='toxic_score')
tvs = TrainValidationSplit(
    estimator=pipeline,parallelism=10,
    seed=SEED,
    evaluator=evaluator,
    estimatorParamMaps=grid)
train_df = read_train_data(train_data_path)

model = tvs.fit(train_df)
model.bestModel.transform(dataset=test_df)\
    .select('toxic_score_true','probability','prediction')\
    .show(truncate=False)


+----------------+-----------------------------------------+----------+
|toxic_score_true|probability                              |prediction|
+----------------+-----------------------------------------+----------+
|0.0             |[0.8884556385439323,0.11154436145606772] |0.0       |
|1.0             |[0.031650691973346586,0.9683493080266534]|1.0       |
+----------------+-----------------------------------------+----------+



In [77]:
model.validationMetrics

[0.5, 0.6995571644466337]