# Imports

In [0]:
import pyspark
import sparknlp
import pyspark.sql.functions as F
from pyspark.sql.types import *
from pyspark.ml.feature import *
from pyspark.ml import Pipeline
from pyspark.mllib.evaluation import MulticlassMetrics
from sparknlp.annotator import *
from sparknlp.common import *
from sparknlp.base import *
from sklearn.metrics import classification_report

# Model

## Training Dataset

In [0]:
russian_propaganda = (spark.read.table("russian_propaganda")
    .withColumn("is_propaganda", F.lit(1.0)))

true_news = (spark.read.table("non_propaganda_csv")
    .select(F.col("5").alias("text"))
    .withColumn("is_propagande", F.lit(0.0)))

train_val_fraction = 0.2
russian_peopaganda_train_count = int(russian_propaganda.count() * (1 - train_val_fraction))
true_news_train_count = int(true_news.count() * (1 - train_val_fraction))

training_data = (russian_propaganda.limit(russian_peopaganda_train_count)
    .union(true_news.limit(true_news_train_count))
    .cache()
)

test_data = (russian_propaganda.offset(russian_peopaganda_train_count)
    .union(true_news.offset(true_news_train_count))
    .cache()
)

In [0]:
display(training_data.head(5))

text,is_propaganda
The Holodomor theme is a term that appeared in the Cold War as an element of anti-Soviet propaganda. There was no famine in Ukraine because there is no documentary evidence of this.,1.0
The US is supporting terrorists in Iran and aims to organise a coup in the country.,1.0
"George Soros is behind many political convulsions around the world, e.g. creating anti-establishment parties such as SYRIZA (Greece) and PODEMOS (Spain).",1.0
"The Ukrainian secret service manipulated a sound recording in the MH17 investigation.As an indication of the manipulation, the Malaysian expert mentions the noise level in the sound recording: ""That is manipulated. The noise level differs from this audio track. See this soundtrack, the noise level differs in this part. It is lower here and higher here"".Audio parts had been cut together. There are also many sections that have been cut out. ""That is clear, you can see that, the lack of parts of the audio recording,"" the analyst added.",1.0
Belgian PM Charles Micheal told the press after his meeting with Dmitrij Medvedev that the EU has had enough of self-inflicted harm and will stop supporting the sanctions. The reason for the conflict in Eastern Ukraine - which brought about the EU sanctions - is the US-led coup. Russia does not even have troops in the Donbas and their support for the two separatist republic is only a way to stop the Ukrainian government.,1.0


In [0]:
display(training_data.groupBy("is_propaganda").count())

is_propaganda,count
1.0,14503
0.0,39903


## Training

In [0]:
documentAssembler = (
    DocumentAssembler()
    .setInputCol("text")
    .setOutputCol("document")
)
    
encoder = (
    UniversalSentenceEncoder.pretrained("tfhub_use", "en")
    .setInputCols(["document"])
    .setOutputCol("sentence_embeddings")
)

model = (
    ClassifierDLApproach()
    .setInputCols("sentence_embeddings")
    .setOutputCol("prediction")
    .setLabelColumn("is_propaganda")
    .setMaxEpochs(10)
    .setEnableOutputLogs(True)
)

pipeline = Pipeline(
    stages = [
        documentAssembler,
        encoder,
        model
    ]
)

tfhub_use download started this may take some time.
Approximate size to download 923.7 MB
[ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][ \ ][ | ][ / ][ — ][OK!]


In [0]:
sentiment_predictor = pipeline.fit(training_data)

## Validation

In [0]:
predictions = (sentiment_predictor.transform(test_data)
    .select("is_propaganda", F.element_at(F.col("prediction.result"), 1).alias("prediction"))
    .withColumn("prediction", F.when(F.col("prediction") == "1.0", F.lit(1.0)).otherwise(F.lit(0.0)))
    .toPandas())

In [0]:
print(classification_report(predictions["is_propaganda"], predictions["prediction"], target_names=["non_propaganda", "propaganda"]))

                precision    recall  f1-score   support

non_propaganda       1.00      1.00      1.00      9976
    propaganda       1.00      1.00      1.00      3626

      accuracy                           1.00     13602
     macro avg       1.00      1.00      1.00     13602
  weighted avg       1.00      1.00      1.00     13602



In [0]:
sentences = [
    "I like to drink beer",
    "You are bad guy",
    "USA confront Russian and wants to destroy it",
    "Ukraine will lose the war",
    "i was in ukraine, it is beautiful"
]
schema = schema = StructType([StructField("text", StringType(), True)])
data = [Row(text=sentence) for sentence in sentences]
test = spark.createDataFrame(data)

In [0]:
test_predictions = (sentiment_predictor.transform(test)
    .select("text", F.element_at(F.col("prediction.result"), 1).alias("prediction"))
    .withColumn("prediction", F.when(F.col("prediction") == "1.0", F.lit(1.0)).otherwise(F.lit(0.0))))
    
display(test_predictions)

text,prediction
I like to drink beer,0.0
You are bad guy,0.0
USA confront Russian and wants to destroy it,1.0
Ukraine will lose the war,1.0
"i was in ukraine, it is beautiful",0.0
