In [1]:
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:95% !important; }</style>"))

In [2]:
import os
os.environ["PYSPARK_PYTHON"]="/usr/bin/python3"

In [3]:
from pyspark import SparkContext
import pyspark
conf = pyspark.SparkConf()
conf = conf.set('spark.local.dir', '/home/jimmy/spark_tmp')
conf = conf.set('spark.executor.memory', '15G')
conf = conf.set('spark.driver.memory', '15G')
conf = conf.set('spark.driver.maxResultSize', '15G')
conf = conf.set("spark.driver.host", "localhost")
#conf.set('spark.cores.max', '8')
#conf = conf.set("spark.default.parallelism", 8)
sc = SparkContext(appName="Train classifier", conf=conf)
sc.setCheckpointDir('checkpoint/')
sc

In [4]:
from pyspark.sql import SparkSession
spark = SparkSession(sc)
spark

In [5]:
files = sc.textFile("/home/jimmy/Documents/courses/spark/data/*/part-00[0-9]*")
data = spark.read.json(files)

In [8]:
from pyspark import StorageLevel

#data = data.persist(StorageLevel.MEMORY_AND_DISK)
#data = data.checkpoint(eager=True)

#data.count()

In [None]:
vandal = data.filter(data.label == "vandal")

In [None]:
unsafe = data.filter(data.label == "unsafe")

In [None]:
safe = data.filter(data.label == "safe").limit(60_000)

In [None]:
data = vandal.union(unsafe).union(safe)

In [None]:
#data = data.persist(StorageLevel.MEMORY_AND_DISK)
#data = data.checkpoint(eager=True)
#data.count()

In [6]:
data = spark.read.load("/home/jimmy/Documents/courses/spark/notebooks/processed_data")

In [7]:
from difflib import unified_diff

def make_diff(old, new):
    additions = []
    deletions = []
    generator = unified_diff(old.split(), new.split())
    for l in generator:
        if l.startswith('+'):
            additions.append(l[1:])
        elif l.startswith('-'):
            deletions.append(l[1:])
    additions = ' '.join(additions)
    deletions = ' '.join(deletions)
    return (additions, deletions)

In [9]:
from pyspark.sql import Row
from pyspark.sql.functions import udf, struct, array, col, lit, lower
from pyspark.sql.types import StringType, LongType
from pyspark.sql.functions import udf

profanities = [s.strip() for s in open("/home/jimmy/Documents/courses/spark/profanities.txt", "r").readlines()]
profanities

@udf("string")
def additions(old, new):
    (additions, _) = make_diff(old, new)
    return additions

@udf("string")
def deletions(old, new):
    (_, deletions) = make_diff(old, new)
    return deletions

@udf("long")
def longest_same_character_sequence(additions):
    ans, curr = 0, 1
    previous = None
    for c in additions:
        if c == previous:
            curr += 1
        else:
            curr = 1
        if curr > ans:
            ans = curr
        previous = c
    return ans

@udf("long")
def count_profanities(additions):
    count = 0
    additions = additions
    for profanity in profanities:
        count += additions.count(profanity)
    return count

# count_profanities_udf = udf(count_profanities, LongType())

def process_dataframe(df):
    return df \
        .withColumn("additions", lower(additions("text_old", "text_new"))) \
        .withColumn("deletions", lower(deletions("text_old", "text_new"))) \
        .drop("text_old") \
        .withColumn("profanities", count_profanities("additions")) \
        .withColumn("longest_same_character_sequence", longest_same_character_sequence("additions"))

In [10]:
data = process_dataframe(data)
data.show()

+--------------------+-----+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+-------------------------------+
|             comment|label|       name_user|            text_new|          title_page|            url_page|           additions|           deletions|profanities|longest_same_character_sequence|
+--------------------+-----+----------------+--------------------+--------------------+--------------------+--------------------+--------------------+-----------+-------------------------------+
|                    | safe|  Jellysandwich0|{{short descripti...|A Complicated Kin...|//en.wikipedia.or...|          ++ 
 marry| -- 
 get married to|          0|                              2|
|→‎Bio oil:Updated...| safe|    Mirja kemppi|{{short descripti...|       UPM (company)|//en.wikipedia.or...|                ++ 
|-- 
 ===bio oil==...|          0|                              2|
|       →‎Taxonomy:ce| sa

In [None]:
#data = data.persist(StorageLevel.MEMORY_AND_DISK)
#data = data.checkpoint(eager=True)
#data.count()

In [11]:
drop_list = ['url_page', 'title_page', 'name_user', 'comment']
data = data.drop(*drop_list) #.coalesce(10_000).cache()
data.show(5)

+-----+--------------------+--------------------+--------------------+-----------+-------------------------------+
|label|            text_new|           additions|           deletions|profanities|longest_same_character_sequence|
+-----+--------------------+--------------------+--------------------+-----------+-------------------------------+
| safe|{{short descripti...|          ++ 
 marry| -- 
 get married to|          0|                              2|
| safe|{{short descripti...|                ++ 
|-- 
 ===bio oil==...|          0|                              2|
| safe|{{speciesbox
| na...|++ 
 collection"....|-- 
 collection";...|          0|                              2|
| safe|{{Tone|date=April...| ++ 
 3<sup>rd</sup>|          -- 
 third|          0|                              2|
| safe|{{Infobox televis...|++ 
 marry pérez ...|-- 
 get married ...|          0|                              2|
+-----+--------------------+--------------------+--------------------+----------

In [None]:
#data = data.persist(StorageLevel.MEMORY_AND_DISK)
#data = data.checkpoint(eager=True)
#data.count()

In [12]:
from pyspark.ml.feature import StringIndexerModel, IndexToString
from pyspark.ml import PipelineModel

text_columns = ['text_new', 'additions', 'deletions']
input_columns = text_columns + ['profanities','longest_same_character_sequence']
pipelines = {}

for col in text_columns:
    pipelines[col] = PipelineModel.load(f'/home/jimmy/Documents/courses/spark/notebooks/pipeline_{col}')
pipelines['label'] = StringIndexerModel.load(f'/home/jimmy/Documents/courses/spark/notebooks/pipeline_label')

In [13]:
for col in text_columns:
    data = pipelines[col].transform(data).drop(col, 'temp1', 'temp2', 'temp3').withColumnRenamed('temp4', col)
data = pipelines['label'].transform(data)

In [None]:
#data = data.persist(StorageLevel.MEMORY_AND_DISK)
#data = data.checkpoint(eager=True)
#data.count()

In [14]:
from pyspark.ml.linalg import Vectors
from pyspark.ml.feature import VectorAssembler
assembler = VectorAssembler(inputCols=input_columns,outputCol="features")
data = assembler.transform(data).drop(*input_columns)
data.show(5)

+-----+------+--------------------+
|label|target|            features|
+-----+------+--------------------+
| safe|   0.0|(786434,[373,400,...|
| safe|   0.0|(786434,[67,227,4...|
| safe|   0.0|(786434,[1707,195...|
| safe|   0.0|(786434,[3484,357...|
| safe|   0.0|(786434,[285,315,...|
+-----+------+--------------------+
only showing top 5 rows



In [15]:
#data = data.persist(StorageLevel.MEMORY_AND_DISK)
#data = data.checkpoint(eager=True)
#data.count()

In [16]:
#data = data.persist(StorageLevel.DISK_ONLY)
#data = data.checkpoint(eager=True)
#data.count()

In [17]:
# set seed for reproducibility
(trainingData, testData) = data.randomSplit([0.7, 0.3], seed = 100)
trainingData = trainingData.persist(StorageLevel.MEMORY_ONLY)
testData = testData.persist(StorageLevel.MEMORY_ONLY)
print("Training Dataset Count: " + str(trainingData.count()))
print("Test Dataset Count: " + str(testData.count()))

Training Dataset Count: 42029
Test Dataset Count: 17910


In [18]:
#trainingData = trainingData.persist(StorageLevel.DISK_ONLY)
#trainingData = trainingData.checkpoint(eager=True)
#trainingData.count()

In [19]:
#testData = testData.persist(StorageLevel.DISK_ONLY)
#testData = testData.checkpoint(eager=True)
#testData.count()

In [20]:
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression

lr = LogisticRegression(labelCol='target', maxIter=20, regParam=0.3, elasticNetParam=0)

label_converter = IndexToString(inputCol="prediction", outputCol="predictedLabel",
                               labels=pipelines['label'].labels)

pipeline = Pipeline(stages=[lr, label_converter])
pipelineModel = pipeline.fit(trainingData)

results = pipelineModel.transform(testData)

In [21]:
results.show()

+-----+------+--------------------+--------------------+--------------------+----------+--------------+
|label|target|            features|       rawPrediction|         probability|prediction|predictedLabel|
+-----+------+--------------------+--------------------+--------------------+----------+--------------+
| safe|   0.0|(786434,[1,55,60,...|[2.11138391163214...|[0.40368446211901...|       1.0|        unsafe|
| safe|   0.0|(786434,[1,267,33...|[6.03109964896998...|[0.99857887952331...|       0.0|          safe|
| safe|   0.0|(786434,[1,297,30...|[0.24515096567145...|[0.07113210577865...|       1.0|        unsafe|
| safe|   0.0|(786434,[1,347,21...|[1.57948638393767...|[0.59916883116242...|       0.0|          safe|
| safe|   0.0|(786434,[2,844,10...|[1.02830647554900...|[0.51674961481456...|       0.0|          safe|
| safe|   0.0|(786434,[3,46,83,...|[3.36504599293499...|[0.97713290003446...|       0.0|          safe|
| safe|   0.0|(786434,[3,153,37...|[1.37325166877675...|[0.61838

In [22]:
results.printSchema()

root
 |-- label: string (nullable = true)
 |-- target: double (nullable = false)
 |-- features: vector (nullable = true)
 |-- rawPrediction: vector (nullable = true)
 |-- probability: vector (nullable = true)
 |-- prediction: double (nullable = false)
 |-- predictedLabel: string (nullable = true)



In [None]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator
evaluator = MulticlassClassificationEvaluator(labelCol="target", predictionCol="prediction")

evaluator.evaluate(results)

In [None]:
pipelineModel.save(f'/home/jimmy/Documents/courses/spark/notebooks/pipeline_classifier')

In [None]:
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator # Create ParamGrid for Cross Validation
paramGrid = (ParamGridBuilder()
             .addGrid(lr.regParam, [0.1, 0.3, 0.5]) # regularization parameter
             .addGrid(lr.elasticNetParam, [0.0, 0.1, 0.2]) # Elastic Net Parameter (Ridge = 0)
#            .addGrid(model.maxIter, [10, 20, 50]) #Number of iterations
#            .addGrid(idf.numFeatures, [10, 100, 1000]) # Number of features
             .build()) # Create 5-fold CrossValidator
cv = CrossValidator(estimator=lr, \
                    estimatorParamMaps=paramGrid, \
                    evaluator=evaluator, \
                    numFolds=5)
cvModel = cv.fit(trainingData)

predictions = cvModel.transform(testData)
# Evaluate best model
evaluator.evaluate(predictions)

In [None]:
cvModel.save(f'/home/jimmy/Documents/courses/spark/notebooks/logistic_regression_classifier')

In [None]:
pipeline = Pipeline(stages=[cvModel, label_converter])
pipeline.save(f'/home/jimmy/Documents/courses/spark/notebooks/pipeline_logistic_regression_classifier')

In [None]:
sc.stop()