In [79]:
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from nltk.stem.snowball import SnowballStemmer
import pyspark.sql.functions as f
from pyspark.sql.functions import trim

In [2]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('local[4]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")



Just created a SparkContext


In [100]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('dataset/train.csv')
df_test = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('dataset/test.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [102]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: string (nullable = true)
 |-- severe_toxic: string (nullable = true)
 |-- obscene: string (nullable = true)
 |-- threat: string (nullable = true)
 |-- insult: string (nullable = true)
 |-- identity_hate: string (nullable = true)



In [101]:
df = df.dropna()
# df = df.select(trim("id"), 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
df = df.withColumn("id", trim(df. id))
df.count()

124633

In [103]:
df_test = df_test.dropna()
df_test = df_test.withColumn("id", trim(df_test. id))
df_test.count()

253637

In [104]:
df.registerTempTable("df");
df_new = sqlContext.sql("SELECT * FROM df WHERE LENGTH(toxic) == 1 AND LENGTH(id) == 16")

In [105]:
df_test.registerTempTable("df_test");
df_new_test = sqlContext.sql("SELECT * FROM df_test WHERE LENGTH(id) == 16")

In [106]:
df_new.show()

+----------------+--------------------+-----+------------+-------+------+------+-------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
|000103f0d9cfb60f|D'aww! He matches...|    0|           0|      0|     0|     0|            0|
|000113f07ec002fd|Hey man, I'm real...|    0|           0|      0|     0|     0|            0|
|0001d958c54c6e35|You, sir, are my ...|    0|           0|      0|     0|     0|            0|
|0002bcb3da6cb337|COCKSUCKER BEFORE...|    1|           1|      1|     0|     1|            0|
|00031b1e95af7921|Your vandalism to...|    0|           0|      0|     0|     0|            0|
|00037261f536c51d|Sorry if the word...|    0|           0|      0|     0|     0|            0|
|00040093b2687caa|alignment on this...|    0|           0|      0|     0|     0|            0|
|00070ef96486d6f9|Oh, and the girl ...|    0|     

In [107]:
df_new_test.show()

+----------------+--------------------+
|              id|        comment_text|
+----------------+--------------------+
|00001cee341fdb12|Yo bitch Ja Rule ...|
|0000247867823ef7|     == From RfC == |
|00013b17ad220c46|                   "|
|00017563c3f7919a|:If you have a lo...|
|00017695ad8997eb|I don't anonymous...|
|0001ea8717f6de06|Thank you for und...|
|00024115d4cbde0f|Please do not add...|
|000247e83dcc1211|:Dear god this si...|
|00025358d4737918|                   "|
|00026d1092fe71cc|== Double Redirec...|
|0002eadc3b301559|I think its crap ...|
|0002f87b16116a7f|"""::: Somebody w...|
|0003806b11932181|, 25 February 201...|
|0003e1cccfd5a40a|                   "|
|00059ace3e3e9a53|                   "|
|000634272d0d44eb|==Current Positio...|
|000663aff0fffc80|this other one fr...|
|000689dd34e20979|== Reason for ban...|
|000834769115370c|:: Wallamoose was...|
|000844b52dee5f3f||blocked]] from e...|
+----------------+--------------------+
only showing top 20 rows



In [108]:
(train_set, val_set) = df_new.randomSplit([0.98, 0.02], seed = 2000)

In [109]:
train_y = train_set.drop('id').drop('comment_text')

In [110]:
train_y.show()

+-----+------------+-------+------+------+-------------+
|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+-----+------------+-------+------+------+-------------+
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    1|           1|      1|     0|     1|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|     0|     0|            0|
|    0|           0|      0|   

In [111]:
train_x = train_set.select('id','comment_text')

In [112]:
train_x.show()

+----------------+--------------------+
|              id|        comment_text|
+----------------+--------------------+
|000103f0d9cfb60f|D'aww! He matches...|
|000113f07ec002fd|Hey man, I'm real...|
|0001d958c54c6e35|You, sir, are my ...|
|0002bcb3da6cb337|COCKSUCKER BEFORE...|
|00031b1e95af7921|Your vandalism to...|
|00037261f536c51d|Sorry if the word...|
|00040093b2687caa|alignment on this...|
|00070ef96486d6f9|Oh, and the girl ...|
|000897889268bc93|REDIRECT Talk:Voy...|
|0009801bd85e5806|The Mitsurugi poi...|
|000f35deef84dc4a|There's no need t...|
|000ffab30195c5e1|Yes, because the ...|
|0011cc71398479c4|How could I post ...|
|00128363e367d703|Not sure about a ...|
|0015f4aa35ebe9b5|pretty much every...|
|00169857adbc989b|Hi Explicit, can ...|
|00190820581d90ce|FUCK YOUR FILTHY ...|
|001c419c445b5a59|You had a point, ...|
|001c557175094f10|In other words, y...|
|001dc38a83d420cf|GET FUCKED UP. GE...|
+----------------+--------------------+
only showing top 20 rows



In [115]:
val_y = val_set.drop('id').drop('comment_text')

In [116]:
val_x = val_set.select('comment_text')

In [117]:
test_x = df_new_test.select('id','comment_text')

In [119]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline

tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
hashtf = HashingTF(numFeatures=2**16, inputCol="words", outputCol='tf')
idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5) #minDocFreq: remove sparse terms
# label_stringIdx = StringIndexer(inputCol = "target", outputCol = "label")
pipeline = Pipeline(stages=[tokenizer, hashtf, idf])

pipelineFit_X = pipeline.fit(train_x)
pipelineFit_VAL = pipeline.fit(val_x)
pipelineFit_X_test = pipeline.fit(test_x)
train_X = pipelineFit_X.transform(train_x)
val_X = pipelineFit_VAL.transform(val_x)
test_X = pipelineFit_X.transform(test_x)
train_X.show(100)

+----------------+--------------------+--------------------+--------------------+--------------------+
|              id|        comment_text|               words|                  tf|            features|
+----------------+--------------------+--------------------+--------------------+--------------------+
|000103f0d9cfb60f|D'aww! He matches...|[d'aww!, he, matc...|(65536,[2195,4714...|(65536,[2195,4714...|
|000113f07ec002fd|Hey man, I'm real...|[hey, man,, i'm, ...|(65536,[6589,1001...|(65536,[6589,1001...|
|0001d958c54c6e35|You, sir, are my ...|[you,, sir,, are,...|(65536,[389,2762,...|(65536,[389,2762,...|
|0002bcb3da6cb337|COCKSUCKER BEFORE...|[cocksucker, befo...|(65536,[1880,1835...|(65536,[1880,1835...|
|00031b1e95af7921|Your vandalism to...|[your, vandalism,...|(65536,[1714,7221...|(65536,[1714,7221...|
|00037261f536c51d|Sorry if the word...|[sorry, if, the, ...|(65536,[1177,1880...|(65536,[1177,1880...|
|00040093b2687caa|alignment on this...|[alignment, on, t...|(65536,[1880,

In [120]:
val_X.show()

+--------------------+--------------------+--------------------+--------------------+
|        comment_text|               words|                  tf|            features|
+--------------------+--------------------+--------------------+--------------------+
|For your informat...|[for, your, infor...|(65536,[649,1009,...|(65536,[649,1009,...|
|Same for File:Sea...|[same, for, file:...|(65536,[11104,233...|(65536,[11104,233...|
|the episode list ...|[the, episode, li...|(65536,[9389,1437...|(65536,[9389,1437...|
|The real personal...|[the, real, perso...|(65536,[3085,1863...|(65536,[3085,1863...|
|Vince - FYI 'Yell...|[vince, -, fyi, '...|(65536,[568,711,1...|(65536,[568,711,1...|
|Hahaha, you dont ...|[hahaha,, you, do...|(65536,[1903,2026...|(65536,[1903,2026...|
|Helpme is not to ...|[helpme, is, not,...|(65536,[438,989,6...|(65536,[438,989,6...|
|I'm not going to ...|[i'm, not, going,...|(65536,[7791,1001...|(65536,[7791,1001...|
|I know what you'r...|[i, know, what, y...|(65536,[835