In [75]:
import findspark
findspark.init()
import pyspark as ps
import warnings
from pyspark.sql import SQLContext
from nltk.stem.snowball import SnowballStemmer
import pyspark.sql.functions as f
from pyspark.sql.functions import trim
from pyspark.sql.types import DoubleType,DateType


In [76]:
from pyspark.ml.feature import HashingTF, IDF, Tokenizer
from pyspark.ml.feature import StringIndexer
from pyspark.ml import Pipeline
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.classification import LinearSVC

Загружаем данные

In [77]:
try:
    # create SparkContext on all CPUs available: in my case I have 4 CPUs on my laptop
    sc = ps.SparkContext('local[4]')
    sqlContext = SQLContext(sc)
    print("Just created a SparkContext")
except ValueError:
    warnings.warn("SparkContext already exists in this scope")



In [78]:
df = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('dataset/train.csv')
df_test = sqlContext.read.format('com.databricks.spark.csv').options(header='true', inferschema='true').load('dataset/test.csv')
type(df)

pyspark.sql.dataframe.DataFrame

In [79]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: string (nullable = true)
 |-- severe_toxic: string (nullable = true)
 |-- obscene: string (nullable = true)
 |-- threat: string (nullable = true)
 |-- insult: string (nullable = true)
 |-- identity_hate: string (nullable = true)



In [80]:
df = df.dropna()
# df = df.select(trim("id"), 'comment_text', 'toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate')
df = df.withColumn("id", trim(df. id))
df.count()

124633

In [81]:
df_test = df_test.dropna()
df_test = df_test.withColumn("id", trim(df_test. id))
df_test.count()

253637

In [82]:
df.registerTempTable("df");
df = sqlContext.sql("SELECT * FROM df WHERE LENGTH(toxic) == 1 AND LENGTH(id) == 16")

In [83]:
df = df.withColumn("toxic",df.toxic.cast(DoubleType()))
df = df.withColumn("severe_toxic",df.severe_toxic.cast(DoubleType()))
df = df.withColumn("obscene",df.obscene.cast(DoubleType()))
df = df.withColumn("threat",df.threat.cast(DoubleType()))
df = df.withColumn("insult",df.insult.cast(DoubleType()))
df = df.withColumn("identity_hate",df.identity_hate.cast(DoubleType()))

In [84]:
df.printSchema()

root
 |-- id: string (nullable = true)
 |-- comment_text: string (nullable = true)
 |-- toxic: double (nullable = true)
 |-- severe_toxic: double (nullable = true)
 |-- obscene: double (nullable = true)
 |-- threat: double (nullable = true)
 |-- insult: double (nullable = true)
 |-- identity_hate: double (nullable = true)



In [85]:
df.show()

+----------------+--------------------+-----+------------+-------+------+------+-------------+
|              id|        comment_text|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+----------------+--------------------+-----+------------+-------+------+------+-------------+
|000103f0d9cfb60f|D'aww! He matches...|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|000113f07ec002fd|Hey man, I'm real...|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|0001d958c54c6e35|You, sir, are my ...|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|0002bcb3da6cb337|COCKSUCKER BEFORE...|  1.0|         1.0|    1.0|   0.0|   1.0|          0.0|
|00031b1e95af7921|Your vandalism to...|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|00037261f536c51d|Sorry if the word...|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|00040093b2687caa|alignment on this...|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|00070ef96486d6f9|Oh, and the girl ...|  0.0|     

In [141]:
(train_set, val_set) = df.randomSplit([0.8, 0.2], seed = 2000)

In [142]:
train_y = train_set.drop('id').drop('comment_text')

In [143]:
train_y.show()

+-----+------------+-------+------+------+-------------+
|toxic|severe_toxic|obscene|threat|insult|identity_hate|
+-----+------------+-------+------+------+-------------+
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  1.0|         1.0|    1.0|   0.0|   1.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  0.0|         0.0|    0.0|   0.0|   0.0|          0.0|
|  1.0|         0.0|    1.0|   0.0|   1.0|          0.0|
|  0.0|         0.0|    0.0|   

In [144]:
train_x = train_set.select('id','comment_text')

In [145]:
train_x.show()

+----------------+--------------------+
|              id|        comment_text|
+----------------+--------------------+
|000103f0d9cfb60f|D'aww! He matches...|
|000113f07ec002fd|Hey man, I'm real...|
|0001d958c54c6e35|You, sir, are my ...|
|0002bcb3da6cb337|COCKSUCKER BEFORE...|
|00031b1e95af7921|Your vandalism to...|
|00037261f536c51d|Sorry if the word...|
|00040093b2687caa|alignment on this...|
|00070ef96486d6f9|Oh, and the girl ...|
|000897889268bc93|REDIRECT Talk:Voy...|
|0009801bd85e5806|The Mitsurugi poi...|
|000f35deef84dc4a|There's no need t...|
|000ffab30195c5e1|Yes, because the ...|
|0015f4aa35ebe9b5|pretty much every...|
|00190820581d90ce|FUCK YOUR FILTHY ...|
|001c419c445b5a59|You had a point, ...|
|001c557175094f10|In other words, y...|
|001dc38a83d420cf|GET FUCKED UP. GE...|
|001e89eb3f0b0915|Are you threateni...|
|001ee16c46a99262|Thanks! Undeletio...|
|0020e7119b96eeeb|Stupid peace of s...|
+----------------+--------------------+
only showing top 20 rows



In [146]:
train_set = train_set.dropna()
val_set = val_set.dropna()

HashingTF и IDF для логистической регрессии и SVM

In [96]:
for i in range(8,15):
    tokenizer = Tokenizer(inputCol="comment_text", outputCol="words")
    hashtf = HashingTF(numFeatures=2**i, inputCol="words", outputCol='tf')
    idf = IDF(inputCol='tf', outputCol="features", minDocFreq=5)
    auc_av_reg = 0
    auc_av_svm = 0
    for label in ['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate']:
        log_reg=LogisticRegression(featuresCol='features', labelCol=label)
        svm = LinearSVC(labelCol=label)
        pipeline = Pipeline(stages=[tokenizer, hashtf, idf, log_reg])
        pipeline2 = Pipeline(stages=[tokenizer, hashtf, idf, svm])
    
        pipelineFit_reg = pipeline.fit(train_set)
        pipelineFit_svm = pipeline2.fit(train_set)
        val_reg = pipelineFit_reg.transform(val_set)
        val_svm = pipelineFit_svm.transform(val_set)
        comment_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol=label)
    
        auc_av_reg += comment_eval.evaluate(val_reg) / 6
        auc_av_svm += comment_eval.evaluate(val_svm) / 6
    print('average logreg auc for', i*nFeatures, 'features:', auc_av_reg)
    print('average svm auc for', i*nFeatures, 'features:', auc_av_svm)


average logreg auc for 2048 features: 0.5402676269243784
average svm auc for 2048 features: 0.5081478320025472
average logreg auc for 2304 features: 0.5920918408354721
average svm auc for 2304 features: 0.5330903811308944
average logreg auc for 2560 features: 0.6281785754869031
average svm auc for 2560 features: 0.5811089030728485
average logreg auc for 2816 features: 0.6662553673563634
average svm auc for 2816 features: 0.6232626310513292
average logreg auc for 3072 features: 0.6695717031519337
average svm auc for 3072 features: 0.6753558618522904
average logreg auc for 3328 features: 0.6652174503520631
average svm auc for 3328 features: 0.6820810259036854
average logreg auc for 3584 features: 0.6834253499518372
average svm auc for 3584 features: 0.6874731060262619


В данном случае можно сделать вывод, что с увеличением фичей классификаторы пропорционально увеличивают качество работы, логистическая регрессия в данном случае показывает лучший результат при маленьком количестве фичей в то время как SVM выходит вперед при большем количестве фичей.

Word2vec

In [138]:
from pyspark.ml.feature import Word2Vec
from pyspark.sql.functions import array, split

In [None]:
word2Vec = Word2Vec(vectorSize=50, seed=42, inputCol="comment_text", outputCol="features_w2vec")
word2Vec.setMinCount(10)
word2Vec.setMaxIter(10)
auc_av_reg = 0
auc_av_svm = 0
train_set = train_set.withColumn("comment_text", split("comment_text", "\s+"))
val_set = val_set.withColumn("comment_text", split("comment_text", "\s+"))
for label in ['toxic', 'severe_toxic', 'obscene','threat','insult','identity_hate']:
    log_reg=LogisticRegression(featuresCol='features_w2ve', labelCol=label)
    svm = LinearSVC(featuresCol='features_w2ve',labelCol=label)
    pipeline = Pipeline(stages=[word2Vec, log_reg])
    pipeline2 = Pipeline(stages=[word2Vec, svm])
    
    pipelineFit_reg = pipeline.fit(train_set)
    pipelineFit_svm = pipeline2.fit(train_set)
    val_reg = pipelineFit_reg.transform(val_set)
    val_svm = pipelineFit_svm.transform(val_set)
    comment_eval = BinaryClassificationEvaluator(rawPredictionCol='prediction', labelCol=label)
    
    auc_av_reg += comment_eval.evaluate(val_reg) / 6
    auc_av_svm += comment_eval.evaluate(val_svm) / 6
print('average logreg auc for', i*nFeatures, 'features:', auc_av_reg)
print('average svm auc for', i*nFeatures, 'features:', auc_av_svm)

In [128]:
train_set.select('comment_text').show()

+--------------------+
|        comment_text|
+--------------------+
|[D'aww! He matche...|
|[Hey man, I'm rea...|
|[You, sir, are my...|
|[COCKSUCKER BEFOR...|
|[Your vandalism t...|
|[Sorry if the wor...|
|[alignment on thi...|
|[Oh, and the girl...|
|[REDIRECT Talk:Vo...|
|[The Mitsurugi po...|
|[There's no need ...|
|[Yes, because the...|
|[pretty much ever...|
|[FUCK YOUR FILTHY...|
|[You had a point,...|
|[In other words, ...|
|[GET FUCKED UP. G...|
|[Are you threaten...|
|[Thanks! Undeleti...|
|[Stupid peace of ...|
+--------------------+
only showing top 20 rows

