In [2]:
!pip install pyspark

[0m

In [3]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext
from pyspark.sql import SparkSession
import pyspark.sql.functions as F
import pyspark.sql.types as T
from pyspark.ml.feature import Tokenizer, HashingTF, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.tuning import TrainValidationSplit, ParamGridBuilder
from pyspark.ml.feature import Word2Vec

In [4]:
!unzip /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip

Archive:  /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv.zip
  inflating: train.csv               


In [5]:
# Create (or reuse) a local Spark session named "NLP" on master "local[5]",
# then raise the log threshold to ERROR to keep cell output readable.
spark = (
    SparkSession.builder
    .master("local[5]")
    .appName("NLP")
    .getOrCreate()
)
spark.sparkContext.setLogLevel('ERROR')

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/12 07:54:34 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Load the extracted training CSV and replace every missing value with an
# empty string in one chained expression.
data = pd.read_csv("/kaggle/working/train.csv").fillna("")

In [7]:
# Fixed random_state so the shuffled split, and every score derived from it,
# is repeatable across kernel restarts.
data_train, data_test = train_test_split(data, shuffle=True, random_state=42)

In [8]:
# Convert both pandas splits into Spark DataFrames (train first, then test).
train, test = (
    spark.createDataFrame(split) for split in (data_train, data_test)
)

In [9]:
# Target columns: every column of `train` except the row id and the raw text.
out_cols = [col for col in train.columns if col not in ("id", "comment_text")]

In [12]:
# Tokenise `comment_text` into a new `words` column for both splits.
tokenizer = Tokenizer(outputCol="words", inputCol="comment_text")
train_w, test_w = (tokenizer.transform(split) for split in (train, test))

In [30]:
def train_test_idf(features: int = 262144, max_iter: int = 50):
    """Fit one TF-IDF + logistic-regression classifier per label and score it.

    Reads the module-level ``train_w`` / ``test_w`` frames (which carry the
    ``words`` column) and ``out_cols`` (the label column names).

    Args:
        features: Value passed to ``HashingTF`` as ``numFeatures``.
        max_iter: Value passed to every ``LogisticRegression`` as ``maxIter``.

    Returns:
        dict mapping each label column name to the value returned by
        ``BinaryClassificationEvaluator.evaluate`` on the test predictions
        (the evaluator's default metric is used).
    """
    hashing_tf = HashingTF(inputCol="words", outputCol="rawFeatures", numFeatures=features)
    train_tf = hashing_tf.transform(train_w)
    test_tf = hashing_tf.transform(test_w)

    # IDF weights are fitted on the training split only, then applied to both.
    idf_model = IDF(inputCol="rawFeatures", outputCol="features").fit(train_tf)

    # The same feature frames feed every per-label fit and evaluation below,
    # so cache them instead of recomputing the hashing + IDF pipeline each time.
    train_tfidf = idf_model.transform(train_tf).cache()
    test_tfidf = idf_model.transform(test_tf).cache()

    scores = {}
    try:
        for label in out_cols:
            classifier = LogisticRegression(featuresCol="features", labelCol=label, maxIter=max_iter)
            predictions = classifier.fit(train_tfidf).transform(test_tfidf)
            scores[label] = BinaryClassificationEvaluator(labelCol=label).evaluate(predictions)
    finally:
        # Release the cached frames even if a fit fails, so repeated calls
        # with different `features` do not accumulate cached data.
        train_tfidf.unpersist()
        test_tfidf.unpersist()
    return scores

In [31]:
# Run the TF-IDF experiment for each hashing dimensionality, keyed by that size.
res = {n_features: train_test_idf(n_features) for n_features in [500, 1000, 2000]}

                                                                                

In [32]:
# Print the per-label scores grouped by numFeatures, with a blank line
# between groups.
for n_features, label_scores in res.items():
    print("numFeatures = ", n_features)
    for label, score in label_scores.items():
        print("\t" + label + " = ", score)
    print()

numFeatures =  500
	toxic =  0.8577864971383122
	severe_toxic =  0.9049685880855972
	obscene =  0.8843409207517552
	threat =  0.8882936858813885
	insult =  0.8840253232513319
	identity_hate =  0.8283828449488723

numFeatures =  1000
	toxic =  0.8751613615001697
	severe_toxic =  0.9021997066359719
	obscene =  0.8931503878701298
	threat =  0.8774360012357657
	insult =  0.890487154001317
	identity_hate =  0.8297505009502917

numFeatures =  2000
	toxic =  0.8900541918847275
	severe_toxic =  0.8938842688842703
	obscene =  0.8997107206891362
	threat =  0.8557841246844003
	insult =  0.9014479775180567
	identity_hate =  0.8294840380295881



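Можно заметить, что при увеличении numFeatures качество растёт для классов toxic, obscene и insult: чем больше размерность хеш-пространства, тем меньше коллизий между словами и тем больше информации о тексте сохраняется в признаках. Однако для severe_toxic и threat качество при этом снижается, а для identity_hate почти не меняется, поэтому улучшение наблюдается не для всех классов.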

In [33]:
def get_res_w2v(seed: int = 42, max_iter: int = 50):
    """Fit one Word2Vec + logistic-regression classifier per label and score it.

    Reads the module-level ``train_w`` / ``test_w`` frames (which carry the
    ``words`` column) and ``out_cols`` (the label column names).

    Args:
        seed: Seed passed to ``Word2Vec`` so the learned embeddings are
            repeatable between runs.
        max_iter: Value passed to every ``LogisticRegression`` as ``maxIter``.

    Returns:
        dict mapping each label column name to the value returned by
        ``BinaryClassificationEvaluator.evaluate`` on the test predictions
        (the evaluator's default metric is used).
    """
    # The embedding model is fitted on the training split only.
    w2v_model = Word2Vec(inputCol="words", outputCol="w2v_features", seed=seed).fit(train_w)

    # The same embedded frames feed every per-label fit and evaluation below,
    # so cache them instead of re-running the Word2Vec transform each time.
    train_vec = w2v_model.transform(train_w).cache()
    test_vec = w2v_model.transform(test_w).cache()

    scores = {}
    try:
        for label in out_cols:
            classifier = LogisticRegression(featuresCol="w2v_features", labelCol=label, maxIter=max_iter)
            predictions = classifier.fit(train_vec).transform(test_vec)
            scores[label] = BinaryClassificationEvaluator(labelCol=label).evaluate(predictions)
    finally:
        # Release the cached frames even if a fit fails.
        train_vec.unpersist()
        test_vec.unpersist()
    return scores

In [34]:
# Note: this rebinds `res`, replacing the per-numFeatures TF-IDF results dict
# built earlier with the flat {label: score} dict returned for Word2Vec.
res = get_res_w2v()

                                                                                

In [35]:
# Print one "label = score" line per label column.
for label, score in res.items():
    print(label + " = ", score)

toxic =  0.9417544215801974
severe_toxic =  0.9663468418795814
obscene =  0.9504943789609587
threat =  0.9611918737815482
insult =  0.9493444513444956
identity_hate =  0.9308283200905938


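Результаты Word2Vec лучше, чем у TF-IDF, по всем классам. Это связано с тем, что Word2Vec — более сложная модель, которая лучше отражает контекст сообщений. При этом стоит учитывать, что TF-IDF здесь проверялся только при небольших значениях numFeatures (до 2000).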