# Быстрая навигация
* [Блок 1. HashingTF и IDF](#10)  
* [Блок 2. Word2Vec](#20)  

In [28]:
# One-time setup, intentionally left commented out. Uncomment only when PySpark is
# missing; prefer the %pip magic so the install targets the running kernel's environment:
# %pip install pyspark

In [3]:
import numpy as np
import pandas as pd

from pyspark.sql import SparkSession 

In [5]:
# Start a Spark session for this notebook, or attach to one that is already running.
spark = (
    SparkSession.builder
    .appName('text_classification')
    .getOrCreate()
)

Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


22/12/10 19:24:26 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


In [6]:
# Source file: /kaggle/input/jigsaw-toxic-comment-classification-challenge/train.csv
df = pd.read_csv(filepath_or_buffer='train.csv')
df.shape

(159571, 8)

In [7]:
# Class balance of the target column.
df['toxic'].value_counts()

0    144277
1     15294
Name: toxic, dtype: int64

In [8]:
# Keep only the comment text and the `toxic` label, replacing missing values with "".
df = df[['comment_text', 'toxic']].fillna("")
df.shape

(159571, 2)

In [9]:
from sklearn.model_selection import train_test_split

# Hold out 30% of the rows for evaluation; the fixed seed keeps the split repeatable.
df_train, df_test = train_test_split(df, random_state=42, test_size=0.3)
(df_train.shape, df_test.shape)

((111699, 2), (47872, 2))

In [10]:
# Hand both pandas splits over to Spark: the training part first, then the test part.
train, test = (spark.createDataFrame(split) for split in (df_train, df_test))

<a id="10"></a><h3 style='background:black; border:0; color:white'><center>Блок 1. HashingTF и IDF</center></h3>

In [11]:
from pyspark.ml.feature import Tokenizer, HashingTF, IDF

# Stage 1: turn every comment into a list of tokens.
tokenizer = Tokenizer(outputCol="words", inputCol="comment_text")
wordsData = tokenizer.transform(train)

# Stage 2: hash the tokens into a fixed-width (20000) term-frequency vector.
hashingTF = HashingTF(
    numFeatures=20000,
    inputCol="words",
    outputCol="rawFeatures",
)
tf = hashingTF.transform(wordsData)

# Stage 3: fit IDF weights on the training term frequencies, then apply them.
idf = IDF(outputCol="features", inputCol="rawFeatures")
idfModel = idf.fit(tf)
tfidf = idfModel.transform(tf)

22/12/10 19:24:39 WARN TaskSetManager: Stage 0 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [12]:
from pyspark.ml.classification import LogisticRegression

# Regularised logistic regression trained on the TF-IDF vectors to predict `toxic`.
lr = LogisticRegression(
    regParam=0.1,
    labelCol='toxic',
    featuresCol="features",
)
lrModel = lr.fit(tfidf)

22/12/10 19:24:46 WARN TaskSetManager: Stage 1 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:24:50 WARN TaskSetManager: Stage 2 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:24:54 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeSystemBLAS
22/12/10 19:24:54 WARN BLAS: Failed to load implementation from: com.github.fommil.netlib.NativeRefBLAS
22/12/10 19:24:54 WARN TaskSetManager: Stage 3 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:24:54 WARN TaskSetManager: Stage 4 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:24:55 WARN TaskSetManager: Stage 5 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:24:55 WARN TaskSetManager: Stage 6 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:24:55 WARN TaskSetManager: Stage 7 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:24:55 WARN TaskSetManager: Stage 8 contains a task of very large si

In [13]:
# Run the held-out comments through the stages fitted on the training data
# (same tokenizer, same hashing, IDF weights learned from train only).
# test_tokens is kept as its own variable because the Word2Vec block reuses it.
test_tokens = tokenizer.transform(test)
test_tf = hashingTF.transform(test_tokens)
test_tfidf = idfModel.transform(test_tf)

# Append the model's output columns (including `prediction`) to the test frame.
res = lrModel.transform(test_tfidf)

In [14]:
# Collect the scored test set to the driver as a pandas DataFrame; this brings back
# every column of `res`, not only the predictions.
df_res = res.toPandas()
df_res.shape

22/12/10 19:25:03 WARN TaskSetManager: Stage 45 contains a task of very large size (4286 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

(47872, 8)

In [15]:
from sklearn.metrics import f1_score, accuracy_score, recall_score

# Evaluate the TF-IDF + logistic-regression model on the hold-out set.
# The labels are read from the same collected frame as the predictions, so every
# label is paired with its own row's prediction; taking them from df_test instead
# would silently rely on Spark returning the rows in the original pandas order.
y_true = df_res.toxic.values
y_pred = df_res.prediction.values
f1 = f1_score(y_true, y_pred, average='macro')  # unweighted mean of the per-class F1 scores
ac = accuracy_score(y_true, y_pred)
re = recall_score(y_true, y_pred)  # default averaging: recall of the positive (toxic == 1) class

# The `.2` format spec prints two significant digits, not two decimal places.
print(f'Accuracy: {ac:.2}')
print(f'F1: {f1:.2}')
print(f'Recall: {re:.2}')

Accuracy: 0.91
F1: 0.59
Recall: 0.13


In [16]:
# Increasing the number of hashed features (numFeatures) improves the metrics:
# in the runs recorded below F1 and recall rise, while accuracy stays at 0.91.

# numFeatures - 1000
# Accuracy: 0.91
# F1: 0.49
# Recall: 0.015
    
# numFeatures - 2000
# Accuracy: 0.91
# F1: 0.50
# Recall: 0.022
    
# numFeatures - 5000
# Accuracy: 0.91
# F1: 0.53
# Recall: 0.058

# numFeatures - 20000
# Accuracy: 0.91
# F1: 0.59
# Recall: 0.13

<a id="20"></a><h3 style='background:black; border:0; color:white'><center>Блок 2. Word2Vec</center></h3>

In [22]:
from pyspark.ml.feature import Word2Vec

# Learn 10-dimensional word embeddings from the tokenized training comments and use
# them to build the `rawFeatures` column for every training row.
# Word2Vec training is stochastic, so the seed is pinned (same value as the
# train/test split's random_state) to keep re-runs of the notebook comparable.
# NOTE: despite its name, `w2v_tokenizer` is the Word2Vec estimator, not a tokenizer;
# it consumes the `words` column produced earlier by the Tokenizer.
w2v_tokenizer = Word2Vec(
    vectorSize=10,
    minCount=0,  # keep every token, including ones that occur only once
    seed=42,
    inputCol="words",
    outputCol="rawFeatures",
)
w2v = w2v_tokenizer.fit(wordsData)
w2v_tr = w2v.transform(wordsData)

22/12/10 19:37:35 WARN TaskSetManager: Stage 62 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:37:42 WARN TaskSetManager: Stage 64 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

In [23]:
# Same classifier settings as in Block 1, now trained on the Word2Vec vectors.
# This rebinds `lr` and `lrModel`, so the TF-IDF model is no longer reachable by name.
lr = LogisticRegression(
    regParam=0.1,
    labelCol='toxic',
    featuresCol="rawFeatures",
)
lrModel = lr.fit(w2v_tr)

22/12/10 19:40:34 WARN TaskSetManager: Stage 67 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:40:36 WARN TaskSetManager: Stage 68 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

22/12/10 19:40:39 WARN TaskSetManager: Stage 69 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:40:39 WARN TaskSetManager: Stage 70 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:40:39 WARN TaskSetManager: Stage 71 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:40:39 WARN TaskSetManager: Stage 72 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:40:39 WARN TaskSetManager: Stage 73 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:40:39 WARN TaskSetManager: Stage 74 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.
22/12/10 19:40:40 WARN TaskSetManager: Stage 75 contains a task of very large size (10938 KiB). The maximum recommended task size is 1000 KiB.

In [24]:
# Embed the tokenized test comments (test_tokens comes from the Block 1 test cell)
# with the Word2Vec model fitted on the training data, then score them.
test_w2v = w2v.transform(test_tokens)
res = lrModel.transform(test_w2v)

In [25]:
# Collect the Word2Vec-based predictions to pandas; this overwrites Block 1's df_res.
df_res = res.toPandas()
df_res.shape

22/12/10 19:40:43 WARN TaskSetManager: Stage 81 contains a task of very large size (4286 KiB). The maximum recommended task size is 1000 KiB.


                                                                                

(47872, 7)

In [26]:
# Evaluate the Word2Vec + logistic-regression model on the hold-out set.
# The labels are read from the same collected frame as the predictions, so every
# label is paired with its own row's prediction; taking them from df_test instead
# would silently rely on Spark returning the rows in the original pandas order.
y_true = df_res.toxic.values
y_pred = df_res.prediction.values
f1 = f1_score(y_true, y_pred, average='macro')  # unweighted mean of the per-class F1 scores
ac = accuracy_score(y_true, y_pred)
re = recall_score(y_true, y_pred)  # default averaging: recall of the positive (toxic == 1) class

# The `.2` format spec prints two significant digits, not two decimal places.
print(f'Accuracy: {ac:.2}')
print(f'F1: {f1:.2}')
print(f'Recall: {re:.2}')

Accuracy: 0.92
F1: 0.6
Recall: 0.14


In [29]:
# The metrics are slightly higher, but this depends on the vectorSize and numFeatures values
# chosen for the two methods: the higher they are, the better the metrics, but the laptop is not powerful enough to go further