### Value Counts of Dataset

In [1]:
import pandas as pd

# Load the source dataset (HateSpeechDatasetBalanced.csv) into a DataFrame.
# The path is relative to the notebook's working directory.
source_csv = '../data/HateSpeechDatasetBalanced.csv'
df = pd.read_csv(source_csv)

In [2]:
# Count how many rows carry each distinct value of the 'Label' column.
# The bare name on the last line is what the cell displays.
label_counts = df['Label'].value_counts()
label_counts

Label
1    364525
0    361594
Name: count, dtype: int64

### Create the training and validation splits for the model

In [3]:
# Carve the dataset into one validation file and two training files.
# Each class is sliced by position within its own subset, so every output
# file receives the same row range from label 0 and from label 1:
#   [0, 50000)        -> validation
#   [50000, 250000)   -> training part 1
#   [250000, end)     -> training part 2
#
# Fix: the label-1 validation slice used to stop at 5000 while the label-0
# slice stopped at 50000 and both training slices start at 50000. That gave
# the validation set ten times fewer label-1 rows than label-0 rows and left
# label-1 rows 5000..49999 out of every output file.
VALIDATION_END = 50000
TRAIN1_END = 250000
SPLIT_SEED = 42  # every shuffle below uses the same seed

# Per-class subsets, computed once and reused for all three ranges.
df_label_0 = df[df['Label'] == 0]
df_label_1 = df[df['Label'] == 1]

df_label_0_validation = df_label_0.iloc[:VALIDATION_END]
df_label_1_validation = df_label_1.iloc[:VALIDATION_END]

df_validation = pd.concat([df_label_0_validation, df_label_1_validation])

df_label_0_train1 = df_label_0.iloc[VALIDATION_END:TRAIN1_END]
df_label_1_train1 = df_label_1.iloc[VALIDATION_END:TRAIN1_END]

df_train1 = pd.concat([df_label_0_train1, df_label_1_train1])

df_label_0_train2 = df_label_0.iloc[TRAIN1_END:]
df_label_1_train2 = df_label_1.iloc[TRAIN1_END:]

df_train2 = pd.concat([df_label_0_train2, df_label_1_train2])

# Shuffle each split (concat puts all label-0 rows before all label-1 rows)
# and replace the original row labels with a fresh 0..n-1 index.
df_validation = df_validation.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df_train1 = df_train1.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)
df_train2 = df_train2.sample(frac=1, random_state=SPLIT_SEED).reset_index(drop=True)

# Persist the splits; index=False keeps the pandas index out of the files.
df_validation.to_csv('../data/validation_dataset.csv', index=False)
df_train1.to_csv('../data/train_dataset_1.csv', index=False)
df_train2.to_csv('../data/train_dataset_2.csv', index=False)

In [1]:
import pandas as pd

# Read both training parts back from disk, in order (part 1, then part 2).
train_part_paths = ('../data/train_dataset_1.csv', '../data/train_dataset_2.csv')
df_train1, df_train2 = map(pd.read_csv, train_part_paths)

In [2]:
# Merge both training parts into a single training set.
# ignore_index=True gives the result a fresh 0..n-1 index; each part was read
# from its own CSV and starts at 0, so a plain concat would leave duplicate
# index labels.
df_train = pd.concat([df_train1, df_train2], ignore_index=True)

# index=False keeps the pandas index out of the file. Without it the CSV gains
# an extra unnamed first column, unlike the other exported datasets
# (validation_dataset.csv, train_dataset_1.csv, train_dataset_2.csv).
df_train.to_csv('../data/train_dataset.csv', index=False)

In [1]:
import nltk
# Download the NLTK 'wordnet' and 'omw-1.4' packages — presumably the data
# that WordNetLemmatizer (used later in this notebook) relies on.
# The value of the last call is what the cell displays.
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to /home/amamylov/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/amamylov/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [2]:
import nltk
from nltk.stem import WordNetLemmatizer
from pyspark.sql import SparkSession
from pyspark.ml.feature import Tokenizer, StopWordsRemover, HashingTF, IDF
from pyspark.sql.functions import col, udf
from pyspark.sql.types import ArrayType, StringType

# Single WordNet lemmatizer instance shared by every lemmatize_words call
lemmatizer = WordNetLemmatizer()


def lemmatize_words(words):
    """Return the lemma of every token in ``words``, in the same order."""
    lemmas = []
    for token in words:
        lemmas.append(lemmatizer.lemmatize(token))
    return lemmas


# Spark UDF that applies lemmatize_words to an array-of-strings column
# and returns an array of strings
lemmatize_udf = udf(lemmatize_words, ArrayType(StringType()))

# Step 1: create (or reuse) the Spark session with explicit resource settings.
#
# The legacy "spark.storage.memoryFraction" option was removed from this chain:
# Spark only reads it in legacy memory-management mode, so under the unified
# memory manager it had no effect and merely suggested a tuning that was not
# applied. "spark.memory.fraction" is the setting that is actually honoured.
spark = (
    SparkSession.builder
    .appName("Text Preprocessing with Lemmatization in PySpark")
    .config("spark.executor.memory", "2g")
    .config("spark.driver.memory", "4g")
    .config("spark.executor.cores", "2")
    .config("spark.executor.instances", "2")
    .config("spark.sql.shuffle.partitions", "8")
    .config("spark.memory.fraction", "0.8")
    .getOrCreate()
)

# Step 2: load the training data from CSV (first row is the header; column
# types are inferred by Spark).
data = spark.read.csv("../data/train_dataset.csv", header=True, inferSchema=True)

# Step 3: keep only the text and label columns and drop rows where either is null.
data = data.select("Content", "Label").na.drop()

# Step 4: tokenize the text column into a "words" array column.
tokenizer = Tokenizer(inputCol="Content", outputCol="words")
words_data = tokenizer.transform(data)

# Step 5: remove stop words (StopWordsRemover with its default word list).
remover = StopWordsRemover(inputCol="words", outputCol="filtered_words")
filtered_data = remover.transform(words_data)

# Step 6: lemmatize the remaining tokens with the Python UDF.
lemmatized_data = filtered_data.withColumn("lemmatized_words", lemmatize_udf(col("filtered_words")))

# Step 7: TF-IDF.
# TF (term frequency): hash the tokens into a 10000-dimensional count vector.
hashing_tf = HashingTF(inputCol="lemmatized_words", outputCol="raw_features", numFeatures=10000)
featurized_data = hashing_tf.transform(lemmatized_data)

# IDF (inverse document frequency): fit on the same data, then rescale the counts.
idf = IDF(inputCol="raw_features", outputCol="features")
idf_model = idf.fit(featurized_data)
rescaled_data = idf_model.transform(featurized_data)

# Step 8: keep only the columns needed for model training.
final_data = rescaled_data.select("features", "Label")

# Persist the prepared features.
# mode("overwrite") replaces the output of a previous run; the writer's default
# save mode raises an error when the target path already exists, which made
# this cell fail on every re-run of the notebook.
final_data.write.mode("overwrite").parquet("../data/prepared_data.parquet")

24/09/18 20:44:32 WARN Utils: Your hostname, amamylov-pc resolves to a loopback address: 127.0.1.1; using 192.168.100.7 instead (on interface wlp0s20f3)
24/09/18 20:44:32 WARN Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
24/09/18 20:44:33 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
                                                                                

In [3]:
from pyspark.ml.classification import RandomForestClassifier
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml.feature import StringIndexer
from pyspark.ml.tuning import ParamGridBuilder, CrossValidator
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Step 1: build the numeric label column expected by the classifier.
#
# stringOrderType="alphabetAsc" pins the mapping by label value (smallest label
# -> 0.0, next -> 1.0). The default ordering ranks labels by descending
# frequency, so the more frequent class would get index 0.0 and the mapping
# could flip whenever the class counts change.
#
# The result is stored under a new name instead of overwriting final_data, so
# this cell can be re-run: transforming a frame that already contains
# "indexedLabel" fails because the output column exists.
indexer = StringIndexer(
    inputCol="Label",
    outputCol="indexedLabel",
    stringOrderType="alphabetAsc",
)
indexed_data = indexer.fit(final_data).transform(final_data)

# Step 2: 80/20 train/test split with a fixed seed.
train_data, test_data = indexed_data.randomSplit([0.8, 0.2], seed=42)

# Step 3: configure the random forest. The explicit seed makes the training
# run reproducible across kernel restarts.
rf = RandomForestClassifier(
    labelCol="indexedLabel",
    featuresCol="features",
    numTrees=100,
    maxDepth=10,
    seed=42,
)

# Step 4: train the model.
rf_model = rf.fit(train_data)

# Step 5: predict on the held-out test split.
predictions = rf_model.transform(test_data)

# Step 6: compute the metrics.
# F1 and weighted precision/recall share the same label/prediction columns.
multiclass_columns = dict(labelCol="indexedLabel", predictionCol="prediction")

evaluator_f1 = MulticlassClassificationEvaluator(metricName="f1", **multiclass_columns)
f1_score = evaluator_f1.evaluate(predictions)

evaluator_precision = MulticlassClassificationEvaluator(metricName="weightedPrecision", **multiclass_columns)
precision_score = evaluator_precision.evaluate(predictions)

evaluator_recall = MulticlassClassificationEvaluator(metricName="weightedRecall", **multiclass_columns)
recall_score = evaluator_recall.evaluate(predictions)

# ROC AUC is computed from the raw prediction scores.
evaluator_auc = BinaryClassificationEvaluator(labelCol="indexedLabel", rawPredictionCol="rawPrediction", metricName="areaUnderROC")
roc_auc_score = evaluator_auc.evaluate(predictions)

# Step 7: report the metrics.
print(f"F1 Score: {f1_score}")
print(f"Precision: {precision_score}")
print(f"Recall: {recall_score}")
print(f"ROC AUC: {roc_auc_score}")

24/09/18 20:45:17 WARN MemoryStore: Not enough space to cache rdd_66_12 in memory! (computed 130.6 MiB so far)
24/09/18 20:45:17 WARN MemoryStore: Not enough space to cache rdd_66_7 in memory! (computed 85.0 MiB so far)
24/09/18 20:45:17 WARN BlockManager: Persisting block rdd_66_12 to disk instead.
24/09/18 20:45:17 WARN BlockManager: Persisting block rdd_66_7 to disk instead.
24/09/18 20:45:17 WARN MemoryStore: Not enough space to cache rdd_66_5 in memory! (computed 85.0 MiB so far)
24/09/18 20:45:17 WARN BlockManager: Persisting block rdd_66_5 to disk instead.
24/09/18 20:45:17 WARN MemoryStore: Not enough space to cache rdd_66_13 in memory! (computed 85.0 MiB so far)
24/09/18 20:45:17 WARN BlockManager: Persisting block rdd_66_13 to disk instead.
24/09/18 20:45:17 WARN MemoryStore: Not enough space to cache rdd_66_11 in memory! (computed 85.0 MiB so far)
24/09/18 20:45:17 WARN BlockManager: Persisting block rdd_66_11 to disk instead.
24/09/18 20:45:17 WARN MemoryStore: Not enough s

24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_5 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_13 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_10 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_0 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_11 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_12 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_2 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_1 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:42 WARN MemoryStore: Not enough space to cache rdd_66_8 in memory! (computed 55.4 MiB so far)
24/09/18 20:45:

24/09/18 20:46:19 WARN DAGScheduler: Broadcasting large task binary with size 4.0 MiB
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_0 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_9 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_10 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_3 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_13 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_6 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_12 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: Not enough space to cache rdd_66_15 in memory! (computed 55.4 MiB so far)
24/09/18 20:46:19 WARN MemoryStore: No

F1 Score: 0.7081978381093227
Precision: 0.7521916153826167
Recall: 0.7183415375995518
ROC AUC: 0.8095759683574836


24/09/15 12:34:46 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
24/09/15 12:34:54 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
24/09/15 12:34:58 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
24/09/15 12:35:04 WARN DAGScheduler: Broadcasting large task binary with size 2.1 MiB
                                                                                

F1 Score: 0.7113088941020482
Precision: 0.7668876518779137
Recall: 0.7230267285732213
ROC AUC: 0.8171362246596771


In [None]:
# Stop the Spark session now that the notebook no longer needs it
spark.stop()