In [1]:
from pyspark import SparkContext, SparkConf
from pyspark.sql import SparkSession
from pyspark.ml.classification import RandomForestClassifier
from pyspark.sql.functions import explode, split, col
from pyspark.sql import SparkSession
from pyspark.sql.functions import size, split, col, lit
from pyspark.sql import SparkSession
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.evaluation import BinaryClassificationEvaluator
from pyspark.ml import Pipeline

# create the session
conf = SparkConf().set("spark.ui.port", "4050")

# create the context
sc = SparkContext(conf=conf)
spark = SparkSession.builder.getOrCreate()

In [4]:
# read local csv
df = spark.read.option("header", True).csv("two_towers_chapters_unique.csv")

# We use the information of the first question to create the features
characters = ["Sam", "Frodo", "Gollum", "Gandalf", "Orcs", "Aragorn"]

# Create a column for each character
for c in characters:
    df = df.withColumn(c, size(split(col("text"), rf"\b{c}\b")) - 1)

# count words
df = df.withColumn("word_count", size(split(col("text"), r"\s+")))

# Count sentences
df = df.withColumn("sentence_count", size(split(col("text"), r"[.!?]")))

# avg sentence length
df = df.withColumn("avg_sentence_length", col("word_count") / col("sentence_count"))

# Importance empty column because we are going to write oursefls if it is important or not
df = df.withColumn("importance", lit(None).cast("int"))

# wanted columns
final_columns = ["chapter_id", "chapter_title"] + characters + ["word_count", "sentence_count", "avg_sentence_length", "importance"]
df_final = df.select(final_columns)

# save csv
df_final.write.mode("overwrite").option("header", True).csv("two_towers_features_RF.csv")

Random Forest Training

In [6]:
# Charge CSV and csv has already the importance column changed to 0 or 1
df = spark.read.option("header", True).option("inferSchema", True).csv("chapters_RF.csv")

# Predictable values
features = ["Sam", "Frodo", "Gollum", "Gandalf", "Orcs", "Aragorn", "word_count", "sentence_count", "avg_sentence_length"]
assembler = VectorAssembler(inputCols=features, outputCol="features")

# Classifier
rf = RandomForestClassifier(
    labelCol="importance",
    featuresCol="features",
    numTrees=8,
    maxDepth=3,
    minInstancesPerNode=1,
    subsamplingRate=0.6,
    featureSubsetStrategy="log2"
)

# Pipeline
pipeline = Pipeline(stages=[assembler, rf])

# Train, validation and test split
train_data, val_data, test_data = df.randomSplit([0.7, 0.1, 0.2], seed=42)
# train
model = pipeline.fit(train_data)

# Validation
val_predictions = model.transform(val_data)
evaluator = BinaryClassificationEvaluator(labelCol="importance")
val_accuracy = evaluator.evaluate(val_predictions)

print(f"Accuracy on validation: {val_accuracy:.2f}")
val_predictions.select("chapter_title", "importance", "prediction", "probability").show(truncate=False)

# Test
test_predictions = model.transform(test_data)
test_accuracy = evaluator.evaluate(test_predictions)

print(f"Accuracy on test: {test_accuracy:.2f}")
test_predictions.select("chapter_title", "importance", "prediction", "probability").show(truncate=False)

Accuracy on validation: 0.50
+--------------------------+----------+----------+-------------+
|chapter_title             |importance|prediction|probability  |
+--------------------------+----------+----------+-------------+
|THE VOICE OF SARUMAN      |0         |0.0       |[1.0,0.0]    |
|OF HERBS AND STEWED RABBIT|1         |0.0       |[0.75,0.25]  |
|THE WINDOW ON THE WEST    |0         |0.0       |[0.625,0.375]|
+--------------------------+----------+----------+-------------+

Accuracy on test: 0.38
+------------------------+----------+----------+-------------+
|chapter_title           |importance|prediction|probability  |
+------------------------+----------+----------+-------------+
|THE URUK-HAI            |1         |0.0       |[0.625,0.375]|
|HELM’S DEEP             |0         |0.0       |[0.625,0.375]|
|F LOTSAM AND JETSAM     |0         |0.0       |[0.625,0.375]|
|THE BLACK GATE IS CLOSED|0         |0.0       |[0.625,0.375]|
|SHELOB’S LAIR           |0         |0.0       |[0.