## 1. Install Libraries

In [1]:
"""
STILL NEED TO ADD VERSIONS (AFTER FILE IS CONFIRMED)
"""
%pip install spacy
%pip install pyarrow
%pip install textblob
%pip install textstat
!python -m spacy download en_core_web_sm

Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.
Defaulting to user installation because normal site-packages is not writeable
Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m65.2 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h[38;5;2m✔ Download and i

## 2. Import Libraries

In [2]:
# OS environment
import os

# Import SparkConf class into program
from pyspark import SparkConf

# Import SparkContext and SparkSession classes
from pyspark import SparkContext # Spark
from pyspark.sql import SparkSession # Spark SQL

# PySpark Data Operations
from pyspark.sql.functions import col, size, split, udf
from pyspark.sql import functions as F

# Regex
import re

# Numeric operations
import numpy as np

# Define custom schema (data types) for PySpark Dataframes
from pyspark.sql.types import StructType, StructField, IntegerType, FloatType, StringType

# spaCy model for natural language processing
import spacy

# Pandas
import pandas as pd

# Pscholinguistics
from textblob import TextBlob

# Readability features
import textstat



## 3. Function and Classes

### 3.1. clean_text function

In [3]:
# Define text cleaning function
def clean_text(text):
    
    """
    Clean the input text string by removing unwanted elements while keeping useful punctuation.

    Steps performed:
    - Convert non-ASCII quotes/aprostrophes with ASCII equivalents
    - Remove URLs (e.g. http://..., www...)
    - Remove Twitter-style mentions (@username) and hashtags (#hashtag)
    - Remove HTML entities (e.g. &nbsp;)
    - Remove emojis and non-ASCII characters
    - Normalize whitespace (convert multiple spaces/tabs/newlines into a single space)
    - Trim leading and trailing spaces

    Args:
        text (str or None): The input text to clean.

    Returns:
        str: A cleaned version of the input text. If input is None, returns an empty string.
    """
    
    if text is None:
        return ""
    
    # Replace curly quotes/apostrophes with ASCII equivalents
    replacements = {
        '“': '"', '”': '"',
        '‘': "'", '’': "'"
    }
    for curly, straight in replacements.items():
        text = text.replace(curly, straight)
    
    # Remove URLs
    text = re.sub(r'http\S+|www\.\S+', '', text)
    
    # Remove mentions and hashtags
    text = re.sub(r'@\w+|#\w+', '', text)
    
    # Remove HTMLs
    text = re.sub(r'&\w+;', '', text)
    
    # Remove emojis and other non-ASCII symbols
    text = re.sub(r'[^\x00-\x7F]+', '', text)
    
    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text)
    
    return text.strip()

### 3.2. FeaturesSpark Class

In [4]:
from pyspark.sql import functions as F

class FeaturesSpark:
    """
    Features that can be computed efficiently using PySpark.
    """

    def __init__(self):
        pass

    def transform(self, df, text_col):
        txt = F.coalesce(F.col(text_col), F.lit(""))

        # Capital letters
        df = df.withColumn("num_capital_letters", F.length(F.regexp_replace(txt, r"[^A-Z]", "")))

        # Word count
        df = df.withColumn("num_words", F.size(F.split(txt, r"\s+")))

        # Sentence count
        df = df.withColumn("num_sentences", F.size(F.split(txt, r"[.!?]+")))

        # Words per sentence
        df = df.withColumn(
            "words_per_sentence", 
            F.when(F.col("num_sentences") > 0, F.col("num_words") / F.col("num_sentences"))
             .otherwise(F.lit(0))
        )

        # Short sentences (<10 words)
        df = df.withColumn(
            "num_short_sentences", 
            F.size(F.expr(f"filter(split({text_col}, '[.!?]+'), x -> size(split(x, ' ')) < 10)"))
        )

        # Long sentences (>=20 words)
        df = df.withColumn(
            "num_long_sentences", 
            F.size(F.expr(f"filter(split({text_col}, '[.!?]+'), x -> size(split(x, ' ')) >= 20)"))
        )

        # Special characters
        df = df.withColumn("num_special_characters", F.length(F.regexp_replace(txt, r"[a-zA-Z0-9\s]", "")))

        # Count articles (a, an, the)
        df = df.withColumn(
            "num_articles",
            F.size(F.expr(f"filter(split(lower({text_col}), '\\s+'), x -> x IN ('a', 'an', 'the'))"))
        )

        return df


### 3.3. POSFeatures Class

In [5]:
from pyspark.sql.functions import col
from pyspark.sql.types import IntegerType, DoubleType
from pyspark.sql.functions import udf
import spacy

class POSFeatures:
    def __init__(self, model="en_core_web_sm"):
        self.model = model

    def _count_pos(self, text, pos_tag):
        # load model lazily (cached per worker)
        if not hasattr(self, "_nlp"):
            self._nlp = spacy.load(self.model, disable=["ner", "parser"])
        doc = self._nlp(text)
        return sum(1 for token in doc if token.pos_ == pos_tag)

    def register_udfs(self, spark):
        return {
            "num_verbs": udf(lambda text: self._count_pos(text, "VERB"), IntegerType()),
            "num_adjectives": udf(lambda text: self._count_pos(text, "ADJ"), IntegerType()),
            "num_adverbs": udf(lambda text: self._count_pos(text, "ADV"), IntegerType()),
            "num_determiners": udf(lambda text: self._count_pos(text, "DET"), IntegerType()),
        }

    def transform(self, df, text_col):
        spark = df.sql_ctx.sparkSession
        udfs = self.register_udfs(spark)
        for col_name, func in udfs.items():
            df = df.withColumn(col_name, func(text_col))
        
        # Calculate rate_adj_adv = (num_adjectives + num_adverbs) / num_words
        df = df.withColumn(
            "rate_adj_adv", 
            (col("num_adjectives") + col("num_adverbs")) / col("num_words")
        )
        return df


### 3.4. ReadabilityIndices Class

In [6]:
class ReadabilityIndices:
    @staticmethod
    def extract_features(df, text_col):
        # Regular UDFs
        gf_udf = F.udf(lambda t: float(textstat.gunning_fog(t)) if t else None, FloatType())
        smog_udf = F.udf(lambda t: float(textstat.smog_index(t)) if t else None, FloatType())
        ari_udf = F.udf(lambda t: float(textstat.automated_readability_index(t)) if t else None, FloatType())
        syllables_udf = F.udf(lambda t: float(textstat.syllable_count(t)) if t else None, FloatType())

        return (df
            .withColumn("gunning_fog", gf_udf(F.col(text_col)))
            .withColumn("smog", smog_udf(F.col(text_col)))
            .withColumn("ari", ari_udf(F.col(text_col)))
            .withColumn("num_syllables", syllables_udf(F.col(text_col)))
        )


### 3.5. Psycholinguistics Class

In [7]:
class Psycholinguistics:
    @staticmethod
    def extract_features(df, text_col, title_col=None):
        # polarity
        def polarity_udf(text):
            if not text:
                return None
            return float(TextBlob(text).sentiment.polarity)
        
        # subjectivity
        def subjectivity_udf(text):
            if not text:
                return None
            return float(TextBlob(text).sentiment.subjectivity)
        
        # title similarity
        def title_similarity_udf(text, title):
            if not text or not title:
                return None
            text_words = set(text.lower().split())
            title_words = set(title.lower().split())
            if not text_words or not title_words:
                return 0
            return float(len(text_words & title_words) / len(text_words | title_words))

        df = df.withColumn("polarity", F.udf(polarity_udf, FloatType())(F.col(text_col)))
        df = df.withColumn("subjectivity", F.udf(subjectivity_udf, FloatType())(F.col(text_col)))
        if title_col:
            df = df.withColumn("title_similarity", F.udf(title_similarity_udf, FloatType())(
                F.col(text_col), F.col(title_col)
            ))
        else:
            df = df.withColumn("title_similarity", F.lit(None).cast(FloatType()))
        return df

## 4. Configure Spark Environment
Using the code snippets from tutorial 1 and 2, set up the Spark environment and configure the Spark Application using SparkConf.

In [8]:
spark_home = os.environ.get("SPARK_HOME")

if spark_home:
    print(f"SPARK_HOME: {spark_home}")
else:
    print("SPARK_HOME environement variable is not set.")

os.environ["SPARK_HOME"]= "/usr/local/lib/python3.10/dist-packages/pyspark"

print (f"SPARK_HOME is now set to: {os.environ.get('SPARK_HOME')}")

SPARK_HOME: /usr/local/lib/python3.8/dist-packages/pyspark
SPARK_HOME is now set to: /usr/local/lib/python3.10/dist-packages/pyspark


In [9]:
# local[*]: run Spark in local mode with as many working processors as logical cores on your machine
# If we want Spark to run locally with 'k' worker threads, we can specify as "local[k]".
master = "local[*]"
# The `appName` field is a name to be shown on the Spark cluster UI page
app_name = "WELFake Exploratory Data Anlaysis (EDA)"
# Setup configuration parameters for Spark
spark_conf = SparkConf().setMaster(master).setAppName(app_name)

# Setup SparkSession
spark = SparkSession.builder.config(conf=spark_conf).getOrCreate()
sc = spark.sparkContext
sc.setLogLevel('ERROR')

25/08/20 02:36:05 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Using Spark's default log4j profile: org/apache/spark/log4j-defaults.properties
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).


## 5. Load datasets


In [10]:
# Load dataset into Spark dataframe
welfake_df = spark.read.csv(
    "data/WELFake_Dataset.csv",
    header=True,
    inferSchema=True,
    quote='"', 
    multiLine=True, #multilines in text and title data
    escape='"'
)

# Display sample rows
welfake_df.show(3)

                                                                                

+---+--------------------+--------------------+-----+
|_c0|               title|                text|label|
+---+--------------------+--------------------+-----+
|  0|LAW ENFORCEMENT O...|No comment is exp...|    1|
|  1|                null|Did they post the...|    1|
|  2|UNBELIEVABLE! OBA...| Now, most of the...|    1|
+---+--------------------+--------------------+-----+
only showing top 3 rows



In [11]:
# Rename first column as index
welfake_df = welfake_df.withColumnRenamed("_c0", "index")

# Show dataframe dimensions
num_rows = welfake_df.count()
num_cols = len(welfake_df.columns)

print(f"Rows: {num_rows}")
print(f"Columns: {num_cols}")

#Print the Schema
welfake_df.printSchema()

[Stage 3:>                                                          (0 + 1) / 1]

Rows: 72134
Columns: 4
root
 |-- index: integer (nullable = true)
 |-- title: string (nullable = true)
 |-- text: string (nullable = true)
 |-- label: integer (nullable = true)



                                                                                

## 6. Remove duplicate


In [12]:
# Count original dataset rows
original_count = welfake_df.count()

# Remove duplicate news articles
welfake_df_dedup = welfake_df.dropDuplicates(["title", "text"])

deduped_count = welfake_df_dedup.count()
duplicates_removed = original_count - deduped_count

print(f"Original rows: {original_count}")
print(f"Duplicates removed: {duplicates_removed}")
print(f"After dataset size: {deduped_count} rows")



Original rows: 72134
Duplicates removed: 8456
After dataset size: 63678 rows


                                                                                

## 7. Clean title and article texts

In [13]:
# Register udf to pyspark
clean_text_udf = udf(clean_text, StringType())

In [14]:
# Apply cleaning to title and text
welfake_df_clean = welfake_df_dedup.withColumn("cleaned_title", clean_text_udf("title")) \
                       .withColumn("cleaned_text", clean_text_udf("text"))

# Preview results
welfake_df_clean.select("title", "cleaned_title", "text", "cleaned_text").show(5, truncate=80)

[Stage 10:>                                                         (0 + 1) / 1]

+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|                                                                           title|                                                                   cleaned_title|                                                                            text|                                                                    cleaned_text|
+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+--------------------------------------------------------------------------------+
|Credit Suisse Boss Fa

                                                                                

## 8. Remove null and empty string values

In [15]:
# Remove null or empty string values
welfake_df_processed = welfake_df_clean.filter(
    (col("cleaned_text").isNotNull()) & 
    (col("cleaned_text") != "") &
    (col("cleaned_title").isNotNull()) & 
    (col("cleaned_title") != "") &
    (col("label").isNotNull()) 
)

# Count the number of rows with empty values removed
clean_count = welfake_df_clean.count()
processed_count = welfake_df_processed.count()
removed_empty = clean_count - processed_count

print(f"Removed empty text rows: {removed_empty}")
print(f"After dataset size: {processed_count} rows")



Removed empty text rows: 1186
After dataset size: 62492 rows


                                                                                

## 9. Remove outlier based on text word count

### 9.1. Calculate article text word count

In [16]:
# Calculate text word count
welfake_df_wc = welfake_df_processed.withColumn("text_wc", size(split(col("cleaned_text"), "\\s+")))

welfake_df_wc.select("cleaned_text", "text_wc").show(3)

[Stage 18:>                                                         (0 + 1) / 1]

+--------------------+-------+
|        cleaned_text|text_wc|
+--------------------+-------+
|When Tidjane Thia...|   1496|
|ROCKVILLE, Md. (R...|    841|
|This post was ori...|    754|
+--------------------+-------+
only showing top 3 rows



Traceback (most recent call last):                                              
  File "/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 186, in manager
  File "/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/daemon.py", line 74, in worker
  File "/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/worker.py", line 642, in main
    if read_int(infile) == SpecialLengths.END_OF_STREAM:
  File "/usr/local/lib/python3.10/dist-packages/pyspark/python/lib/pyspark.zip/pyspark/serializers.py", line 595, in read_int
    raise EOFError
EOFError


### 9.2. Remove outlier based on percentile values

In [17]:
# Calculate key percentiles for text word count
percentiles_upper_tail = [0.96, 0.97, 0.98, 0.99]
percentiles_lower_tail = [0.01, 0.02, 0.03, 0.04]

# Compute percentiles
upper_tail_quantiles = welfake_df_wc.approxQuantile("text_wc", percentiles_upper_tail, 0.01)
lower_tail_quantiles = welfake_df_wc.approxQuantile("text_wc", percentiles_lower_tail, 0.01)

# Show quantile values for analysis
print(f"Upper tail (96% to 99%): {upper_tail_quantiles}")
print(f"Lower tail (1% to 4%): {lower_tail_quantiles}")



Upper tail (96% to 99%): [1406.0, 1514.0, 1683.0, 24173.0]
Lower tail (1% to 4%): [1.0, 26.0, 39.0, 54.0]


                                                                                

In [18]:
# Calculate 2nd and 98th percentiles
lower_bound, upper_bound = welfake_df_wc.approxQuantile("text_wc", [0.02, 0.98], 0.01)

print(f"Filter out text_wc < {lower_bound} or > {upper_bound}\n")

# Filter out values below the 2nd and above the 98th percentiles
welfake_df_filtered = welfake_df_wc.filter(
    (F.col("text_wc") > lower_bound) & (F.col("text_wc") < upper_bound)
)

# Count the number of rows with empty values removed
outlier_count = welfake_df_filtered.count()
removed_outlier = processed_count - outlier_count

print(f"Removed outlier text rows: {removed_outlier}")
print(f"After dataset size: {outlier_count} rows")

                                                                                

Filter out text_wc < 26.0 or > 1683.0





Removed outlier text rows: 2851
After dataset size: 59641 rows


                                                                                

## 10. Feature Engineering

### 10.1. Create quantity feature columns using FeaturesSpark()

In [19]:
# Initialise FeaturesSpark
feat_spark = FeaturesSpark()

# Create quantity feature columns
welfake_df_feat_spark = feat_spark.transform(df=welfake_df_filtered, text_col="cleaned_text")

In [20]:
# Preview quanitty feature columns
welfake_df_feat_spark.select(
    "cleaned_title",
    "cleaned_text",
    "num_special_characters",
    "num_capital_letters",
    "num_words",
    "num_sentences",
    "words_per_sentence",
    "num_short_sentences",
    "num_long_sentences",
    "num_articles",
    "label"
).show(5)

[Stage 32:>                                                         (0 + 1) / 1]

+--------------------+--------------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+------------+-----+
|       cleaned_title|        cleaned_text|num_special_characters|num_capital_letters|num_words|num_sentences|words_per_sentence|num_short_sentences|num_long_sentences|num_articles|label|
+--------------------+--------------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+------------+-----+
|Credit Suisse Bos...|When Tidjane Thia...|                   213|                236|     1496|           91|16.439560439560438|                 23|                38|           0|    0|
|Angry and inspire...|ROCKVILLE, Md. (R...|                   143|                124|      841|           35| 24.02857142857143|                  6|                23|           0|    0|
|Russian Economy M...|This post was ori...|                 

                                                                                

### 10.2. Create POS feature columns using POSFeatures()

In [21]:
# Initialise POSFeatures
pos_features = POSFeatures()

# Create POS feature columns
welfake_df_pos_feat = pos_features.transform(df=welfake_df_feat_spark, text_col="cleaned_text")

In [22]:
# Preview POS feature columns
welfake_df_pos_feat.select(
    "cleaned_title",
    "cleaned_text",
    "num_verbs",
    "num_adjectives",
    "num_adverbs",
    "num_determiners",
     "rate_adj_adv",
    "label"
).show(5)

[Stage 35:>                                                         (0 + 1) / 1]

+--------------------+--------------------+---------+--------------+-----------+---------------+-------------------+-----+
|       cleaned_title|        cleaned_text|num_verbs|num_adjectives|num_adverbs|num_determiners|       rate_adj_adv|label|
+--------------------+--------------------+---------+--------------+-----------+---------------+-------------------+-----+
|Credit Suisse Bos...|When Tidjane Thia...|      157|           126|         60|            143|0.12433155080213903|    0|
|Angry and inspire...|ROCKVILLE, Md. (R...|      117|            78|         34|             73|0.13317479191438764|    0|
|Russian Economy M...|This post was ori...|       88|            44|         11|             90|0.07294429708222812|    1|
|HOUSE SPEAKER PAU...|HOUSE SPEAKER PAU...|       14|             7|         13|             11|0.14084507042253522|    1|
|(AUDIO) RACIST BL...|What is it that t...|       31|            11|          2|             16|0.06372549019607843|    1|
+---------------

                                                                                

### 10.3. Create Readability features using ReadabilityIndices()

In [23]:
# Initialise ReadabilityIndices
readability = ReadabilityIndices()

# Create readability feature columns
welfake_df_readability = readability.extract_features(welfake_df_pos_feat, "cleaned_text")

In [24]:
# Preview readability columns
welfake_df_readability.select(
    "cleaned_title",
    "cleaned_text",
    "gunning_fog",
    "smog",
    "ari",
    "num_syllables",
    "rate_adj_adv",
    "label"
).show(5)


[Stage 37:>                                                         (0 + 1) / 1]

+--------------------+--------------------+-----------+---------+----------+-------------+-------------------+-----+
|       cleaned_title|        cleaned_text|gunning_fog|     smog|       ari|num_syllables|       rate_adj_adv|label|
+--------------------+--------------------+-----------+---------+----------+-------------+-------------------+-----+
|Credit Suisse Bos...|When Tidjane Thia...|  12.285737|12.139135|10.7932825|       2338.0|0.12433155080213903|    0|
|Angry and inspire...|ROCKVILLE, Md. (R...|   16.68025|15.035415| 16.468256|       1388.0|0.13317479191438764|    0|
|Russian Economy M...|This post was ori...|  15.220309|14.512509| 13.890554|       1227.0|0.07294429708222812|    1|
|HOUSE SPEAKER PAU...|HOUSE SPEAKER PAU...|  10.571428|10.290406|  8.504578|        200.0|0.14084507042253522|    1|
|(AUDIO) RACIST BL...|What is it that t...|   9.937255|11.038039|  8.357353|        304.0|0.06372549019607843|    1|
+--------------------+--------------------+-----------+---------

                                                                                

### 10.4. Create psycholinguistics features using Psycholinguistics()

In [25]:
# Create psycholinguistics feature columns
welfake_df_psycho = Psycholinguistics.extract_features(
    df=welfake_df_readability,
    text_col="cleaned_text",
    title_col="cleaned_title"
)

In [26]:
# Preview psycholinguistics columns
welfake_df_psycho.select(
    "cleaned_text",
    "polarity",
    "subjectivity",
    "title_similarity",
    "label"
).show(5)


[Stage 39:>                                                         (0 + 1) / 1]

+--------------------+------------+------------+----------------+-----+
|        cleaned_text|    polarity|subjectivity|title_similarity|label|
+--------------------+------------+------------+----------------+-----+
|When Tidjane Thia...|  0.05515215|  0.40515655|     0.013157895|    0|
|ROCKVILLE, Md. (R...|  0.17139433|  0.39709178|     0.014989293|    0|
|This post was ori...| 0.056074135|   0.4292704|     0.028423773|    1|
|HOUSE SPEAKER PAU...|  0.07604167|  0.40208334|     0.054945055|    1|
|What is it that t...|-0.028863637|  0.56545454|     0.059322033|    1|
+--------------------+------------+------------+----------------+-----+
only showing top 5 rows



                                                                                

### 10.5. Extract engineered features

In [28]:
# Extract feature columns for machine learning
welfake_df_preprocessed = welfake_df_psycho.select(
    "cleaned_title",
    "cleaned_text",
    "num_special_characters",
    "num_capital_letters",
    "num_words",
    "num_sentences",
    "words_per_sentence",
    "num_short_sentences",
    "num_long_sentences",
    "num_articles",
    "num_verbs",
    "num_adjectives",
    "num_adverbs",
    "rate_adj_adv",
    "num_determiners",
    "gunning_fog",
    "smog",
    "ari",
    "num_syllables",
    "polarity",
    "subjectivity",
    "title_similarity",
    "label"
)

# Cache results
welfake_df_preprocessed.cache()

DataFrame[cleaned_title: string, cleaned_text: string, num_special_characters: int, num_capital_letters: int, num_words: int, num_sentences: int, words_per_sentence: double, num_short_sentences: int, num_long_sentences: int, num_articles: int, num_verbs: int, num_adjectives: int, num_adverbs: int, rate_adj_adv: double, num_determiners: int, gunning_fog: float, smog: float, ari: float, num_syllables: float, polarity: float, subjectivity: float, title_similarity: float, label: int]

In [29]:
# Preview preprocessed data
welfake_df_preprocessed.show(5)

[Stage 41:>                                                         (0 + 1) / 1]

+--------------------+--------------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+------------+---------+--------------+-----------+-------------------+---------------+-----------+---------+----------+-------------+------------+------------+----------------+-----+
|       cleaned_title|        cleaned_text|num_special_characters|num_capital_letters|num_words|num_sentences|words_per_sentence|num_short_sentences|num_long_sentences|num_articles|num_verbs|num_adjectives|num_adverbs|       rate_adj_adv|num_determiners|gunning_fog|     smog|       ari|num_syllables|    polarity|subjectivity|title_similarity|label|
+--------------------+--------------------+----------------------+-------------------+---------+-------------+------------------+-------------------+------------------+------------+---------+--------------+-----------+-------------------+---------------+-----------+---------+----------+-----------

                                                                                

## 11. Feature Selection