## Language Detector Project

Dataset Source: https://www.kaggle.com/datasets/basilb2s/language-detection

##### Import Necessary Libraries

In [0]:
import pyspark
import pyspark.sql.functions as F
from pyspark.sql.types import StringType, StructType, StructField, IntegerType, DoubleType, FloatType, ArrayType

from pyspark.ml import Pipeline

from pyspark.ml.evaluation import MulticlassClassificationEvaluator

import sparknlp
from sparknlp.base import *
from sparknlp.annotator import *

##### Versions of Libraries, Modules, Frameworks Used in This Project

In [0]:
print("Apache Spark version:", spark.version)
print("Spark NLP version:", sparknlp.version())

Apache Spark version: 3.2.1
Spark NLP version: 4.2.5


#### Create Functions Used Throughout This Project

##### Create Function to Ingest Dataset

In [0]:
def ingest_dataset(file_location: str, \
                   schema: StructType \
                  ) -> pyspark.sql.dataframe.DataFrame:
    '''
    This function returns a dataset aligned with the schema
    '''
    file_type = "csv"
    infer_schema = "false"
    first_row_is_header = "true"
    delimiter = ","
    
    df = spark.read.format(file_type) \
      .option("inferSchema", infer_schema) \
      .option("header", first_row_is_header) \
      .option("sep", delimiter) \
      .schema(schema)\
      .load(file_location)
    
    return df

##### Function to Remove Class Imbalance

In [0]:
def balance_dataset(dataset: pyspark.sql.dataframe.DataFrame, \
                    unique_label_values: [str], \
                    new_schema: StructType, \
                    samples: int = 20000, \
                    label_col: str = "label" \
                   ) -> pyspark.sql.dataframe.DataFrame:
    '''
    This function removes the class imbalance.
    '''
    temp_df =  spark.createDataFrame([], new_schema)
    new_df =  spark.createDataFrame([], new_schema)
    
    for ulab in unique_label_values:
        # extract df of only the desired labels
        temp_df = dataset.where(F.col(label_col).isin(ulab))
        
        ratio = round(samples/temp_df.count(), 4)
        # sample it to desired number of samples
        if ratio > 1.0:
            # Oversample
            temp_df = temp_df.sample(True, ratio, seed=42)
        elif ratio < 1.0:
            # Undersample
            temp_df = temp_df.sample(False, ratio, seed=42)
        elif ratio == 1.0:
            # Just use the existing dataframe
            pass
        # concatenate it to the new_df
        new_df = new_df.union(temp_df)
        # unpersist the temp_df
        temp_df.unpersist()
    return new_df

##### Create Metrics Evaluation Function

In [0]:
def evaluate_with_spark_metrics(predictions: pyspark.sql.dataframe.DataFrame, \
                                metrics: [str], \
                                model_name: str \
                               ) -> None:
    '''
    Calculate & display metrics for a multiclass classification analysis.
    '''
    from pyspark.ml.evaluation import MulticlassClassificationEvaluator
    print("+---------------------------------------------+")
    print("|  " + model_name.center(41) + "  |")
    print("+---------------------------------------------+")
    print("|   %s  |  %s   |" % ("Metric".rjust(20), "Value".ljust(14)))
    print("+---------------------------------------------+")
    for x in metrics:
        evaluator = MulticlassClassificationEvaluator(labelCol="label", \
                                                      predictionCol="prediction", \
                                                      metricName=x) 
        score = evaluator.evaluate(predictions)
        print("|   %s  |  %s   |" % (x.rjust(20), str(round(score, 6)).ljust(14)))
        print("+---------------------------------------------+")

#### Ingest & Preprocess Datasets

##### Ingest Dataset

In [0]:
data_file = "/FileStore/tables/Language_Detection.csv"

orig_schema = StructType([
    StructField("text", StringType(), True),
    StructField("language", StringType(), True),
])

df = ingest_dataset(data_file, orig_schema)

display(df)

text,language
"Nature, in the broadest sense, is the natural, physical, material world or universe.",English
"""""""Nature"""" can refer to the phenomena of the physical world","and also to life in general."""
"The study of nature is a large, if not the only, part of science.",English
"Although humans are part of nature, human activity is often understood as a separate category from other natural phenomena.",English
"""[1] The word nature is borrowed from the Old French nature and is derived from the Latin word natura, or """"essential qualities","innate disposition"""""
"[2] In ancient philosophy, natura is mostly used as the Latin translation of the Greek word physis (φύσις), which originally related to the intrinsic characteristics that plants, animals, and other features of the world develop of their own accord.",English
[3][4],
The concept of nature as a whole,the physical universe
"During the advent of modern scientific method in the last several centuries, nature became the passive reality, organized and moved by divine laws.",English
"[5][6] With the Industrial revolution, nature increasingly became seen as the part of reality deprived from intentional intervention : it was hence considered as sacred by some traditions (Rousseau, American transcendentalism) or a mere decorum for divine providence or human history (Hegel, Marx).",English


##### Data Preprocessing (Part 1)

In [0]:
# Remove duplicates
df = df.dropDuplicates()

# Remove null values
df = df.na.drop(how='any')

df = df.withColumnRenamed("language", "label")

df = df.replace("Portugeese", "Portuguese", subset=["label"]) \
        .replace("Sweedish", "Swedish", subset=["label"])

# Add feature with word length of 'text' &  'language' features (individually)
df = df.withColumn("text_len", F.size(F.split(F.col("text"), " ")))
df = df.withColumn("language_len", F.size(F.split(F.col("label"), " ")))

# Remove samples with language feature that is more than one word
df = df.filter(F.col("language_len") < 2)

display(df)

text,label,text_len,language_len
"The number of active editors in English Wikipedia, by sharp comparison, was cited as peaking in 2007 at approximately 50,000 and dropping to 30,000 by the start of 2014.",English,29,1
[143] No comment was made concerning which of the differentiated edit policy standards from Wikipedia in other languages (non-English Wikipedia) would provide a possible alternative to English Wikipedia for effectively ameliorating substantial editor attrition rates on the English-language Wikipedia.,English,39,1
"The 'Wikipedia is not censored' policy has sometimes proved controversial: in 2008, Wikipedia rejected an online petition against the inclusion of images of Muhammad in the English edition of its Muhammad article, citing this policy.",English,35,1
"This pattern does not adhere to the common statistical definition of an outlier as a rare object, and many outlier detection methods (in particular, unsupervised algorithms) will fail on such data unless it has been aggregated appropriately.",English,37,1
i'll be right with you.,English,5,1
നല്ല പെരുമാറ്റവും സംസ്കാരവും സങ്കീർണ്ണവും.,Malayalam,4,1
പറയുക.,Malayalam,1,1
"നാർസിസ ആദ്യം പാടുപെട്ട വഴികൾ മാറ്റി, പക്ഷേ പതുക്കെ ക്ഷമയോടെ അവൾ ഒരുപാട് സുന്ദരിയാകാൻ തുടങ്ങി ഒരു ദിവസം മെല്ലിയും ടെറിയും മരിയനെ വീണ്ടും കാണാൻ വന്നു ഓ ഹലോ നിങ്ങൾ രണ്ടുപേരും അതിനാൽ മരിയൻ ഞങ്ങളോട് പറയുക.",Malayalam,28,1
இப்போதே.,Tamil,1,1
este é o meu erro.,Portuguese,5,1


Output can only be rendered in Databricks

##### Data Preprocessing (Part 2)

In [0]:
languages_for_analysis =  sorted(set(df.select("label").toPandas()['label']))

# Only retain samples with an actual language listed
df = df.filter(F.col("label").isin(languages_for_analysis))

# Get a list of numbers (as string data types) for 
number_values = [str(x) for x in range(len(languages_for_analysis))]

languages_converter = dict(zip(languages_for_analysis, number_values))

preds_converter = {'ar': '0', 'da': '1', 'nl': '2', 'en': '3', 'fr': '4', 'de': '5', 'el': '6', 'hi': '7', 'it': '8', 'kn': '9', 'ml': '10', 'pt': '11', 'ru': '12', 'es': '13', 'sv': '14', 'ta': '15', 'tr': '16'}

df = df.drop("language_len", "text_len")

df = df.persist()

##### Build Pipeline Stages

In [0]:
# Document Assembler
doc = DocumentAssembler() \
    .setInputCol("text") \
    .setOutputCol("document") \
    .setCleanupMode('shrink')

# Language Detector
language_detector = LanguageDetectorDL.pretrained("ld_wiki_tatoeba_cnn_95", "xx") \
    .setInputCols("document") \
    .setOutputCol("class")

ld_wiki_tatoeba_cnn_95 download started this may take some time.
Approximate size to download 7.9 MB
[ | ][OK!]


##### Build Pipeline

In [0]:
pipeline = Pipeline().setStages([
      doc,
      language_detector
])

##### Generate Predictions

In [0]:
preds = pipeline.fit(df).transform(df)

preds = preds.select(F.col("label"), F.col("class.result").alias("prediction"))
preds = preds.withColumn("prediction", F.col("prediction").getItem(0))

display(preds)

label,prediction
English,en
English,en
English,en
English,en
English,en
Malayalam,ml
Malayalam,ml
Malayalam,ml
Tamil,ta
Portuguese,pt


##### Prepare Predictions for Metrics Evaluation Function

In [0]:
preds = preds.replace(languages_converter, subset=["label"]).withColumn("label", F.col("label").cast(DoubleType()))

preds = preds.replace(preds_converter, subset=["prediction"]).withColumn("prediction", F.col("prediction").cast(DoubleType()))

preds = preds.fillna(-1)

preds = preds.persist()

display(preds)

label,prediction
3.0,3.0
3.0,3.0
3.0,3.0
3.0,3.0
3.0,3.0
10.0,10.0
10.0,10.0
10.0,10.0
15.0,15.0
11.0,11.0


##### Calculate & Display Metrics

In [0]:
metrics_to_eval = ["accuracy", "f1", "weightedPrecision", "weightedRecall"]

evaluate_with_spark_metrics(preds, \
                            metrics_to_eval, \
                            "Metrics for Cancer Classification")

+---------------------------------------------+
|      Metrics for Cancer Classification      |
+---------------------------------------------+
|                 Metric  |  Value            |
+---------------------------------------------+
|               accuracy  |  0.980445         |
+---------------------------------------------+
|                     f1  |  0.985507         |
+---------------------------------------------+
|      weightedPrecision  |  0.99085          |
+---------------------------------------------+
|         weightedRecall  |  0.980445         |
+---------------------------------------------+


##### End Spark Session

In [0]:
df = df.unpersist()
preds = preds.unpersist()

spark.stop()

### Notes & Other Takeaways From This Project
****
- With accuracy, F1 score, recall and precision values all above 0.98, I would say that the results are excellent. On top of that, it only took about a minute and a half to handle over 10,000 samples.
****