In [1]:
import os
import sys

import sparknlp

from sparknlp.base import *
from sparknlp.common import *
from sparknlp.annotator import *

from pyspark.ml import Pipeline
from pyspark.sql import SparkSession, Row

import pandas as pd

# Create the Spark session through Spark NLP's start() helper; every later
# cell uses this `spark` handle.
spark = sparknlp.start()

# Print the library versions so the recorded outputs can be tied to a
# specific Spark NLP / Apache Spark combination.
print("Spark NLP version: ", sparknlp.version())
print("Apache Spark version: ", spark.version)

# Bare expression: echoes the session object as the cell output.
spark

Spark NLP version:  4.0.1
Apache Spark version:  3.0.3


In [25]:
import warnings
warnings.filterwarnings('ignore')

In [2]:
# Scratch (title, description) sample tuples. Neither is bound to a name, so
# the first expression is discarded and only the last one is echoed as the
# cell output.
("project", "Experienced Data Scientist with a demonstrated history of working in the information technology and services industry. Skilled in Python (Programming Language), SQL and Data Science. Strong engineering professional graduated from Tbilisi State University.")
('project','We are seeking a full-time AP Clerk to support our accounting team. Job descriptions: Timely process vendor invoices for multi-entities Weekly payment process and positive pay uploading Maintain accurate vendor wire templates and process wire payments Create/import inbound reports Credit card transaction reconciliations Travel expense / employee reimbursement report Investigate aging AP and reconcile AP balance with vendors’ statements of accounts Maintain Fixed Assets log Ad hoc projects and misc. tasks Education and skills requirements: Associate degree in Accounting or Finance or high school diploma Very organized and detail oriented Positive work attitude and multi-task Proficient in Microsoft Office especially Excel; ERP experience (NetSuite) is preferred but not required. Others: This position will be in person to the office. Job Type: Full-time Pay: $40')

('project',
 'We are seeking a full-time AP Clerk to support our accounting team. Job descriptions: Timely process vendor invoices for multi-entities Weekly payment process and positive pay uploading Maintain accurate vendor wire templates and process wire payments Create/import inbound reports Credit card transaction reconciliations Travel expense / employee reimbursement report Investigate aging AP and reconcile AP balance with vendors’ statements of accounts Maintain Fixed Assets log Ad hoc projects and misc. tasks Education and skills requirements: Associate degree in Accounting or Finance or high school diploma Very organized and detail oriented Positive work attitude and multi-task Proficient in Microsoft Office especially Excel; ERP experience (NetSuite) is preferred but not required. Others: This position will be in person to the office. Job Type: Full-time Pay: $40')

In [3]:
# One hand-written (title, description) record used later as an inference
# example; 'project' is a placeholder title, not a real class.
columns = ["title", "description"]
data = [
    (
        'project',
        'designs and builds computer programs that power mobile devices, desktop computers, and even cars. They not only identify user needs but also create new applications for any given market while making improvements based on feedback from users.',
    )
]
# Kept because a later cell references `rdd`; the DataFrame below is built
# straight from the Python list.
rdd = spark.sparkContext.parallelize(data)
# Passing the column names as the schema names the inferred columns directly.
input_data = spark.createDataFrame(data, schema=columns)
input_data.head()

Row(title='project', description='designs and builds computer programs that power mobile devices, desktop computers, and even cars. They not only identify user needs but also create new applications for any given market while making improvements based on feedback from users.')

In [4]:
# Load the labelled postings: '@'-separated file whose first line is the header.
titleDF = spark.read.csv("data/training_all_data.csv", sep="@", header=True)

# Append the hand-written sample row. union() matches columns by position, so
# both frames must keep the (title, description) order.
huge_data = titleDF.union(input_data)

In [5]:
# Number of unique job descriptions in the training file.
(
    titleDF
    .select("description")
    .distinct()
    .count()
)

17479

In [6]:
# Rows whose title is missing.
# NOTE(review): this only builds a lazy DataFrame, so the cell echoes its repr
# rather than any rows; append .count() or .show() to actually inspect them.
titleDF.where(titleDF["title"].isNull())

DataFrame[title: string, description: string]

In [7]:
from pyspark.sql.functions import col

# The two most frequent titles, to gauge class imbalance.
(
    titleDF
    .groupBy("title")
    .count()
    .orderBy("count", ascending=False)
    .head(2)
)

[Row(title='softwar develop', count=4808), Row(title='technician', count=1565)]

In [8]:
from pyspark.ml.feature import CountVectorizer, HashingTF, IDF, OneHotEncoder, StringIndexer, VectorAssembler, SQLTransformer

In [9]:
%%time

document_assembler = DocumentAssembler() \
      .setInputCol("description") \
      .setOutputCol("document")
    
tokenizer = Tokenizer() \
      .setInputCols(["document"]) \
      .setOutputCol("token")
      
normalizer = Normalizer() \
      .setInputCols(["token"]) \
      .setOutputCol("normalized")

stopwords_cleaner = StopWordsCleaner()\
      .setInputCols("normalized")\
      .setOutputCol("cleanTokens")\
      .setCaseSensitive(False)

stemmer = Stemmer() \
      .setInputCols(["cleanTokens"]) \
      .setOutputCol("stem")

finisher = Finisher() \
      .setInputCols(["stem"]) \
      .setOutputCols(["token_features"]) \
      .setOutputAsArray(True) \
      .setCleanAnnotations(False)

countVectors = CountVectorizer(inputCol="token_features", outputCol="features", vocabSize=10000, minDF=5)

label_stringIdx = StringIndexer(inputCol = "title", outputCol = "label")

nlp_pipeline = Pipeline(
    stages=[document_assembler, 
            tokenizer,
            normalizer,
            stopwords_cleaner, 
            stemmer, 
            finisher,
            countVectors,
            label_stringIdx])

nlp_model = nlp_pipeline.fit(huge_data)
processed = nlp_model.transform(titleDF)

processed.count()

CPU times: user 74.6 ms, sys: 25.3 ms, total: 99.9 ms
Wall time: 27.1 s


17479

In [10]:
# Side-by-side view of the raw text and the stemmed tokens derived from it.
processed.select("description", "token_features").show(truncate=50)

+--------------------------------------------------+--------------------------------------------------+
|                                       description|                                    token_features|
+--------------------------------------------------+--------------------------------------------------+
|We are looking for a dependable Pharmacy Techni...|[look, depend, pharmaci, technician, process, f...|
|"Required License Type: CDL-B Required License ...|[requir, licens, typ, cdlb, requir, licens, end...|
|Working at the Y, you have the opportunity to s...|[work, y, opportun, strengthen, commun, chang, ...|
| Multifaceted, engagingÂ & challenging role.Â  ...|[multifacet, engagingâ, challeng, roleâ, conven...|
|"The High Companies began our work in 1931 with...|[high, compani, began, work, firmli, establish,...|
|JobID: 20564 Position Type: APS Recruitment/Lic...|[jobid, posit, typ, ap, recruitmentlicens, supp...|
|Summary To provide coverage for patient care fo...|[summari, pr

In [11]:
# Model inputs: sparse count vector and the indexed title for each posting.
processed.select("description", "features", "label").show()

+--------------------+--------------------+-----+
|         description|            features|label|
+--------------------+--------------------+-----+
|We are looking fo...|(10000,[0,1,3,4,6...|  1.0|
|"Required License...|(10000,[0,1,3,4,5...| 11.0|
|Working at the Y,...|(10000,[0,1,2,3,4...| 13.0|
| Multifaceted, en...|(10000,[0,1,2,5,8...|  9.0|
|"The High Compani...|(10000,[0,1,3,4,5...|  9.0|
|JobID: 20564 Posi...|(10000,[0,1,2,3,8...|  5.0|
|Summary To provid...|(10000,[1,3,4,9,1...| 16.0|
|Waverly Rehabilit...|(10000,[5,9,11,14...|  2.0|
|With minimum supe...|(10000,[4,12,13,1...| 14.0|
|Overview: Adams S...|(10000,[0,1,2,3,4...| 18.0|
|Position Summary....|(10000,[0,1,2,3,4...| 11.0|
|Position Summary....|(10000,[0,1,2,3,4...| 11.0|
|Position Summary....|(10000,[0,1,2,3,4...| 11.0|
|Position Summary....|(10000,[0,1,2,3,4...| 11.0|
|Position Summary....|(10000,[0,1,2,3,4...| 11.0|
|Position Summary....|(10000,[0,1,2,3,4...| 11.0|
|Position Summary....|(10000,[0,1,2,3,4...|  1.0|


In [12]:
# 80/20 train/test split; the fixed seed keeps the split reproducible.
trainingData, testData = processed.randomSplit([0.8, 0.2], seed=100)
print(f"Training Dataset Count: {trainingData.count()}")
print(f"Test Dataset Count: {testData.count()}")

Training Dataset Count: 13992
Test Dataset Count: 3487


In [13]:
# Inspect the columns carried by the training split; the Spark NLP annotation
# columns (document, token, ...) are still present alongside title/description.
trainingData.printSchema()

root
 |-- title: string (nullable = true)
 |-- description: string (nullable = true)
 |-- document: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-- value: string (valueContainsNull = true)
 |    |    |-- embeddings: array (nullable = true)
 |    |    |    |-- element: float (containsNull = false)
 |-- token: array (nullable = true)
 |    |-- element: struct (containsNull = true)
 |    |    |-- annotatorType: string (nullable = true)
 |    |    |-- begin: integer (nullable = false)
 |    |    |-- end: integer (nullable = false)
 |    |    |-- result: string (nullable = true)
 |    |    |-- metadata: map (nullable = true)
 |    |    |    |-- key: string
 |    |    |    |-

In [14]:
from pyspark.ml.classification import LogisticRegression

# Logistic regression on the count features; elasticNetParam=0 selects a pure
# L2 penalty with strength regParam.
lr = LogisticRegression(regParam=0.3, elasticNetParam=0, maxIter=10000)
lrModel = lr.fit(trainingData)

# Score the held-out split.
predictions = lrModel.transform(testData)

# Preview the test rows predicted as class 0, ordered by the probability column.
(
    predictions
    .where(predictions.prediction == 0)
    .select(["description", "title", "probability", "label", "prediction"])
    .orderBy("probability", ascending=False)
    .show(10, truncate=30)
)

+------------------------------+---------------+------------------------------+-----+----------+
|                   description|          title|                   probability|label|prediction|
+------------------------------+---------------+------------------------------+-----+----------+
|"Read what people are sayin...|softwar develop|[0.9998808347099905,1.58860...|  0.0|       0.0|
|Read what people are saying...|softwar develop|[0.9994606117093139,5.57152...|  0.0|       0.0|
|Read what people are saying...|softwar develop|[0.9994073934131825,1.99954...|  0.0|       0.0|
|Read what people are saying...|softwar develop|[0.9992017177489978,8.83071...|  0.0|       0.0|
|Read what people are saying...|softwar develop|[0.9989919517723896,2.05479...|  0.0|       0.0|
|Read what people are saying...|softwar develop|[0.9989147670347271,7.17656...|  0.0|       0.0|
|  Senior Software Developer...|softwar develop|[0.9989013461534338,2.80340...|  0.0|       0.0|
|Read what people are saying..

In [15]:
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

# Spark's multiclass evaluator reports the F1 score unless told otherwise;
# the defaults are spelled out so the number below is not mistaken for accuracy.
evaluator = MulticlassClassificationEvaluator(
    predictionCol="prediction",
    labelCol="label",
    metricName="f1",
)

evaluator.evaluate(predictions)

0.7468835433202499

In [16]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
# Collect labels and predictions in ONE Spark job. Two separate toPandas()
# calls re-run the whole pipeline twice and only line up row-by-row if both
# jobs happen to return rows in the same order.
label_prediction_pdf = predictions.select("label", "prediction").toPandas()

# Single-column pandas DataFrames, as the downstream cells expect
# (y_true.label / y_pred.prediction).
y_true = label_prediction_pdf[["label"]]
y_pred = label_prediction_pdf[["prediction"]]

In [17]:
# How often each class index is predicted on the test split.
y_pred["prediction"].value_counts()

0.0     1255
1.0      383
4.0      292
2.0      272
3.0      251
5.0      206
6.0      138
7.0      121
10.0      81
8.0       79
13.0      59
12.0      44
14.0      40
11.0      34
9.0       33
15.0      30
21.0      26
16.0      22
23.0      18
18.0      17
24.0      14
20.0      13
22.0      10
32.0       8
19.0       8
27.0       7
29.0       6
17.0       5
26.0       4
25.0       4
28.0       3
30.0       3
31.0       1
Name: prediction, dtype: int64

In [18]:
# Confusion matrix over integer class indices (rows: true label, columns:
# predicted label).
cnf_matrix = confusion_matrix(
    y_true=y_true["label"].astype(int).tolist(),
    y_pred=y_pred["prediction"].astype(int).tolist(),
)
cnf_matrix

array([[940,   3,   0, ...,   0,   0,   0],
       [ 20, 272,   5, ...,   0,   0,   0],
       [  2,   0, 216, ...,   0,   0,   0],
       ...,
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0],
       [  0,   0,   0, ...,   0,   0,   0]])

In [19]:
# Echo the DataFrame repr to list the columns available on `predictions`.
predictions

DataFrame[title: string, description: string, document: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, normalized: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, cleanTokens: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, stem: array<struct<annotatorType:string,begin:int,end:int,result:string,metadata:map<string,string>,embeddings:array<float>>>, token_features: array<string>, features: vector, label: double, rawPrediction: vector, probability: vector, prediction: double]

In [26]:
# Per-class precision/recall/F1 followed by the overall accuracy.
print(classification_report(y_true=y_true["label"], y_pred=y_pred["prediction"]))
print(accuracy_score(y_true=y_true["label"], y_pred=y_pred["prediction"]))

              precision    recall  f1-score   support

         0.0       0.75      0.98      0.85       955
         1.0       0.71      0.84      0.77       322
         2.0       0.79      0.95      0.87       227
         3.0       0.70      0.78      0.74       226
         4.0       0.70      0.89      0.79       230
         5.0       0.87      0.91      0.89       196
         6.0       0.89      0.91      0.90       135
         7.0       0.74      0.69      0.72       130
         8.0       0.90      0.60      0.72       118
         9.0       0.82      0.31      0.45        88
        10.0       0.94      0.78      0.85        97
        11.0       0.91      0.46      0.61        68
        12.0       0.77      0.49      0.60        69
        13.0       0.85      0.74      0.79        68
        14.0       0.85      0.58      0.69        59
        15.0       0.97      0.57      0.72        51
        16.0       0.95      0.60      0.74        35
        17.0       0.40    

# Testing

In [21]:
from pyspark.ml.linalg import Vectors,Matrix

In [22]:
# Lookup table from title to the numeric label index assigned by the pipeline.
trainingData.select("title", "label").distinct().show(n=50)

+--------------------+-----+
|               title|label|
+--------------------+-----+
|softwar develop i...| 29.0|
|             plumber| 34.0|
|    field technician| 19.0|
|       product owner| 24.0|
|        receptionist|  4.0|
|             cleaner| 37.0|
|       staff auntant| 39.0|
|             teacher|  5.0|
|       it specialist| 12.0|
|           physician| 16.0|
|softwar develop e...| 33.0|
|             trainer| 22.0|
|            attorney| 35.0|
|     softwar develop|  0.0|
|       custom servic|  7.0|
|  softwar develop ii| 20.0|
|               offic|  9.0|
|               coach| 11.0|
|       night auditor| 32.0|
|       social worker| 15.0|
|       human resourc| 13.0|
|          pharmacist| 36.0|
|full stack softwa...| 28.0|
|             of nurs| 23.0|
|            warehous| 10.0|
|       er technician| 31.0|
|            dishwash| 38.0|
|           architect|  8.0|
| net softwar develop| 30.0|
|  support technician| 26.0|
|            assessor| 27.0|
|             

In [23]:
# Rebuild the one-row sample DataFrame (built from the Python list `data`,
# despite the variable name) and run it through the fitted NLP pipeline.
dfFromRDD2 = spark.createDataFrame(data, schema=columns)
test = nlp_model.transform(dfFromRDD2)

In [24]:
# Score the single sample description with the fitted model. The result is a
# label index, not a title: 0.0 corresponds to 'softwar develop' in the
# title/label listing above.
# (The former `row = Matrix(rdd, 2)` line was removed: it passed an RDD as
# numRows to the abstract Matrix class and its result was never used.)
lrModel.predict(test.head().features)

0.0