In [1]:
import findspark

# Initialise findspark before the first pyspark import (which happens in the
# next cell). NOTE(review): presumably this is what makes the local Spark
# installation importable in this environment — confirm.
findspark.init()

In [2]:
from pyspark.sql import SparkSession


# One local Spark session shared by every cell of this notebook.
spark: SparkSession = (
    SparkSession.builder
    .master("local[*]")
    .appName("classifier_model")
    # Legacy time parser; presumably required so the timestamp pattern used
    # when reading the CSV below is accepted — TODO confirm.
    .config("spark.sql.legacy.timeParserPolicy", "LEGACY")
    .getOrCreate()
)

# Location for Spark checkpoint files.
spark.sparkContext.setCheckpointDir("../checkpoints/")

##### Load dataframe

In [3]:
from pyspark.sql import functions as f

# DDL schema of the headerless CSV. The adjacent literals are concatenated by
# Python into one comma-separated string, one fragment per column.
schema = (
    "polarity FLOAT, "
    "id LONG, "
    "date_time TIMESTAMP, "
    "query STRING, "
    "user STRING, "
    "text STRING"
)
# Pattern handed to the CSV reader as `timestampFormat` (applies to date_time,
# the only TIMESTAMP column in the schema).
timestampformat = "EEE MMM dd HH:mm:ss zzz yyyy"

# Reader pre-configured with the explicit schema, so no schema inference is needed.
spark_reader = spark.read.schema(schema)

In [4]:
# Load the cleaned CSV data and keep only the two columns the model needs:
# the raw text and its polarity label.
dataframe = (
    spark_reader.csv(
        path="../data/clean_static_data",
        header=False,
        quote='"',
        encoding="utf-8",
        timestampFormat=timestampformat,
    )
    .select("text", "polarity")
    .coalesce(2)  # reduce to at most two partitions
    .cache()  # the frame is reused by the preview, the split and the count cells
)

In [5]:
# Preview ten rows; show() truncates long text values in the printed table.
dataframe.limit(10).show()

+--------------------+--------+
|                text|polarity|
+--------------------+--------+
|goen to pass out ...|     0.0|
|Planned to go to ...|     0.0|
|aussiemcflyfan ye...|     0.0|
|Why am I up alrea...|     0.0|
|But now talking a...|     0.0|
|CorinaBecker It's...|     0.0|
|miss antonia me I...|     0.0|
|So bummed Tried t...|     0.0|
|aaaahhh coffee No...|     0.0|
|Dad had traumatic...|     0.0|
+--------------------+--------+



##### Split dataframe into train and test dataframes

In [6]:
# 98% / 2% train/test split. The seed is fixed so that the split — and every
# count and metric derived from it — is repeatable from run to run (given the
# same input data and partitioning). Without a seed each run draws a different
# split, so numbers printed by an earlier unseeded run will differ slightly.
train_df, test_df = dataframe.randomSplit([0.98, 0.02], seed=42)

In [7]:
# Class balance of the training split: number of rows per polarity value.
train_df.groupBy("polarity").count().show()

+--------+------+
|polarity| count|
+--------+------+
|     0.0|637375|
|     1.0|637034|
+--------+------+



In [8]:
# Class balance of the test split: number of rows per polarity value.
test_df.groupBy("polarity").count().show()

+--------+-----+
|polarity|count|
+--------+-----+
|     0.0|13038|
|     1.0|13014|
+--------+-----+



In [9]:
from pyspark.ml.feature import HashingTF, StopWordsRemover, Tokenizer, IDF
from pyspark.ml.classification import LogisticRegression
from pyspark.ml.pipeline import Pipeline
# from pyspark.ml.tuning import CrossValidator, ParamGridBuilder
from pyspark.ml.evaluation import MulticlassClassificationEvaluator

##### Define pipeline

In [10]:
# Stage 1: split the raw text column into tokens.
tokenizer = Tokenizer(inputCol="text", outputCol="raw_tokenized")

# Stage 2: drop the default English stop words plus a bare apostrophe token.
stopwords_remover = StopWordsRemover(
    inputCol="raw_tokenized",
    outputCol="tokenized",
    stopWords=StopWordsRemover.loadDefaultStopWords("english") + ["'"],
)

# Stage 3: hashed term frequencies. `numFeatures` is an integer parameter, so
# it is given an int literal instead of the float 1e5 (same value, 100000).
hash_term_freq = HashingTF(numFeatures=100_000, inputCol="tokenized", outputCol="tf")

# Stage 4: IDF re-weighting; terms appearing in fewer than 5 documents are ignored.
idf = IDF(inputCol="tf", outputCol="features", minDocFreq=5)

# Stage 5: classifier. `featuresCol` is spelled out so the link to the IDF
# output column is visible ("features" is also the default).
lr_classifier = LogisticRegression(
    featuresCol="features",
    labelCol="polarity",
    maxIter=100,
    regParam=0.1,
)

# The (misspelt) name `pipline` is kept because later cells refer to it.
pipline = Pipeline(stages=[tokenizer, stopwords_remover, hash_term_freq, idf, lr_classifier])

##### To save time, the grid search below is commented out. Uncomment it (together with the `CrossValidator` / `ParamGridBuilder` import) when running on a more powerful machine.

In [11]:
# Optional hyper-parameter search, disabled to keep the run time short.
# To enable it, uncomment this grid, the CrossValidator at the bottom of the
# cell, and the `pyspark.ml.tuning` import.
# The grid has 4 * 4 * 4 * 3 = 192 combinations, each fitted on 5 folds.
# Grid values use literals matching each param's type (int / float).
# params = (
#     ParamGridBuilder()
#     .addGrid(hash_term_freq.numFeatures, [1_000, 10_000, 100_000, 1_000_000])
#     .addGrid(idf.minDocFreq, [0, 3, 5, 10])
#     .addGrid(lr_classifier.regParam, [0.01, 0.1, 1.0, 2.0])
#     .addGrid(lr_classifier.maxIter, [20, 50, 100])
#     .build()
# )

# Evaluators used on the test-set predictions; all read the "polarity" label column.
precision_evaluator = MulticlassClassificationEvaluator(labelCol="polarity", metricName="weightedPrecision")
recall_evaluator = MulticlassClassificationEvaluator(labelCol="polarity", metricName="weightedRecall")
f1_evaluator = MulticlassClassificationEvaluator(labelCol="polarity", metricName="f1")

# The cross-validator needs one of the evaluators defined above (F1 is used
# here); it previously referenced `evaluator`, a name that is never defined,
# so uncommenting it raised a NameError.
# cv = CrossValidator(
#     estimator=pipline, estimatorParamMaps=params, evaluator=f1_evaluator, numFolds=5
# )

##### Train model

In [12]:
# Fit all pipeline stages on the training split; `transformer` is the fitted
# pipeline model used for prediction and saving below.
transformer = pipline.fit(train_df)

In [13]:
# The classifier is the last pipeline stage, so its fitted model is the last
# stage of the fitted pipeline; its summary holds the training statistics.
lr_model = transformer.stages[-1]
train_summary = lr_model.summary

# Objective (loss) value recorded at each optimisation iteration, one per line.
print("loss:")
for iteration_loss in train_summary.objectiveHistory:
    print(iteration_loss)

loss:
0.693147144761647
0.5380875983772021
0.5261101520811851
0.5245941650516408
0.5244850323217729
0.5244412111137922
0.5244363525386133
0.5244349870449421
0.5244347353991443
0.5244347250624964
0.5244347150426151
0.5244347143868816
0.5244347140033999
0.5244347139881549
0.5244347139600913
0.5244347139584957
0.5244347139575722


##### Check results for train dataframe

In [14]:
# Weighted metrics on the training data, taken from the training summary.
# weightedFMeasure is a method call; the other two are plain attributes.
fMeasure, precision, recall = (
    train_summary.weightedFMeasure(),
    train_summary.weightedPrecision,
    train_summary.weightedRecall,
)
print(f"F-measure: {fMeasure}\nPrecision: {precision}\nRecall: {recall}")

F-measure: 0.7983109615867954
Precision: 0.7984692989188622
Recall: 0.7983331881680057


##### Check results for test dataframe

In [15]:
# Score the held-out split with the fitted pipeline.
predictions = transformer.transform(test_df)

# Evaluate in the order F1, precision, recall so the unpacking lines up with
# the names on the left-hand side.
fMeasure, precision, recall = (
    metric_evaluator.evaluate(predictions)
    for metric_evaluator in (f1_evaluator, precision_evaluator, recall_evaluator)
)
print(f"F-measure: {fMeasure}\nPrecision: {precision}\nRecall: {recall}")

F-measure: 0.7639744342068644
Precision: 0.7641897014846722
Recall: 0.7640104406571473


##### Save model

In [16]:
# Persist the fitted pipeline model, overwriting anything already saved at this path.
transformer.write().overwrite().save('../model2')

In [17]:
# Shut down the Spark session; this is the last cell of the notebook.
spark.stop()