Before using this notebook, set **datalake_name** to the name of your Data Lake Store account, both in the Python variable and in any file system command that spells the name out.


In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Font settings applied globally to every matplotlib figure in this notebook.
font = dict(size=12)
matplotlib.rc('font', **font)

In [None]:
# Data Lake Store account name; later cells interpolate it into the
# adl://<name>.azuredatalakestore.net/ URIs used for mounting and reading.
datalake_name = 'cadlstoreev6d3ekjm3w7e'

In [None]:
# List the root of the lake through dbutils so the account name is taken from
# `datalake_name` rather than being spelled out a second time in this cell.
display(dbutils.fs.ls(f'adl://{datalake_name}.azuredatalakestore.net/'))

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
# Locations of the two parquet inputs read by the next cell.
questions_with_cleaned_text_columns_output_path = (
    f'adl://{datalake_name}.azuredatalakestore.net/questions_with_cleaned_text_columns.parquet'
)
most_popular_questions_with_unique_tag_path = (
    f'adl://{datalake_name}.azuredatalakestore.net/most_popular_questions_with_unique_tag.parquet'
)

In [None]:
# Load both parquet inputs as Spark DataFrames, in the same order as their paths.
questions_with_cleaned_text_columns_df, most_popular_questions_with_unique_tag_df = (
    spark.read.parquet(parquet_path)
    for parquet_path in (
        questions_with_cleaned_text_columns_output_path,
        most_popular_questions_with_unique_tag_path,
    )
)

## The most common words in question titles, excluding stopwords

In [None]:
# Count how often each word occurs across the 'TitleWithout' column, most frequent first.
# Splitting on a single space yields empty-string tokens wherever the text has leading,
# trailing or repeated whitespace, and those would be counted as a "word". Split on
# whitespace runs and drop empty tokens so only real words are ranked.
most_frequent_words_in_titles = questions_with_cleaned_text_columns_df \
    .withColumn('Word', explode(split(col('TitleWithout'), '\\s+'))) \
    .filter(col('Word') != '') \
    .groupBy('Word') \
    .count() \
    .sort(desc('count'))

most_frequent_words_in_titles.show()

## The most common words in question titles, excluding stopwords - plot

In [None]:
# Ten most frequent words as a pandas frame, re-sorted ascending so that the most
# frequent word is drawn as the top bar of the horizontal chart.
top_ten_most_frequent_words_in_titles_pandas = most_frequent_words_in_titles \
    .limit(10) \
    .sort(asc('count')) \
    .toPandas()

word_labels = top_ten_most_frequent_words_in_titles_pandas['Word']
word_counts = top_ten_most_frequent_words_in_titles_pandas['count']

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(word_labels, word_counts)
ax.ticklabel_format(axis='x', style='plain')

# Tick range is derived from the plotted counts themselves; this notebook defines no
# `most_popular_tags_pandas_df`, so reading it here fails on a fresh kernel.
# Bounds are rounded to the nearest 100 000 and the axis is divided into five steps.
word_count_min_tick = word_counts.min().round(decimals=-5)
word_count_max_tick = word_counts.max().round(decimals=-5)
word_count_tick_step = word_count_max_tick / 5

# When the largest count rounds to 0 (or the frame is empty) the step is not positive
# and np.arange would fail; keep matplotlib's automatic ticks in that case.
if word_count_tick_step > 0:
    ax.set_xticks(np.arange(word_count_min_tick, word_count_max_tick + word_count_tick_step, step=word_count_tick_step))
ax.tick_params(axis='x', labelrotation=45)

ax.set_ylabel('Słowa')
ax.set_xlabel('Ilość wystąpień', labelpad=20.0)
ax.set_title('Najczęściej pojawiające się słowa w tytułach pytań \n według danych z portalu stackoverflow.com')
fig.savefig('top_ten_most_frequent_words_in_titles.png', facecolor='white')
plt.show()

## Modelling Stack Overflow data for auto-tagging: predicting the tag assigned to a question from its title and body

In [None]:
# Add the text column used further down: title and body joined by a single space.
questions_for_model = questions_with_cleaned_text_columns_df.withColumn(
    'TitleWithBodyOfQuestion',
    concat_ws(' ', col('Title'), col('Body')),
)

questions_for_model.show()

In [None]:
# Inner-join each question to its row in the popular-tag frame on Id, keeping every
# question column plus the Tag column.
questions_for_model_with_tag = (
    questions_for_model.alias('q')
    .join(
        most_popular_questions_with_unique_tag_df.alias('mq'),
        on=col('q.Id') == col('mq.Id'),
        how='inner',
    )
    .select('q.*', 'mq.Tag')
)

questions_for_model_with_tag.show()

In [None]:
# Keep only the rows tagged '<javascript>', drop the raw text columns, and strip the
# angle brackets from the tag value.
javascript_questions_for_model_df = (
    questions_for_model_with_tag
    .drop('Body', 'Title', 'Tags')
    .where(col('Tag') == '<javascript>')
    .withColumn('Tag', translate(col('Tag'), '<>', ''))
)

In [None]:
# Preview the filtered frame (first 20 rows, long cell values truncated).
javascript_questions_for_model_df.show(n=20, truncate=True)

In [None]:
# Label is 1 when the combined title+body text contains the tag value, otherwise 0.
# NOTE(review): `contains` is a case-sensitive substring match — confirm that is intended.
javascript_questions_for_model_with_label = javascript_questions_for_model_df.withColumn(
    'Label',
    when(col('TitleWithBodyOfQuestion').contains(col('Tag')), 1).otherwise(0),
)

javascript_questions_for_model_with_label.show()

In [None]:
from pyspark.sql.functions import regexp_replace
from pyspark.ml.feature import Tokenizer

# Text clean-up, applied in this order: punctuation -> space, digits -> space,
# then runs of spaces collapsed to a single space.
text_without_punctuation = regexp_replace(col('TitleWithBodyOfQuestion'), '[_():;,.!?\\-]', ' ')
text_without_digits = regexp_replace(text_without_punctuation, '[0-9]', ' ')
text_normalized = regexp_replace(text_without_digits, ' +', ' ')

# Store the cleaned text in 'Text' and tokenize it into the 'Words' column.
tokenizer = Tokenizer(inputCol='Text', outputCol='Words')
questions_for_model_train = tokenizer.transform(
    javascript_questions_for_model_with_label.withColumn('Text', text_normalized)
)

questions_for_model_train.show(truncate=False)

In [None]:
from pyspark.ml.feature import StopWordsRemover, HashingTF, IDF

# Feature pipeline: Words -> Terms (stop words removed) -> Hash (1024-bucket term
# frequencies) -> Features (IDF-weighted).
stop_words_remover = StopWordsRemover(inputCol='Words', outputCol='Terms')
hashing_tf = HashingTF(inputCol='Terms', outputCol='Hash', numFeatures=1024)

questions_for_model_train = hashing_tf.transform(
    stop_words_remover.transform(questions_for_model_train)
)

# NOTE(review): the IDF model is fitted on the whole frame; the train/test split
# only happens in a later cell.
idf_model = IDF(inputCol='Hash', outputCol='Features').fit(questions_for_model_train)
questions_for_model_train_idf = idf_model.transform(questions_for_model_train)

questions_for_model_train_idf.select('Terms', 'Features').show(truncate=False)

In [None]:
from pyspark.ml.classification import LogisticRegression

# 80/20 train/test split with a fixed seed.
questions_train, questions_test = questions_for_model_train_idf.randomSplit([0.8, 0.2], seed=13)

# Fit a regularised logistic regression on the training part and score the test part.
logistic_estimator = LogisticRegression(labelCol='Label', featuresCol='Features', regParam=0.2)
logistic = logistic_estimator.fit(questions_train)
predictions = logistic.transform(questions_test)

# Confusion counts: number of test rows per (Label, prediction) pair.
predictions.groupBy('Label', 'prediction').count().show()

In [None]:
from pyspark.ml.evaluation import BinaryClassificationEvaluator

# Evaluate the test-set predictions with the evaluator's default metric.
evaluator = BinaryClassificationEvaluator(labelCol='Label')
test_metric_value = evaluator.evaluate(predictions)
print('Poziom poniżej ROC', test_metric_value)

## Modelling Stack Overflow data for auto-tagging: predicting the tag assigned to a question from its title and body - coefficients plot

In [None]:
# Model coefficients in ascending order, drawn as a single line.
model_coefficients = np.sort(logistic.coefficients)

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.plot(model_coefficients)
ax.set_ylabel('Coefficients')
ax.set_title('Regresja logistyczna')

fig.savefig('Coefficients_LogisticRegression.png', facecolor='white')
plt.show()

In [None]:
# Summary object of the fitted model; the ROC and precision/recall cells below read
# their curves from it.
training_summary = logistic.summary

## Modelling Stack Overflow data for auto-tagging: predicting the tag assigned to a question from its title and body - false-positive rate vs. true-positive rate of the model (plot)


In [None]:
# ROC curve of the model on the training data, with the matching area-under-ROC value.
model_roc = training_summary.roc.toPandas()
training_area_under_roc = training_summary.areaUnderROC

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.plot(model_roc['FPR'], model_roc['TPR'])

ax.set_xlabel('Wskaźnik False Positive')
ax.set_ylabel('Wskaźnik True Positive')
# The value in the title is taken from the summary instead of being typed in by hand,
# so the saved figure always agrees with the model that was actually fitted.
ax.set_title(f'Regresja logistyczna - krzywa ROC \n Model treningowy poniżej poziomu ROC: {training_area_under_roc}')
fig.savefig('ROC_LogisticRegression.png', facecolor='white')
plt.show()
print('Model treningowy poniżej poziomu ROC: ' + str(training_area_under_roc))

## Modelling Stack Overflow data for auto-tagging: predicting the tag assigned to a question from its title and body - precision vs. recall of the model (plot)

In [None]:
# Precision/recall curve from the training summary: recall on x, precision on y.
model_precision = training_summary.pr.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.plot(model_precision['recall'], model_precision['precision'])
ax.set_ylabel('Metryka precyzji')
ax.set_xlabel('Metryka przywołania')
ax.set_title('Regresja logistyczna - metryki wydajności')

fig.savefig('model_precision_LogisticRegression.png', facecolor='white')
plt.show()

## Modelling Stack Overflow data for auto-tagging: predicting the tag assigned to a question from its title and body - prediction results of the model

In [None]:
# Accuracy on the test predictions: share of rows whose prediction equals the label.
correct_prediction_count = predictions.filter(predictions.Label == predictions.prediction).count()
total_prediction_count = predictions.count()
accuracy = correct_prediction_count / float(total_prediction_count)

# Sample of scored rows followed by the two test-set metrics.
predictions.select('Id', 'Label', 'Terms', 'rawPrediction', 'probability', 'prediction').show(10)
print('Ewaluacja modelu testowego poniżej poziomu ROC: ', evaluator.evaluate(predictions))
print('Dokładność modelu testowego: ', accuracy)