Before you start using this notebook, change **datalake_name** in the Python variable and in the file system command.

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import numpy as np
import pandas as pd
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Global matplotlib font settings applied to every figure drawn below.
font = dict(size=12)

matplotlib.rc('font', **font)

In [None]:
# Name of the Azure Data Lake Store account; it is interpolated into the
# adl:// URLs built in the cells below.
datalake_name = 'cadlstorejdlf4tf3x3bwg'


In [None]:
# List the root of the data lake. Same listing as `%fs ls`, but the account name
# comes from datalake_name, so it only has to be changed in one place.
display(dbutils.fs.ls(f'adl://{datalake_name}.azuredatalakestore.net/'))

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
# Root URL of the data lake; every dataset read by this notebook lives directly under it.
datalake_root = f'adl://{datalake_name}.azuredatalakestore.net'

posts_output_path = f'{datalake_root}/posts.parquet'

# Datasets for the most popular tags.
most_popular_questions_with_unique_tag_path = f'{datalake_root}/most_popular_questions_with_unique_tag.parquet'
most_popular_answers_with_unique_tag_path = f'{datalake_root}/most_popular_answers_with_unique_tag.parquet'
most_popular_accepted_answers_with_unique_tag_path = f'{datalake_root}/most_popular_accepted_answers_with_unique_tag.parquet'
users_answers_with_most_popular_tags_parquet_path = f'{datalake_root}/users_answers_with_most_popular_tags.parquet'
# The original line was missing its closing quote, which made the whole cell a SyntaxError.
users_questions_and_answers_with_most_popular_tags_parquet_path = f'{datalake_root}/users_questions_and_answers_with_most_popular_tags.parquet'

# Datasets for the lowest popular tags.
lowest_popular_questions_with_unique_tag_path = f'{datalake_root}/lowest_popular_questions_with_unique_tag.parquet'
lowest_popular_answers_with_unique_tag_path = f'{datalake_root}/lowest_popular_answers_with_unique_tag.parquet'
lowest_popular_accepted_answers_with_unique_tag_path = f'{datalake_root}/lowest_popular_accepted_answers_with_unique_tag.parquet'
users_answers_with_lowest_popular_tags_parquet_path = f'{datalake_root}/users_answers_with_lowest_popular_tags.parquet'
users_questions_and_answers_with_lowest_popular_tags_parquet_path = f'{datalake_root}/users_questions_and_answers_with_lowest_popular_tags.parquet'

In [None]:
# Reuse the currently active SparkSession, or create one when none exists yet.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load every input dataset from the data lake as a Spark DataFrame.
read_parquet = spark.read.parquet

posts_df = read_parquet(posts_output_path)

# Most popular tags.
most_popular_questions_with_unique_tag_df = read_parquet(most_popular_questions_with_unique_tag_path)
most_popular_answers_with_unique_tag_df = read_parquet(most_popular_answers_with_unique_tag_path)
most_popular_accepted_answers_with_unique_tag_df = read_parquet(most_popular_accepted_answers_with_unique_tag_path)
users_answers_with_most_popular_tags_df = read_parquet(users_answers_with_most_popular_tags_parquet_path)
users_questions_and_answers_with_most_popular_tags_df = read_parquet(users_questions_and_answers_with_most_popular_tags_parquet_path)

# Lowest popular tags.
lowest_popular_questions_with_unique_tag_df = read_parquet(lowest_popular_questions_with_unique_tag_path)
lowest_popular_answers_with_unique_tag_df = read_parquet(lowest_popular_answers_with_unique_tag_path)
lowest_popular_accepted_answers_with_unique_tag_df = read_parquet(lowest_popular_accepted_answers_with_unique_tag_path)
users_answers_with_lowest_popular_tags_df = read_parquet(users_answers_with_lowest_popular_tags_parquet_path)
users_questions_and_answers_with_lowest_popular_tags_df = read_parquet(users_questions_and_answers_with_lowest_popular_tags_parquet_path)

In [None]:
# Split posts by type: rows with _PostTypeId 1 are used as questions, 2 as answers.
post_type_id = col('_PostTypeId')

questions_df = posts_df.where(post_type_id == 1)
answers_df = posts_df.where(post_type_id == 2)

## Percentage of questions in which a non-accepted answer is the highest-scored answer

In [None]:
# Accepted answers: answers whose Id is referenced by a question's AcceptedAnswerId,
# kept together with the owning question's Id and the answer's score.
accepted_answers_df = answers_df.alias('a').join(questions_df.alias('q'), col('a.Id') == col('q.AcceptedAnswerId')) \
    .select(col('q.Id').alias('AcceptedAnswerQuestionId'), col('a.Id').alias('AcceptedAnswerId'), col('a.Score').alias('AcceptedAnswerScore'))

# Highest score among the non-accepted ("regular") answers of each question.
# The left_anti join keeps only the answers that have no match in accepted_answers_df.
# The original final select referenced 'a._ParentId', a column that no longer exists
# after selecting and grouping by 'ParentId'; the question id is now aliased once,
# up front, and the aggregate column is renamed directly.
# NOTE(review): assumes the posts schema exposes un-prefixed 'ParentId' / 'Score' /
# 'Id' columns, as the rest of this cell does — confirm against posts.parquet.
regular_answers_highest_score_without_accepted_per_question = answers_df \
    .alias('a') \
    .join(accepted_answers_df.alias('aa'), col('a.Id') == col('aa.AcceptedAnswerId'), how='left_anti') \
    .select(col('a.ParentId').alias('RegularAnswerQuestionId'), col('a.Score').alias('RegularAnswerScore')) \
    .groupBy('RegularAnswerQuestionId') \
    .max('RegularAnswerScore') \
    .withColumnRenamed('max(RegularAnswerScore)', 'HighestRegularAnswerScore')

In [None]:
# Total number of accepted answers.
number_of_accepted_answers = accepted_answers_df.count()

# Pair every accepted answer with the best regular answer of the same question ...
accepted_vs_best_regular_df = accepted_answers_df.alias('aa').join(
    regular_answers_highest_score_without_accepted_per_question.alias('ra'),
    col('aa.AcceptedAnswerQuestionId') == col('ra.RegularAnswerQuestionId'),
)

# ... and count the questions where that regular answer outscores the accepted one.
number_of_regular_answers_score_that_is_higher_than_accepted_answer_score = accepted_vs_best_regular_df \
    .where(col('ra.HighestRegularAnswerScore') > col('aa.AcceptedAnswerScore')) \
    .count()

# Everything else: the accepted answer is not outscored by any regular answer.
number_of_accepted_answers_score_is_highest_in_question = (
    number_of_accepted_answers
    - number_of_regular_answers_score_that_is_higher_than_accepted_answer_score
)

# Sanity check: the two groups must add up to the total, so this prints 0.
print(f'Check {number_of_accepted_answers - number_of_accepted_answers_score_is_highest_in_question - number_of_regular_answers_score_that_is_higher_than_accepted_answer_score }') #0

## Percentage of questions in which a non-accepted answer is the highest-scored answer - PLOT

In [None]:
# Pie chart comparing the two groups counted above; the figure is also saved as a PNG.
labels = (
    'Najwyżej ocenia odpowiedź w pytaniu \n to zaakceptowana odpowiedź',
    'Najwyżej ocenia odpowiedź w pytaniu \n to nie zaakceptowana odpowiedź',
)
sizes = [
    number_of_accepted_answers_score_is_highest_in_question,
    number_of_regular_answers_score_that_is_higher_than_accepted_answer_score,
]
explode = (0, 0.1)  # offset the second wedge from the centre

fig1, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.pie(sizes, explode=explode, autopct='%1.1f%%')
ax.axis('equal')  # equal aspect ratio keeps the pie circular
ax.legend(labels, loc='center left', bbox_to_anchor=(1, 0, 0.5, 1))
ax.set_title('Procentowa ilość przypadków kiedy \n najwyżej oceniana odpowiedź to nie zaakceptowana odpowiedź \n według danych z portalu stackoverflow.com')

plt.savefig('comparision_of_number_of_accepted_answers_highest_score_to_highest_score_regular_answer_in_question.png', facecolor='white')
plt.show()

## Score distribution of accepted and non-accepted answers - Average, Standard Deviation, Minimum, Maximum

In [None]:
# Accepted answers (answers referenced by a question's AcceptedAnswerId) with their score.
# The original selected 'a._Score' here while every other cell reads 'a.Score';
# the column name is aligned with the rest of the notebook.
# NOTE(review): assumes the score column is named 'Score' in posts.parquet — confirm.
accepted_answers_df = answers_df.alias('a').join(questions_df.alias('q'), col('a.Id') == col('q.AcceptedAnswerId')) \
    .select(col('a.Id').alias('AcceptedAnswerId'), col('a.Score').alias('AcceptedAnswerScore'))

# Compute all four statistics in a single Spark job instead of one job per statistic.
# max / min / avg / stddev are the pyspark.sql.functions versions brought in by the
# star import at the top of the notebook.
accepted_answers_score_stats = accepted_answers_df \
    .agg(max('AcceptedAnswerScore').alias('HighestScoreInAllAcceptedAnswers'),
         min('AcceptedAnswerScore').alias('LowestScoreInAllAcceptedAnswers'),
         avg('AcceptedAnswerScore').alias('AverageScoreInAllAcceptedAnswers'),
         stddev('AcceptedAnswerScore').alias('StandardDeviationScoreInAllAcceptedAnswers')) \
    .first()

max_score_in_accepted_answers_value = accepted_answers_score_stats['HighestScoreInAllAcceptedAnswers']

print(f'Max Score in All Accepted Answers: {max_score_in_accepted_answers_value}')

min_score_in_accepted_answers_value = accepted_answers_score_stats['LowestScoreInAllAcceptedAnswers']

print(f'Min Score in All Accepted Answers: {min_score_in_accepted_answers_value}')

avg_score_in_accepted_answers_value = accepted_answers_score_stats['AverageScoreInAllAcceptedAnswers']

print(f'Average Score in All Accepted Answers: {np.round(avg_score_in_accepted_answers_value)}')

standard_deviation_score_in_accepted_answers_value = accepted_answers_score_stats['StandardDeviationScoreInAllAcceptedAnswers']

print(f'Standard Deviation Score in All Accepted Answers: {np.round(standard_deviation_score_in_accepted_answers_value)}')

In [None]:
# Regular (non-accepted) answers: the left_anti join keeps only the answers whose Id
# does not appear as an AcceptedAnswerId in accepted_answers_df.
regular_answers_df = (
    answers_df.alias('a')
    .join(accepted_answers_df.alias('aa'), col('a.Id') == col('aa.AcceptedAnswerId'), how='left_anti')
    .select(col('a.Id').alias('RegularAnswerId'), col('a.Score').alias('RegularAnswerScore'))
)

In [None]:
# Score statistics of the regular (non-accepted) answers, computed in a single Spark
# job instead of one job per statistic. max / min / avg / stddev are the
# pyspark.sql.functions versions brought in by the star import at the top of the notebook.
regular_answers_score_stats = regular_answers_df \
    .agg(max('RegularAnswerScore').alias('HighestScoreInAllRegularAnswers'),
         min('RegularAnswerScore').alias('LowestScoreInAllRegularAnswers'),
         avg('RegularAnswerScore').alias('AverageScoreInAllRegularAnswers'),
         stddev('RegularAnswerScore').alias('StandardDeviationScoreInAllRegularAnswers')) \
    .first()

# The printed labels previously said "Accepted Answers" (copy-paste from the
# accepted-answers cell) although these values describe the regular answers.
max_score_in_regular_answers_value = regular_answers_score_stats['HighestScoreInAllRegularAnswers']

print(f'Max Score in All Regular Answers: {max_score_in_regular_answers_value}')

min_score_in_regular_answers_value = regular_answers_score_stats['LowestScoreInAllRegularAnswers']

print(f'Min Score in All Regular Answers: {min_score_in_regular_answers_value}')

avg_score_in_regular_answers_value = regular_answers_score_stats['AverageScoreInAllRegularAnswers']

print(f'Average Score in All Regular Answers: {np.round(avg_score_in_regular_answers_value)}')

standard_deviation_score_in_regular_answers_value = regular_answers_score_stats['StandardDeviationScoreInAllRegularAnswers']

print(f'Standard Deviation Score in All Regular Answers: {np.round(standard_deviation_score_in_regular_answers_value)}')

## Score distribution of accepted and non-accepted answers - Average, Standard Deviation, Minimum, Maximum - BOXPLOT

In [None]:
# Box plots of answer scores for accepted vs. regular answers, drawn twice side by
# side: the left panel leaves outlier drawing at matplotlib's default, the right
# panel hides outliers (showfliers=False).
accepted_answers_pandas_df = accepted_answers_df.toPandas()
regular_answers_pandas_df = regular_answers_df.toPandas()

# Mean and standard deviation rounded to two decimals; row 0 = accepted, row 1 = regular.
average_score_values = pd.DataFrame([
    np.around(avg_score_in_accepted_answers_value, decimals = 2),
    np.around(avg_score_in_regular_answers_value, decimals = 2),
])
standard_deviation_score_values = pd.DataFrame([
    np.around(standard_deviation_score_in_accepted_answers_value, decimals = 2),
    np.around(standard_deviation_score_in_regular_answers_value, decimals = 2),
])

fig, ax = plt.subplots(1, 2, figsize=(10, 10), facecolor='white', dpi=100)

score_columns = [
    accepted_answers_pandas_df['AcceptedAnswerScore'],
    regular_answers_pandas_df['RegularAnswerScore'],
]

for panel, extra_boxplot_options in ((ax[0], {}), (ax[1], {'showfliers': False})):
    standard_distribution_plot = panel.boxplot(score_columns, showmeans=True, **extra_boxplot_options)

    # Label each box with its mean and standard deviation, anchored at the second
    # point (index 1) of the box's median line.
    for i, line in enumerate(standard_distribution_plot['medians']):
        x, y = line.get_xydata()[1]
        text = f' μ:{average_score_values[0][i]} \n σ:{standard_deviation_score_values[0][i]}'
        panel.annotate(text, xy=(x, y))

    panel.set_xticklabels(['Zaakceptowane', 'Pozostałe'])
    panel.set_xlabel('Rodzaj odpowiedzi', labelpad= 10.0)

# Only the left panel carries the y-axis label.
ax[0].set_ylabel('Ocena odpowiedzi', labelpad= 5.0)

plt.suptitle('Rozkład ocen odpowiedzi zaakceptowanych i pozostałych \n według danych z portalu stackoverflow.com \n gdzie: μ: Średnia z ocen odpowiedzi, σ: Odchylenie standardowe z ocen odpowiedzi')
plt.savefig('distribution_between_accepted_and_regular_answers_based_on_score.png', facecolor='white')
plt.show()

## Average response time from question creation to accepted answer

In [None]:
def _accepted_answer_response_time_df(questions_with_tag_df, accepted_answers_with_tag_df):
    """Pair each question with its accepted answer and compute the response time.

    Parameters:
        questions_with_tag_df: DataFrame providing `Tag`, `CreationDate` and
            `AcceptedAnswerId` columns.
        accepted_answers_with_tag_df: DataFrame providing `Id` and `CreationDate` columns.

    Returns:
        A DataFrame with `Tag`, `QuestionCreationDate`, `AcceptedAnswerCreationDate`
        and the response time as `AcceptedAnswerResponseTimeSeconds`, plus the rounded
        `...InMinutes`, `...InHours` and `...InDays` variants, sorted by response
        time in descending order.
    """
    response_seconds = col('AcceptedAnswerResponseTimeSeconds')

    # NOTE(review): assumes CreationDate is a timestamp, so cast('long') yields
    # epoch seconds and the difference is a duration in seconds — confirm.
    # `round` is pyspark.sql.functions.round (star import), not the Python builtin.
    return questions_with_tag_df.alias('q') \
        .join(accepted_answers_with_tag_df.alias('a'), col('q.AcceptedAnswerId') == col('a.Id')) \
        .select(col('q.Tag'), col('q.CreationDate').alias('QuestionCreationDate'), col('a.CreationDate').alias('AcceptedAnswerCreationDate')) \
        .withColumn('AcceptedAnswerResponseTimeSeconds', col('AcceptedAnswerCreationDate').cast('long') - col('QuestionCreationDate').cast('long')) \
        .withColumn('AcceptedAnswerResponseTimeInMinutes', round(response_seconds/60)) \
        .withColumn('AcceptedAnswerResponseTimeInHours', round(response_seconds/3600)) \
        .withColumn('AcceptedAnswerResponseTimeInDays', round(response_seconds/(24*3600))) \
        .sort(desc('AcceptedAnswerResponseTimeSeconds'), desc('AcceptedAnswerResponseTimeInMinutes'), desc('AcceptedAnswerResponseTimeInHours'), desc('AcceptedAnswerResponseTimeInDays'))


# The same pipeline was previously copy-pasted for both tag groups.
top_most_popular_questions_accepted_answer_response_time_df = _accepted_answer_response_time_df(
    most_popular_questions_with_unique_tag_df,
    most_popular_accepted_answers_with_unique_tag_df,
)

top_lowest_popular_questions_accepted_answer_response_time_df = _accepted_answer_response_time_df(
    lowest_popular_questions_with_unique_tag_df,
    lowest_popular_accepted_answers_with_unique_tag_df,
)

In [None]:
# Per-tag average of the accepted-answer response time in days, rounded to one
# decimal and ordered from the fastest tag to the slowest.
average_time_need_to_get_accepted_answer_for_most_popular_questions_df = (
    top_most_popular_questions_accepted_answer_response_time_df
    .groupBy('Tag')
    .agg(round(avg('AcceptedAnswerResponseTimeInDays'), 1).alias('AverageAcceptedAnswerResponseTimeInDays'))
    .orderBy(asc('AverageAcceptedAnswerResponseTimeInDays'))
)

average_time_need_to_get_accepted_answer_for_lowest_popular_questions_df = (
    top_lowest_popular_questions_accepted_answer_response_time_df
    .groupBy('Tag')
    .agg(round(avg('AcceptedAnswerResponseTimeInDays'), 1).alias('AverageAcceptedAnswerResponseTimeInDays'))
    .orderBy(asc('AverageAcceptedAnswerResponseTimeInDays'))
)

## Average response time from question creation to accepted answer - PLOT

In [None]:
# Horizontal bar chart: average days until the accepted answer, one bar per tag
# (most popular tags). The figure is also saved as a PNG.
average_time_need_to_get_accepted_answer_for_most_popular_questions_pandas_df = \
    average_time_need_to_get_accepted_answer_for_most_popular_questions_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    average_time_need_to_get_accepted_answer_for_most_popular_questions_pandas_df['Tag'],
    average_time_need_to_get_accepted_answer_for_most_popular_questions_pandas_df['AverageAcceptedAnswerResponseTimeInDays'],
)
ax.ticklabel_format(axis='x', style='plain')  # plain numbers, no scientific notation

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość dni', labelpad= 20.0)
ax.set_title('Średni czas od pojawienia się pytania do \n pojawienia się zaakceptowanej odpowiedzi w najpopularniejszych \n technologiach według danych z portalu stackoverflow.com')
plt.savefig('average_time_need_to_get_accepted_answer_for_most_popular_questions.png', facecolor='white')
plt.show()

In [None]:
# Horizontal bar chart: average days until the accepted answer, one bar per tag
# (lowest popular tags). The figure is also saved as a PNG.
average_time_need_to_get_accepted_answer_for_lowest_popular_questions_pandas_df = \
    average_time_need_to_get_accepted_answer_for_lowest_popular_questions_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    average_time_need_to_get_accepted_answer_for_lowest_popular_questions_pandas_df['Tag'],
    average_time_need_to_get_accepted_answer_for_lowest_popular_questions_pandas_df['AverageAcceptedAnswerResponseTimeInDays'],
)
ax.ticklabel_format(axis='x', style='plain')  # plain numbers, no scientific notation

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość dni', labelpad= 20.0)
ax.set_title('Średni czas od pojawienia się pytania do \n pojawienia się zaakceptowanej odpowiedzi w najmniej popularnych \n technologiach według danych z portalu stackoverflow.com')
plt.savefig('average_time_need_to_get_accepted_answer_for_lowest_popular_questions.png', facecolor='white')
plt.show()

## Comparative analysis of technologies based on: number of users involved in questions and answers, user reputation based on answers, average time needed to get an answer, and average answer score, for the top 10 MOST popular technologies according to the StackOverflow Professionals Survey, based on StackOverflow questions


In [None]:
# Columns that identify one row per (Id, Tag) pair when de-duplicating the user datasets.
# NOTE(review): 'Id' is presumably the user id in the users_* datasets — confirm.
filter_based_duplicated_column = ['Id', 'Tag']

# Per-tag average accepted-answer response time in days (rounded to one decimal).
average_time_need_to_get_accepted_answer_for_most_popular_questions_df = top_most_popular_questions_accepted_answer_response_time_df \
    .groupBy('Tag') \
    .agg(round(avg('AcceptedAnswerResponseTimeInDays'), 1).alias('AverageAcceptedAnswerResponseTimeInDays'))\
    .sort(asc('AverageAcceptedAnswerResponseTimeInDays'))

# Per-tag average answer score (rounded to one decimal).
average_score_most_popular_answers_with_unique_tag_df = most_popular_answers_with_unique_tag_df \
    .groupBy('Tag') \
    .agg(round(avg('Score'), 1).alias('AverageScore'))\
    .sort(asc('AverageScore'))

# Per-tag number of distinct (Id, Tag) rows among users who asked or answered.
# dropDuplicates on the column subset replaces the earlier workaround of building a
# sorted [Id, Tag] array column, de-duplicating on it and dropping it again.
number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_df = users_questions_and_answers_with_most_popular_tags_df \
    .dropDuplicates(filter_based_duplicated_column) \
    .groupBy(col('Tag')) \
    .count() \
    .sort(asc('count'))

# Per-tag sum of Reputation over distinct (Id, Tag) rows among users who answered.
# `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_df = users_answers_with_most_popular_tags_df \
    .dropDuplicates(filter_based_duplicated_column) \
    .groupBy(col('Tag')) \
    .agg(sum(col('Reputation')).cast(LongType()).alias('SumOfUsersReputations')) \
    .sort(asc('SumOfUsersReputations'))

In [None]:
# Combine the four per-tag metrics (joined on Tag) into a single 'Rating' column.
# NOTE(review): Rating is the plain sum of the four metrics, so the value is
# dominated by whichever metric has the largest magnitude.
most_popular_rating_df = (
    average_time_need_to_get_accepted_answer_for_most_popular_questions_df.alias('at')
    .join(average_score_most_popular_answers_with_unique_tag_df.alias('as'), col('at.Tag') == col('as.Tag'))
    .join(number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_df.alias('un'), col('at.Tag') == col('un.Tag'))
    .join(sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_df.alias('ur'), col('at.Tag') == col('ur.Tag'))
    .select(
        col('at.Tag').alias('Tag'),
        col('at.AverageAcceptedAnswerResponseTimeInDays').alias('AverageAcceptedAnswerResponseTimeInDays'),
        col('as.AverageScore').alias('AverageScore'),
        col('un.count').alias('NumberOfUniqueUsers'),
        col('ur.SumOfUsersReputations').alias('SumOfUsersReputations'),
    )
    .withColumn(
        'Rating',
        round(
            col('AverageAcceptedAnswerResponseTimeInDays')
            + col('AverageScore')
            + col('NumberOfUniqueUsers')
            + col('SumOfUsersReputations')
        ).cast(LongType()),
    )
    .sort(asc('Rating'))
)

most_popular_rating_df.show()

## Comparative analysis of technologies based on: number of users involved in questions and answers, user reputation based on answers, average time needed to get an answer, and average answer score, for the top 10 MOST popular technologies according to the StackOverflow Professionals Survey, based on StackOverflow questions - PLOT


In [None]:
# Horizontal bar chart of the per-tag Rating (most popular tags); also saved as a PNG.
most_popular_rating_pandas_df = most_popular_rating_df.toPandas()

fig, ax = plt.subplots(figsize=(10,10), facecolor='white', dpi=100)
ax.barh(most_popular_rating_pandas_df['Tag'], most_popular_rating_pandas_df['Rating'])
ax.ticklabel_format(axis='x', style='plain')  # plain numbers, no scientific notation

plt.xticks(rotation = 45)
ax.set_ylabel('Technologie')
ax.set_xlabel('Suma punktów oceny', labelpad= 20.0)
ax.set_title('Ocena dojrzałości i siły społeczności najpopularniejszych \n technologii według danych z portalu stackoverflow.com')
plt.savefig('most_popular_rating.png', facecolor='white')
plt.show()

## Comparative analysis of technologies based on: number of users involved in questions and answers, user reputation based on answers, average time needed to get an answer, and average answer score, for the top 10 LEAST popular technologies according to the StackOverflow Professionals Survey, based on StackOverflow questions


In [None]:
# Per-tag average accepted-answer response time in days (rounded to one decimal).
average_time_need_to_get_accepted_answer_for_lowest_popular_questions_df = top_lowest_popular_questions_accepted_answer_response_time_df \
    .groupBy('Tag') \
    .agg(round(avg('AcceptedAnswerResponseTimeInDays'), 1).alias('AverageAcceptedAnswerResponseTimeInDays'))\
    .sort(asc('AverageAcceptedAnswerResponseTimeInDays'))

# Per-tag average answer score (rounded to one decimal).
average_score_lowest_popular_answers_with_unique_tag_df = lowest_popular_answers_with_unique_tag_df \
    .groupBy('Tag') \
    .agg(round(avg('Score'), 1).alias('AverageScore'))\
    .sort(asc('AverageScore'))

# Per-tag number of distinct (Id, Tag) rows among users who asked or answered.
# filter_based_duplicated_column (['Id', 'Tag']) is defined in the most-popular cell
# earlier in the notebook. dropDuplicates on the column subset replaces the earlier
# workaround of building a sorted [Id, Tag] array column, de-duplicating on it and
# dropping it again.
number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df = users_questions_and_answers_with_lowest_popular_tags_df \
    .dropDuplicates(filter_based_duplicated_column) \
    .groupBy(col('Tag')) \
    .count() \
    .sort(asc('count'))

# Per-tag sum of Reputation over distinct (Id, Tag) rows among users who answered.
# `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_df = users_answers_with_lowest_popular_tags_df \
    .dropDuplicates(filter_based_duplicated_column) \
    .groupBy(col('Tag')) \
    .agg(sum(col('Reputation')).cast(LongType()).alias('SumOfUsersReputations')) \
    .sort(asc('SumOfUsersReputations'))

In [None]:
# Combine the four per-tag metrics (joined on Tag) into a single 'Rating' column.
# NOTE(review): Rating is the plain sum of the four metrics, so the value is
# dominated by whichever metric has the largest magnitude.
lowest_popular_rating_df = (
    average_time_need_to_get_accepted_answer_for_lowest_popular_questions_df.alias('at')
    .join(average_score_lowest_popular_answers_with_unique_tag_df.alias('as'), col('at.Tag') == col('as.Tag'))
    .join(number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df.alias('un'), col('at.Tag') == col('un.Tag'))
    .join(sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_df.alias('ur'), col('at.Tag') == col('ur.Tag'))
    .select(
        col('at.Tag').alias('Tag'),
        col('at.AverageAcceptedAnswerResponseTimeInDays').alias('AverageAcceptedAnswerResponseTimeInDays'),
        col('as.AverageScore').alias('AverageScore'),
        col('un.count').alias('NumberOfUniqueUsers'),
        col('ur.SumOfUsersReputations').alias('SumOfUsersReputations'),
    )
    .withColumn(
        'Rating',
        round(
            col('AverageAcceptedAnswerResponseTimeInDays')
            + col('AverageScore')
            + col('NumberOfUniqueUsers')
            + col('SumOfUsersReputations')
        ).cast(LongType()),
    )
    .sort(asc('Rating'))
)

lowest_popular_rating_df.show()

## Comparative analysis of technologies based on: number of users involved in questions and answers, user reputation based on answers, average time needed to get an answer, and average answer score, for the top 10 LEAST popular technologies according to the StackOverflow Professionals Survey, based on StackOverflow questions - PLOT


In [None]:
# Horizontal bar chart of the per-tag Rating (lowest popular tags); also saved as a PNG.
lowest_popular_rating_pandas_df = lowest_popular_rating_df.toPandas()

fig, ax = plt.subplots(figsize=(10,10), facecolor='white', dpi=100)
ax.barh(lowest_popular_rating_pandas_df['Tag'], lowest_popular_rating_pandas_df['Rating'])
ax.ticklabel_format(axis='x', style='plain')  # plain numbers, no scientific notation

plt.xticks(rotation=45)
ax.set_ylabel('Technologie')
ax.set_xlabel('Suma punktów oceny', labelpad= 20.0)
ax.set_title('Ocena dojrzałości i siły społeczności najmniej popularnych \n technologii według danych z portalu stackoverflow.com')
plt.savefig('lowest_popular_rating.png', facecolor='white')
plt.show()