In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Base font settings applied to every figure drawn in this notebook.
font = dict(size=12)
matplotlib.rc('font', **font)

In [None]:
# Name of the Azure Data Lake Store account; it is interpolated into the
# adl://<name>.azuredatalakestore.net/ URIs used by the cells below.
datalake_name = 'cadlstorejdlf4tf3x3bwg'


In [None]:
%fs ls adl://cadlstorejdlf4tf3x3bwg.azuredatalakestore.net/


In [None]:
%python
configs = {
  "fs.adl.oauth2.access.token.provider.type": "CustomAccessTokenProvider",
  "fs.adl.oauth2.access.token.custom.provider": spark.conf.get("spark.databricks.passthrough.adls.tokenProviderClassName")
}
dbutils.fs.mount(
source = f"adl://{datalake_name}.azuredatalakestore.net/",
mount_point = "/mnt/datalake",
extra_configs = configs)

In [None]:
# All datasets sit at the root of the same Data Lake Store account.
datalake_root = f'adl://{datalake_name}.azuredatalakestore.net'

# Base tables.
tags_output_path = f'{datalake_root}/tags.parquet'
posts_output_path = f'{datalake_root}/posts.parquet'
users_output_path = f'{datalake_root}/users.parquet'
comments_output_path = f'{datalake_root}/comments.parquet'

# Questions / answers of the most popular technologies.
most_popular_questions_with_unique_tag_path = f'{datalake_root}/most_popular_questions_with_unique_tag.parquet'
most_popular_answers_with_unique_tag_path = f'{datalake_root}/most_popular_answers_with_unique_tag.parquet'

# Questions / answers of the least popular technologies.
lowest_popular_questions_with_unique_tag_path = f'{datalake_root}/lowest_popular_questions_with_unique_tag.parquet'
lowest_popular_answers_with_unique_tag_path = f'{datalake_root}/lowest_popular_answers_with_unique_tag.parquet'

In [None]:
# Obtain the SparkSession; getOrCreate() returns the already-running session
# when one exists instead of building a new one.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Load the base tables (same order as the path definitions).
tags_df, posts_df, users_df, comments_df = [
    spark.read.parquet(parquet_path)
    for parquet_path in (
        tags_output_path,
        posts_output_path,
        users_output_path,
        comments_output_path,
    )
]

# Load the question/answer extracts for the most popular technologies.
most_popular_questions_with_unique_tag_df, most_popular_answers_with_unique_tag_df = [
    spark.read.parquet(parquet_path)
    for parquet_path in (
        most_popular_questions_with_unique_tag_path,
        most_popular_answers_with_unique_tag_path,
    )
]

# Load the question/answer extracts for the least popular technologies.
lowest_popular_questions_with_unique_tag_df, lowest_popular_answers_with_unique_tag_df = [
    spark.read.parquet(parquet_path)
    for parquet_path in (
        lowest_popular_questions_with_unique_tag_path,
        lowest_popular_answers_with_unique_tag_path,
    )
]

## How many tags are listed in the Tags summary?

In [None]:
# Number of rows in the Tags table (presumably one row per tag — confirm against the dataset).
tags_df.count()

## How many questions are there according to the Tags summary (sum of per-tag counts)?

In [None]:
# Total of the per-tag 'Count' column. `sum` here is pyspark.sql.functions.sum
# (brought in by the star import), not the Python builtin.
tags_df.agg(sum('Count')).show()

## Number of posts through the years

In [None]:
# Tag every post with the calendar year of its creation date ...
posts_with_creation_year_df = posts_df.withColumn('CreationYear', year('CreationDate'))

# ... then count the posts per year, newest year first.
posts_through_the_years_df = (
    posts_with_creation_year_df
    .groupBy('CreationYear')
    .count()
    .orderBy(desc('CreationYear'), desc('count'))
)

posts_through_the_years_df.show()


## Number of posts through the years - PLOT

In [None]:
# Bring the per-year counts to the driver; the frame is grouped by year, so it is small.
posts_through_the_years_pandas_df = posts_through_the_years_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.bar(posts_through_the_years_pandas_df['CreationYear'], posts_through_the_years_pandas_df['count'])
plt.ticklabel_format(axis="y", style="plain")

# Y ticks: from the smallest to the largest yearly count, both rounded to the
# nearest 10,000, in roughly six steps. (The `most_popular_tags_*` names are
# kept as they were; the same names are reassigned by the views plot cell.)
most_popular_tags_pandas_min_value = posts_through_the_years_pandas_df['count'].min().round(decimals=-4)
most_popular_tags_pandas_max_value = posts_through_the_years_pandas_df['count'].max().round(decimals=-4)
most_popular_tags_pandas_max_step = (most_popular_tags_pandas_max_value/6).round(decimals=-4)

if most_popular_tags_pandas_max_step > 0:
    plt.yticks(np.arange(most_popular_tags_pandas_min_value, (most_popular_tags_pandas_max_value + most_popular_tags_pandas_max_step), step=most_popular_tags_pandas_max_step), rotation = 45)
else:
    # The rounded step collapses to 0 for small counts and np.arange cannot
    # work with a zero step, so fall back to matplotlib's automatic ticks.
    plt.yticks(rotation = 45)

plt.ylabel('Ilość postów')
plt.xlabel('Poszczególne lata', labelpad= 20.0)
plt.title('Ilość postów na przestrzeni czasu według danych z portalu stackoverflow.com')
plt.savefig('number_of_posts_through_the_years.png', facecolor='white')
plt.show()

## Top 10 users with the longest activity span, from registration to their last post or comment (excluding the Community bot)

In [None]:
# Drop the Community bot account (Id == -1). `users_df` carries no 'u' alias at
# this point, so the column must be referenced by its plain name; 'u.Id' cannot
# be resolved here.
users_without_bots_df = users_df.filter(col('Id') != -1)

In [None]:
# Pair every non-bot user with the posts they own.
users_with_posts_df = users_without_bots_df.alias('u').join(
    posts_df.alias('p'),
    on=col('u.Id') == col('p.OwnerUserId'),
)

# Keep the most recent post creation date per user.
# `max` is pyspark.sql.functions.max (star import), not the Python builtin.
users_last_posts_activity_df = (
    users_with_posts_df
    .select(col('u.Id'), col('u.CreationDate'), col('p.CreationDate'))
    .groupBy('u.Id')
    .agg(max(col('p.CreationDate')).alias('UserLastPostActivityDate'))
    .orderBy(desc('UserLastPostActivityDate'))
)

In [None]:
# Pair every non-bot user with the comments they wrote.
users_with_comments_df = users_without_bots_df.alias('u').join(
    comments_df.alias('c'),
    on=col('u.Id') == col('c.UserId'),
)

# Keep the most recent comment creation date per user.
# `max` is pyspark.sql.functions.max (star import), not the Python builtin.
users_last_comments_activity_df = (
    users_with_comments_df
    .select(col('u.Id'), col('u.CreationDate'), col('c.CreationDate'))
    .groupBy('u.Id')
    .agg(max(col('c.CreationDate')).alias('UserLastCommentActivityDate'))
    .orderBy(desc('UserLastCommentActivityDate'))
)

In [None]:
# Put each user's last post date and last comment date side by side.
# The join is an inner join (the default), so only users that appear in both
# the post and the comment aggregates are kept.
last_post_per_user_df = users_last_posts_activity_df.alias('up')
last_comment_per_user_df = users_last_comments_activity_df.alias('uc')

users_last_activity_df = (
    last_post_per_user_df
    .join(last_comment_per_user_df, on=col('up.Id') == col('uc.Id'))
    .select(
        col('up.Id').alias('UserId'),
        col('up.UserLastPostActivityDate'),
        col('uc.UserLastCommentActivityDate'),
    )
)

In [None]:
# For every user pick the later of "last post" and "last comment" as the last
# activity timestamp.
#
# The fallback branch must be a Column: `otherwise()` treats a plain string as
# a literal value, so passing the column name as a str made to_timestamp()
# return NULL whenever the comment was the more recent event.
users_last_activity_based_on_posts_and_comments_df = (
    users_last_activity_df
    .groupBy('UserId')
    # Pack both dates into one struct so they travel together through the aggregation.
    .agg(max(struct(col('UserLastPostActivityDate'), col('UserLastCommentActivityDate'))).alias('UserLastActivityDate'))
    .withColumn(
        'UserLastActivityDate',
        to_timestamp(
            when(
                col('UserLastActivityDate.UserLastPostActivityDate') > col('UserLastActivityDate.UserLastCommentActivityDate'),
                col('UserLastActivityDate.UserLastPostActivityDate'),
            ).otherwise(col('UserLastActivityDate.UserLastCommentActivityDate'))
        ),
    )
    .sort(desc('UserLastActivityDate'))
)

In [None]:
# Attach the last-activity timestamp to every non-bot user.
users_with_last_activity_df = users_without_bots_df.alias('u').join(
    users_last_activity_based_on_posts_and_comments_df.alias('ula'),
    on=col('u.Id') == col('ula.UserId'),
)

# Casting a timestamp to long yields epoch seconds, so the difference is the
# activity span in seconds.
activity_time_in_seconds = col('ula.UserLastActivityDate').cast('long') - col('u.CreationDate').cast('long')

# Express the span in several units and keep the ten longest-active users.
# A month is counted as 30 days and a year as 12 such months (360 days).
# `round` is pyspark.sql.functions.round (star import), not the Python builtin.
top_ten_users_with_longest_activity_df = (
    users_with_last_activity_df
    .withColumn('ActivityTimeInSeconds', activity_time_in_seconds)
    .withColumn("ActivityTimeInMinutes", round(col("ActivityTimeInSeconds") / 60))
    .withColumn("ActivityTimeInHours", round(col("ActivityTimeInSeconds") / 3600))
    .withColumn("ActivityTimeInDays", round(col("ActivityTimeInSeconds") / (24*3600)))
    .withColumn("ActivityTimeInMonths", round(col("ActivityTimeInSeconds") / (30*24*3600)))
    .withColumn("ActivityTimeInYears", round(col("ActivityTimeInSeconds") / (12*30*24*3600)))
    .orderBy(desc('ActivityTimeInDays'))
    .select(
        col('Id').cast(StringType()),
        'CreationDate',
        'UserLastActivityDate',
        'ActivityTimeInSeconds',
        'ActivityTimeInMinutes',
        'ActivityTimeInHours',
        'ActivityTimeInDays',
        'ActivityTimeInMonths',
        'ActivityTimeInYears',
    )
    .limit(10)
)

## User Activity from registration to the last post or comment (Without Community Bot) - PLOT

In [None]:
# Collect the ten rows to the driver, shortest span first so the longest bar
# ends up at the top of the horizontal bar chart.
top_ten_users_with_longest_activity_pandas_df = (
    top_ten_users_with_longest_activity_df
    .orderBy(asc('ActivityTimeInDays'))
    .toPandas()
)

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)

# Zoom the x axis onto the range actually covered by the ten users: start a
# few days (the "step") below the smallest span and end just past the largest.
top_ten_users_with_longest_activity_pandas_min_value = top_ten_users_with_longest_activity_pandas_df['ActivityTimeInDays'].min().round()
top_ten_users_with_longest_activity_pandas_max_value = top_ten_users_with_longest_activity_pandas_df['ActivityTimeInDays'].max().round()
top_ten_users_with_longest_activity_pandas_step = 8
ax.set_xlim(
    xmin=top_ten_users_with_longest_activity_pandas_min_value - top_ten_users_with_longest_activity_pandas_step,
    xmax=top_ten_users_with_longest_activity_pandas_max_value + 2,
)

ax.barh(
    top_ten_users_with_longest_activity_pandas_df['Id'],
    top_ten_users_with_longest_activity_pandas_df['ActivityTimeInDays'],
)
ax.ticklabel_format(axis="x", style="plain")

ax.set_ylabel('Identifikatory najdłużej aktywnych użytkowników', labelpad=20.0)
ax.set_xlabel('Czas w ilości dni od stworzenia użytkownika do ostatniego postu lub komentarza', labelpad=20.0)
ax.set_title('Najdłużej aktywni użytkownicy według danych z portalu stackoverflow.com')
fig.savefig('top_ten_users_with_longest_activity.png', facecolor='white')
plt.show()

## Comparison of the highest question score and the number of answers in the most and least popular technologies

In [None]:
def _highest_score_per_tag(questions_df):
    """Return one row per Tag with the highest question Score for that tag.

    The result is sorted by ascending 'HighestScore' and the angle brackets
    are stripped from the tag name (e.g. '<java>' -> 'java').
    `max` is pyspark.sql.functions.max (star import), not the Python builtin.
    """
    return (
        questions_df
        .groupBy('Tag')
        .agg(max(col('Score')).alias('HighestScore'))
        .sort(asc('HighestScore'))
        .withColumn('Tag', translate(col('Tag'), "<>", ""))
    )


def _number_of_answers_per_tag(questions_df):
    """Return one row per Tag with the total number of answers to its questions.

    'AnswerCount' is summed: counting the rows would give the number of
    questions, not the number of answers that the 'NumberOfAnswers' column
    and the plot label promise. The sum is cast to long, as is done for the
    view totals. The result is sorted by ascending 'NumberOfAnswers' and the
    angle brackets are stripped from the tag name.
    `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
    """
    return (
        questions_df
        .groupBy('Tag')
        .agg(sum(col('AnswerCount')).cast(LongType()).alias('NumberOfAnswers'))
        .sort(asc('NumberOfAnswers'))
        .withColumn('Tag', translate(col('Tag'), "<>", ""))
    )


highest_score_in_most_popular_questions_with_unique_tag_df = _highest_score_per_tag(most_popular_questions_with_unique_tag_df)

highest_score_in_lowest_popular_questions_with_unique_tag_df = _highest_score_per_tag(lowest_popular_questions_with_unique_tag_df)

number_of_answers_in_most_popular_questions_with_unique_tag_df = _number_of_answers_per_tag(most_popular_questions_with_unique_tag_df)

number_of_answers_in_lowest_popular_questions_with_unique_tag_df = _number_of_answers_per_tag(lowest_popular_questions_with_unique_tag_df)

## Comparison of the highest question score and the number of answers in the most and least popular technologies - PLOT

In [None]:
def _draw_tag_bars(axis, frame, value_column, tick_rounding, bar_color, plain_tick_labels=False):
    """Draw one horizontal bar per tag on `axis` and set rounded x ticks.

    Parameters:
        axis: matplotlib Axes to draw on.
        frame: pandas DataFrame with a 'Tag' column and `value_column`.
        value_column: name of the numeric column to plot.
        tick_rounding: `decimals` passed to round() for the tick range
            (negative values round to tens, thousands, ...).
        bar_color: matplotlib colour of the bars.
        plain_tick_labels: when True, disable scientific notation on the x axis.
    """
    values = frame[value_column]
    axis.barh(frame['Tag'], values, color=bar_color)

    lowest_tick = values.min().round(decimals=tick_rounding)
    highest_tick = values.max().round(decimals=tick_rounding)
    tick_step = (highest_tick / 5).round(decimals=tick_rounding)
    # A step that rounds to 0 (or NaN for an empty frame) cannot drive
    # np.arange; keep matplotlib's automatic ticks in that case.
    if tick_step > 0:
        axis.set_xticks(np.arange(lowest_tick, highest_tick + tick_step, step=tick_step))

    if plain_tick_labels:
        axis.ticklabel_format(axis="x", style="plain")
    axis.tick_params(axis='x', labelrotation=45)


# Collect the four small per-tag aggregates to the driver.
highest_score_in_most_popular_questions_with_unique_tag_pandas_df = highest_score_in_most_popular_questions_with_unique_tag_df.toPandas()

highest_score_in_lowest_popular_questions_with_unique_tag_pandas_df = highest_score_in_lowest_popular_questions_with_unique_tag_df.toPandas()

number_of_answers_in_most_popular_questions_with_unique_tag_pandas_df = number_of_answers_in_most_popular_questions_with_unique_tag_df.toPandas()

number_of_answers_in_lowest_popular_questions_with_unique_tag_pandas_df = number_of_answers_in_lowest_popular_questions_with_unique_tag_df.toPandas()

# Layout: rows = most popular (blue) / least popular (red) technologies,
# columns = highest score / number of answers.
fig, ax = plt.subplots(2, 2, figsize=(10, 10), facecolor='white', dpi=100)

# Top-left: highest score, most popular technologies.
_draw_tag_bars(ax[0, 0], highest_score_in_most_popular_questions_with_unique_tag_pandas_df, 'HighestScore', tick_rounding=-3, bar_color='blue')
ax[0, 0].set_ylabel('Najpopularniejsze technologie', labelpad= 20.0)

# Top-right: number of answers, most popular technologies.
_draw_tag_bars(ax[0, 1], number_of_answers_in_most_popular_questions_with_unique_tag_pandas_df, 'NumberOfAnswers', tick_rounding=-5, bar_color='blue', plain_tick_labels=True)

# Bottom-left: highest score, least popular technologies.
_draw_tag_bars(ax[1, 0], highest_score_in_lowest_popular_questions_with_unique_tag_pandas_df, 'HighestScore', tick_rounding=-1, bar_color='red')
ax[1, 0].set_ylabel('Najmniej popularne technologie', labelpad= 20.0)
ax[1, 0].set_xlabel('Największa ilość punktów oceny', labelpad= 20.0)

# Bottom-right: number of answers, least popular technologies.
_draw_tag_bars(ax[1, 1], number_of_answers_in_lowest_popular_questions_with_unique_tag_pandas_df, 'NumberOfAnswers', tick_rounding=-3, bar_color='red')
ax[1, 1].set_xlabel('Ilość odpowiedzi', labelpad= 20.0)

fig.suptitle('Porównanie najbardziej i najmniej popularnych technologii na podstawie \n najwyższej oceny i ilości odpowiedzi w pytaniach według danych z portalu stackoverflow.com')
# Compute the layout only after everything is drawn, so that tick labels and
# axis labels are taken into account.
fig.tight_layout(pad=3.7)
plt.savefig('comparision_most_and_lowest_popular_questions_based_on_score_and_answers.png', facecolor='white')
plt.show()

## Number of views in the most and the least popular technologies

In [None]:
# Total question views per tag, smallest total first.
# `sum` is pyspark.sql.functions.sum (star import), not the Python builtin.
sum_of_views_most_popular_questions_with_unique_tag_df = (
    most_popular_questions_with_unique_tag_df
    .groupBy('Tag')
    .agg(sum('ViewCount').cast(LongType()).alias('SumOfViews'))
    .orderBy(asc('SumOfViews'))
)

# Same aggregation for the least popular technologies.
sum_of_views_lowest_popular_questions_with_unique_tag_df = (
    lowest_popular_questions_with_unique_tag_df
    .groupBy('Tag')
    .agg(sum('ViewCount').cast(LongType()).alias('SumOfViews'))
    .orderBy(asc('SumOfViews'))
)

## Number of views in the most and the least popular technologies - PLOT

In [None]:
# Collect the per-tag view totals of the most popular technologies to the driver.
sum_of_views_most_popular_questions_with_unique_tag_pandas_df = sum_of_views_most_popular_questions_with_unique_tag_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    sum_of_views_most_popular_questions_with_unique_tag_pandas_df['Tag'],
    sum_of_views_most_popular_questions_with_unique_tag_pandas_df['SumOfViews'],
)
ax.ticklabel_format(axis="x", style="plain")

# X ticks: from the smallest to the largest total, both rounded to the
# nearest million, in roughly five steps.
most_popular_tags_pandas_min_value = sum_of_views_most_popular_questions_with_unique_tag_pandas_df['SumOfViews'].min().round(decimals=-6)
most_popular_tags_pandas_max_value = sum_of_views_most_popular_questions_with_unique_tag_pandas_df['SumOfViews'].max().round(decimals=-6)
most_popular_tags_pandas_max_step = (most_popular_tags_pandas_max_value/5).round(decimals=-6)
sum_of_views_tick_positions = np.arange(
    most_popular_tags_pandas_min_value,
    (most_popular_tags_pandas_max_value + most_popular_tags_pandas_max_step),
    step=most_popular_tags_pandas_max_step,
)
plt.xticks(sum_of_views_tick_positions, rotation=45)

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość wygenerowanych wyświetleń', labelpad=20.0)
ax.set_title('Ilość wygenerowanych wyświetleń w najpopularniejszych technologiach \n według danych z portalu stackoverflow.com')
fig.savefig('sum_of_views_most_popular_questions_with_unique_tag.png', facecolor='white')
plt.show()

In [None]:
# Collect the per-tag view totals of the least popular technologies to the driver.
sum_of_views_lowest_popular_questions_with_unique_tag_pandas_df = sum_of_views_lowest_popular_questions_with_unique_tag_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    sum_of_views_lowest_popular_questions_with_unique_tag_pandas_df['Tag'],
    sum_of_views_lowest_popular_questions_with_unique_tag_pandas_df['SumOfViews'],
)
ax.ticklabel_format(axis="x", style="plain")

# Tick positions are left to matplotlib here; only the labels are rotated.
plt.xticks(rotation=45)

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość wygenerowanych wyświetleń', labelpad=20.0)
ax.set_title('Ilość wygenerowanych wyświetleń w najmniej popularnych technologiach \n według danych z portalu stackoverflow.com ')
fig.savefig('sum_of_views_lowest_popular_questions_with_unique_tag.png', facecolor='white')
plt.show()

## Number of posts from the most and the least popular technologies through the years

In [None]:
# Stack questions and answers of the most popular technologies into one frame.
# NOTE(review): union() pairs columns by position, so this assumes both frames
# share the same column order — confirm against the parquet schemas.
most_popular_posts_df = most_popular_questions_with_unique_tag_df.union(most_popular_answers_with_unique_tag_df)

# Count the posts per tag and creation year, newest year first.
most_popular_posts_through_the_years_df = (
    most_popular_posts_df
    .withColumn('CreationYear', year('CreationDate'))
    .groupBy('Tag', 'CreationYear')
    .count()
    .orderBy(desc('CreationYear'), desc('count'))
)

# Same pipeline for the least popular technologies.
lowest_popular_posts_df = lowest_popular_questions_with_unique_tag_df.union(lowest_popular_answers_with_unique_tag_df)

lowest_popular_posts_through_the_years_df = (
    lowest_popular_posts_df
    .withColumn('CreationYear', year('CreationDate'))
    .groupBy('Tag', 'CreationYear')
    .count()
    .orderBy(desc('CreationYear'), desc('count'))
)

## Number of posts from the most and the least popular technologies through the years - PLOT

In [None]:
# Tags to plot, in legend order, and the line colour used for each of them
# (the two lists are paired by position).
top_10_the_most_popular_technologies_based_on_the_survey_posts = ['<javascript>', '<html>', '<sql>', '<python>', '<typescript>', '<java>', '<c#>', '<bash>', '<php>', '<c++>']
most_popular_technologies_line_colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'gray', 'brown']

most_popular_posts_through_the_years_pandas_df = most_popular_posts_through_the_years_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.ticklabel_format(axis="y", style="plain")

# One line per technology: yearly post counts of that tag.
for tag, line_color in zip(top_10_the_most_popular_technologies_based_on_the_survey_posts, most_popular_technologies_line_colors):
    posts_for_tag = most_popular_posts_through_the_years_pandas_df[most_popular_posts_through_the_years_pandas_df['Tag'] == tag]
    # The legend shows the tag without its angle brackets ('<java>' -> 'java').
    ax.plot(posts_for_tag['CreationYear'], posts_for_tag['count'], label=tag.strip('<>'), color=line_color, marker="v")

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość postów')
plt.title('Ilość postów w czasie dla każdej z najpopularniejszych technologii \n w ciągu ostatnich lat według danych z portalu stackoverflow.com')
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('most_popular_posts_through_the_years.png', facecolor='white')
plt.show()


In [None]:
# Tags to plot, in legend order, and the line colour used for each of them
# (the two lists are paired by position).
top_10_the_lowest_popular_technologies_based_on_the_survey_posts = ['<crystal-lang>', '<apl>', '<sas>', '<ocaml>', '<cobol>', '<fortran>', '<erlang>', '<julia>', '<f#>', '<lisp>']
lowest_popular_technologies_line_colors = ['mediumblue', 'darkgreen', 'tomato', 'darkcyan', 'pink', 'gold', 'dimgray', 'darkorange', 'lightgray', 'firebrick']

lowest_popular_posts_through_the_years_pandas_df = lowest_popular_posts_through_the_years_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.ticklabel_format(axis="y", style="plain")

# One line per technology: yearly post counts of that tag.
for tag, line_color in zip(top_10_the_lowest_popular_technologies_based_on_the_survey_posts, lowest_popular_technologies_line_colors):
    posts_for_tag = lowest_popular_posts_through_the_years_pandas_df[lowest_popular_posts_through_the_years_pandas_df['Tag'] == tag]
    # The legend shows the tag without its angle brackets ('<lisp>' -> 'lisp').
    ax.plot(posts_for_tag['CreationYear'], posts_for_tag['count'], label=tag.strip('<>'), color=line_color, marker="v")

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość postów')
plt.title('Ilość postów w czasie dla każdej z najmniej popularnych technologii \n w ciągu ostatnich lat według danych z portalu stackoverflow.com')
# Place the legend outside the axes, to the right of the plot.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('lowest_popular_posts_through_the_years.png', facecolor='white')
plt.show()

## Percentage of users who have never posted anything

In [None]:
# Total number of user accounts.
number_of_all_users = users_df.count()

# Users that own no post at all. A left anti join keeps exactly the users for
# which no post satisfies the join condition, so the full user-post join does
# not have to be built only to be filtered down to the unmatched rows.
number_of_users_that_never_post = users_df.alias('u') \
    .join(posts_df.alias('p'), col('u.Id') == col('p.OwnerUserId'), how='left_anti') \
    .count()

# Everyone else owns at least one post.
number_of_users_that_post = number_of_all_users - number_of_users_that_never_post

## Percentage of users who have never posted anything - PLOT

In [None]:
# Legend entries and slice sizes, in matching order: users with at least one
# post first, users without any post second.
labels = (
    'Użytkownicy którzy \n chociaż raz zapostowali',
    'Użytkownicy którzy \n nigdy nic nie zapostowali',
)
sizes = [number_of_users_that_post, number_of_users_that_never_post]
# Pull the "never posted" slice slightly out of the pie to highlight it.
explode = (0, 0.1)

fig1, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.pie(sizes, explode=explode, autopct='%1.1f%%')
# Equal aspect ratio so the pie is drawn as a circle.
ax.axis('equal')

ax.legend(labels, loc="center left", bbox_to_anchor=(1, 0, 0.5, 1))
ax.set_title('Procentowa ilość użytkowników którzy \n nigdy nic nie zapostowali według danych z portalu stackoverflow.com')

fig1.savefig('comparision_of_number_of_users_that_never_post.png', facecolor='white')
plt.show()