Before you start using this notebook, change **datalake_name** in both the Python variable and the `%fs` file system command.

In [None]:
from pyspark.sql.types import *
from pyspark.sql import SparkSession
from pyspark.sql.functions import *
import numpy as np
import matplotlib
import matplotlib.pyplot as plt

In [None]:
# Default font settings applied to every matplotlib figure in this notebook.
font = dict(size=12)
matplotlib.rc('font', **font)

In [None]:
# Account name of the data lake; it is interpolated into every
# adl://<name>.azuredatalakestore.net/ URL built in the cells below.
datalake_name = 'cadlstorec6o67ihsthjni'

In [None]:
%fs ls adl://cadlstorec6o67ihsthjni.azuredatalakestore.net/

In [None]:
%python
configs = {
  'fs.adl.oauth2.access.token.provider.type': 'CustomAccessTokenProvider',
  'fs.adl.oauth2.access.token.custom.provider': spark.conf.get('spark.databricks.passthrough.adls.tokenProviderClassName')
}
dbutils.fs.mount(
source = f'adl://{datalake_name}.azuredatalakestore.net/',
mount_point = '/mnt/datalake',
extra_configs = configs)

In [None]:
# All datasets sit at the root of the same data lake account.
datalake_root = f'adl://{datalake_name}.azuredatalakestore.net'

# Base datasets.
tags_parquet_path = f'{datalake_root}/tags.parquet'
posts_parquet_path = f'{datalake_root}/posts.parquet'
users_parquet_path = f'{datalake_root}/users.parquet'

# Derived datasets for the most popular technologies.
most_popular_questions_with_unique_tag_parquet_path = f'{datalake_root}/most_popular_questions_with_unique_tag.parquet'
most_popular_answers_with_unique_tag_parquet_path = f'{datalake_root}/most_popular_answers_with_unique_tag.parquet'
most_popular_accepted_answers_with_unique_tag_parquet_path = f'{datalake_root}/most_popular_accepted_answers_with_unique_tag.parquet'
users_questions_with_most_popular_tags_parquet_path = f'{datalake_root}/users_questions_with_most_popular_tags.parquet'
users_answers_with_most_popular_tags_parquet_path = f'{datalake_root}/users_answers_with_most_popular_tags.parquet'
users_accepted_answers_with_most_popular_tags_parquet_path = f'{datalake_root}/users_accepted_answers_with_most_popular_tags.parquet'
users_questions_and_answers_with_most_popular_tags_parquet_path = f'{datalake_root}/users_questions_and_answers_with_most_popular_tags.parquet'

# Derived datasets for the least popular technologies.
lowest_popular_questions_with_unique_tag_parquet_path = f'{datalake_root}/lowest_popular_questions_with_unique_tag.parquet'
lowest_popular_answers_with_unique_tag_parquet_path = f'{datalake_root}/lowest_popular_answers_with_unique_tag.parquet'
lowest_popular_accepted_answers_with_unique_tag_parquet_path = f'{datalake_root}/lowest_popular_accepted_answers_with_unique_tag.parquet'
users_questions_with_lowest_popular_tags_parquet_path = f'{datalake_root}/users_questions_with_lowest_popular_tags.parquet'
users_answers_with_lowest_popular_tags_parquet_path = f'{datalake_root}/users_answers_with_lowest_popular_tags.parquet'
users_accepted_answers_with_lowest_popular_tags_parquet_path = f'{datalake_root}/users_accepted_answers_with_lowest_popular_tags.parquet'
users_questions_and_answers_with_lowest_popular_tags_parquet_path = f'{datalake_root}/users_questions_and_answers_with_lowest_popular_tags.parquet'

In [None]:
# Load the base datasets.
tags_df = spark.read.parquet(tags_parquet_path)
posts_df = spark.read.parquet(posts_parquet_path)

# PostTypeId 1 rows are used as questions and PostTypeId 2 rows as answers;
# posts that carry a ClosedDate are excluded from both.
questions_df = posts_df.filter((col('PostTypeId') == 1) & (col('ClosedDate').isNull()))
answers_df = posts_df.filter((col('PostTypeId') == 2) & (col('ClosedDate').isNull()))

users_df = spark.read.parquet(users_parquet_path)


# Derived datasets for the most popular technologies.
# The three "*_with_unique_tag" reads previously referenced "*_path" names that
# are never defined (the variables are "*_parquet_path"), raising NameError.
most_popular_questions_with_unique_tag_df = spark.read.parquet(most_popular_questions_with_unique_tag_parquet_path)
most_popular_answers_with_unique_tag_df = spark.read.parquet(most_popular_answers_with_unique_tag_parquet_path)
most_popular_accepted_answers_with_unique_tag_df = spark.read.parquet(most_popular_accepted_answers_with_unique_tag_parquet_path)
users_questions_with_most_popular_tags_df = spark.read.parquet(users_questions_with_most_popular_tags_parquet_path)
users_answers_with_most_popular_tags_df = spark.read.parquet(users_answers_with_most_popular_tags_parquet_path)
users_accepted_answers_with_most_popular_tags_df = spark.read.parquet(users_accepted_answers_with_most_popular_tags_parquet_path)
users_questions_and_answers_with_most_popular_tags_df = spark.read.parquet(users_questions_and_answers_with_most_popular_tags_parquet_path)


# Derived datasets for the least popular technologies (same NameError fix).
lowest_popular_questions_with_unique_tag_df = spark.read.parquet(lowest_popular_questions_with_unique_tag_parquet_path)
lowest_popular_answers_with_unique_tag_df = spark.read.parquet(lowest_popular_answers_with_unique_tag_parquet_path)
lowest_popular_accepted_answers_with_unique_tag_df = spark.read.parquet(lowest_popular_accepted_answers_with_unique_tag_parquet_path)
users_questions_with_lowest_popular_tags_df = spark.read.parquet(users_questions_with_lowest_popular_tags_parquet_path)
users_answers_with_lowest_popular_tags_df = spark.read.parquet(users_answers_with_lowest_popular_tags_parquet_path)
users_accepted_answers_with_lowest_popular_tags_df = spark.read.parquet(users_accepted_answers_with_lowest_popular_tags_parquet_path)
users_questions_and_answers_with_lowest_popular_tags_df = spark.read.parquet(users_questions_and_answers_with_lowest_popular_tags_parquet_path)

In [None]:
# Obtain the active SparkSession (getOrCreate creates one only if none exists).
# NOTE(review): earlier cells already use `spark` (mount config and parquet
# reads), so they rely on the runtime predefining it; on a plain kernel this
# cell would have to run before them — confirm the intended cell order.
spark = SparkSession.builder.getOrCreate()

In [None]:
# Print a preview of the tags summary dataset.
tags_df.show()

In [None]:
# Report the row count of each base dataset; every count() runs a Spark job.
for dataset_label, dataset_df in (
    ('Questions', questions_df),
    ('Answers', answers_df),
    ('Users', users_df),
):
    print(f'{dataset_label}: {dataset_df.count()}')

## What are the most popular Tags based on Tags Summary?

In [None]:
# Tags ordered from the highest to the lowest Count.
tags_df.sort(desc('Count')).show()

## What are the least popular Tags based on Tags Summary?

In [None]:
# Tags ordered from the lowest to the highest Count.
tags_df.sort(asc('Count')).show()

## What is the number of questions asked in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on Tags Summary?

In [None]:
# Tag names of the ten most and ten least popular technologies from the survey.
top_10_the_most_popular_technologies_based_on_the_survey_tags = [
    'javascript', 'html', 'sql', 'python', 'typescript',
    'java', 'c#', 'bash', 'php', 'c++',
]
top_10_the_lowest_popular_technologies_based_on_the_survey_tags = [
    'crystal-lang', 'apl', 'sas', 'ocaml', 'cobol',
    'fortran', 'erlang', 'julia', 'f#', 'lisp',
]

# Keep only the survey technologies, ordered by ascending Count.
most_popular_tags_df = (
    tags_df
    .where(col('TagName').isin(top_10_the_most_popular_technologies_based_on_the_survey_tags))
    .orderBy(col('Count').asc())
)

lowest_popular_tags_df = (
    tags_df
    .where(col('TagName').isin(top_10_the_lowest_popular_technologies_based_on_the_survey_tags))
    .orderBy(col('Count').asc())
)

most_popular_tags_df.show()
lowest_popular_tags_df.show()

## What is the number of questions asked in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on Tags Summary? - PLOT


In [None]:
# Horizontal bar chart of question counts for the most popular technologies.
most_popular_tags_pandas_df = most_popular_tags_df.toPandas()
most_popular_tag_counts = most_popular_tags_pandas_df['Count']

# X ticks: five equal steps up to the maximum, with both ends rounded to the
# nearest 100 000.
most_popular_tags_pandas_min_value = most_popular_tag_counts.min().round(decimals=-5)
most_popular_tags_pandas_max_value = most_popular_tag_counts.max().round(decimals=-5)
most_popular_tags_pandas_max_step = most_popular_tags_pandas_max_value/5
most_popular_tick_positions = np.arange(
    most_popular_tags_pandas_min_value,
    most_popular_tags_pandas_max_value + most_popular_tags_pandas_max_step,
    step=most_popular_tags_pandas_max_step,
)

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(most_popular_tags_pandas_df['TagName'], most_popular_tag_counts)
# Plain numbers instead of scientific notation on the count axis.
ax.ticklabel_format(axis='x', style='plain')
plt.xticks(most_popular_tick_positions, rotation = 45)

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość pytań', labelpad= 20.0)
ax.set_title('Najpopularniejsze technologie według danych z portalu stackoverflow.com')
fig.savefig('most_popular_tags.png', facecolor='white')
plt.show()

In [None]:
# Horizontal bar chart of question counts for the least popular technologies.
lowest_popular_tags_pandas_df = lowest_popular_tags_df.toPandas()
lowest_popular_tag_counts = lowest_popular_tags_pandas_df['Count']

# X ticks: five equal steps up to the maximum; the minimum is rounded to the
# nearest 100 and the maximum to the nearest 1 000.
lowest_popular_tags_pandas_min_value = lowest_popular_tag_counts.min().round(decimals=-2)
lowest_popular_tags_pandas_max_value = lowest_popular_tag_counts.max().round(decimals=-3)
lowest_popular_tags_pandas_max_step = lowest_popular_tags_pandas_max_value/5
lowest_popular_tick_positions = np.arange(
    lowest_popular_tags_pandas_min_value,
    lowest_popular_tags_pandas_max_value + lowest_popular_tags_pandas_max_step,
    step=lowest_popular_tags_pandas_max_step,
)

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(lowest_popular_tags_pandas_df['TagName'], lowest_popular_tag_counts)
# Plain numbers instead of scientific notation on the count axis.
ax.ticklabel_format(axis='x', style='plain')
plt.xticks(lowest_popular_tick_positions, rotation = 45)

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość pytań', labelpad= 20.0)
ax.set_title('Najmniej popularne technologie według danych z portalu stackoverflow.com')
fig.savefig('lowest_popular_tags.png', facecolor='white')
plt.show()

In [None]:
# For each derived dataset, print its title followed by the number of rows per
# Tag, largest group first.
titled_datasets = (
    ('MOST POPULAR QUESTIONS WITH UNIQUE TAG', most_popular_questions_with_unique_tag_df),
    ('MOST POPULAR ANSWERS WITH UNIQUE TAG', most_popular_answers_with_unique_tag_df),
    ('MOST POPULAR ACCEPTED ANSWERS WITH UNIQUE TAG', most_popular_accepted_answers_with_unique_tag_df),
    ('USERS QUESTIONS WITH MOST POPULAR TAGS', users_questions_with_most_popular_tags_df),
    ('USERS ANSWERS WITH MOST POPULAR TAGS', users_answers_with_most_popular_tags_df),
    ('USERS ACCEPTED ANSWERS WITH MOST POPULAR TAGS', users_accepted_answers_with_most_popular_tags_df),
    ('USERS QUESTIONS AND ANSWERS WITH MOST POPULAR TAGS', users_questions_and_answers_with_most_popular_tags_df),
    ('LOWEST POPULAR QUESTIONS WITH UNIQUE TAG', lowest_popular_questions_with_unique_tag_df),
    ('LOWEST POPULAR ANSWERS WITH UNIQUE TAG', lowest_popular_answers_with_unique_tag_df),
    ('LOWEST POPULAR ACCEPTED ANSWERS WITH UNIQUE TAG', lowest_popular_accepted_answers_with_unique_tag_df),
    ('USERS QUESTIONS WITH LOWEST POPULAR TAGS', users_questions_with_lowest_popular_tags_df),
    ('USERS ANSWERS WITH LOWEST POPULAR TAGS', users_answers_with_lowest_popular_tags_df),
    ('USERS ACCEPTED ANSWERS WITH LOWEST POPULAR TAGS', users_accepted_answers_with_lowest_popular_tags_df),
    ('USERS QUESTIONS AND ANSWERS WITH LOWEST POPULAR TAGS', users_questions_and_answers_with_lowest_popular_tags_df),
)

for dataset_title, tagged_df in titled_datasets:
    print(dataset_title)
    tagged_df.groupBy('Tag').count().orderBy(col('count').desc()).show()

## What is the number of questions asked through the years in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
def count_questions_per_tag_and_year(questions_with_tag_df):
    """Count rows per (Tag, CreationYear), newest year and largest count first.

    CreationYear is derived from the year part of the CreationDate column.
    """
    return (
        questions_with_tag_df
        .withColumn('CreationYear', year(col('CreationDate')))
        .groupBy('Tag', 'CreationYear')
        .count()
        .orderBy(col('CreationYear').desc(), col('count').desc())
    )


most_popular_questions_through_the_years_df = count_questions_per_tag_and_year(most_popular_questions_with_unique_tag_df)
most_popular_questions_through_the_years_df.show()

lowest_popular_questions_through_the_years_df = count_questions_per_tag_and_year(lowest_popular_questions_with_unique_tag_df)
lowest_popular_questions_through_the_years_df.show()

## What is the number of questions asked through the years in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions? - PLOT

In [None]:
# Line chart: questions per year, one line per most-popular technology.
top_10_the_most_popular_technologies_based_on_the_survey_posts = ['<javascript>', '<html>', '<sql>', '<python>', '<typescript>', '<java>', '<c#>', '<bash>', '<php>', '<c++>']

# Line colour for each tag, as it is stored in the Tag column ('<name>').
most_popular_questions_line_colors = {
    '<javascript>': 'blue',
    '<html>': 'green',
    '<sql>': 'red',
    '<python>': 'cyan',
    '<typescript>': 'magenta',
    '<java>': 'yellow',
    '<c#>': 'black',
    '<bash>': 'orange',
    '<php>': 'gray',
    '<c++>': 'brown',
}

most_popular_questions_pandas_df = most_popular_questions_through_the_years_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.ticklabel_format(axis='y', style='plain')

# One loop replaces ten copy-pasted ax.plot calls; it follows the order of the
# tag list, so lines and legend entries are drawn in the same order as before.
for tag in top_10_the_most_popular_technologies_based_on_the_survey_posts:
    tag_rows = most_popular_questions_pandas_df[most_popular_questions_pandas_df['Tag'] == tag]
    ax.plot(
        tag_rows['CreationYear'],
        tag_rows['count'],
        label=tag.strip('<>'),  # legend shows the bare technology name
        color=most_popular_questions_line_colors[tag],
        marker='v',
    )

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość pytań')
plt.title('Ilość pytań w najpopularniejszych technologiach \n w ciągu ostatnich lat według danych z portalu stackoverflow.com ')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('most_popular_questions_through_the_years.png', facecolor='white')
plt.show()

In [None]:
# Line chart: questions per year, one line per least-popular technology.
top_10_the_lowest_popular_technologies_based_on_the_survey_posts = ['<crystal-lang>', '<apl>', '<sas>', '<ocaml>', '<cobol>', '<fortran>', '<erlang>', '<julia>', '<f#>', '<lisp>']

# Line colour for each tag, as it is stored in the Tag column ('<name>').
lowest_popular_questions_line_colors = {
    '<crystal-lang>': 'mediumblue',
    '<apl>': 'darkgreen',
    '<sas>': 'tomato',
    '<ocaml>': 'darkcyan',
    '<cobol>': 'pink',
    '<fortran>': 'gold',
    '<erlang>': 'dimgray',
    '<julia>': 'darkorange',
    '<f#>': 'lightgray',
    '<lisp>': 'firebrick',
}

lowest_popular_questions_pandas_df = lowest_popular_questions_through_the_years_df.toPandas()


fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.ticklabel_format(axis='y', style='plain')

# One loop replaces ten copy-pasted ax.plot calls; it follows the order of the
# tag list, so lines and legend entries are drawn in the same order as before.
for tag in top_10_the_lowest_popular_technologies_based_on_the_survey_posts:
    tag_rows = lowest_popular_questions_pandas_df[lowest_popular_questions_pandas_df['Tag'] == tag]
    ax.plot(
        tag_rows['CreationYear'],
        tag_rows['count'],
        label=tag.strip('<>'),  # legend shows the bare technology name
        color=lowest_popular_questions_line_colors[tag],
        marker='v',
    )

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość pytań')
plt.title('Ilość pytań w najmniej popularnych technologiach \n w ciągu ostatnich lat według danych z portalu stackoverflow.com ')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('lowest_popular_questions_through_the_years.png', facecolor='white')
plt.show()

## What is the correlation between the number of Accepted answers in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Per Tag and creation year, count the questions that have at least one answer
# (AnswerCount > 0); newest year and largest count first.
most_popular_questions_with_answers_through_the_years_df = (
    most_popular_questions_with_unique_tag_df
    .where(col('AnswerCount') > 0)
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy('Tag', 'CreationYear')
    .count()
    .orderBy(col('CreationYear').desc(), col('count').desc())
)
most_popular_questions_with_answers_through_the_years_df.show()

# Same aggregation for the least popular technologies.
lowest_popular_questions_with_answers_through_the_years_df = (
    lowest_popular_questions_with_unique_tag_df
    .where(col('AnswerCount') > 0)
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy('Tag', 'CreationYear')
    .count()
    .orderBy(col('CreationYear').desc(), col('count').desc())
)
lowest_popular_questions_with_answers_through_the_years_df.show()

## What is the correlation between the number of Accepted answers in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions? - PLOTS

In [None]:
# Line chart: answered questions per year, one line per most-popular technology.
top_10_the_most_popular_technologies_based_on_the_survey_posts = ['<javascript>', '<html>', '<sql>', '<python>', '<typescript>', '<java>', '<c#>', '<bash>', '<php>', '<c++>']

# Line colour for each tag, as it is stored in the Tag column ('<name>').
most_popular_answered_line_colors = {
    '<javascript>': 'blue',
    '<html>': 'green',
    '<sql>': 'red',
    '<python>': 'cyan',
    '<typescript>': 'magenta',
    '<java>': 'yellow',
    '<c#>': 'black',
    '<bash>': 'orange',
    '<php>': 'gray',
    '<c++>': 'brown',
}

most_popular_questions_with_answers_pandas_df = most_popular_questions_with_answers_through_the_years_df.toPandas()


fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.ticklabel_format(axis='y', style='plain')

# One loop replaces ten copy-pasted ax.plot calls; it follows the order of the
# tag list, so lines and legend entries are drawn in the same order as before.
for tag in top_10_the_most_popular_technologies_based_on_the_survey_posts:
    tag_rows = most_popular_questions_with_answers_pandas_df[most_popular_questions_with_answers_pandas_df['Tag'] == tag]
    ax.plot(
        tag_rows['CreationYear'],
        tag_rows['count'],
        label=tag.strip('<>'),  # legend shows the bare technology name
        color=most_popular_answered_line_colors[tag],
        marker='v',
    )

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość pytań z odpowiedziami')
plt.title('Ilość pytań z udzielonymi odpowiedziami w najpopularniejszych \n technologiach w ciągu ostatnich lat według danych z portalu stackoverflow.com')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('most_popular_questions_with_answers_through_the_years.png', facecolor='white')
plt.show()


In [None]:
# Line chart: answered questions per year, one line per least-popular technology.
top_10_the_lowest_popular_technologies_based_on_the_survey_posts = ['<crystal-lang>', '<apl>', '<sas>', '<ocaml>', '<cobol>', '<fortran>', '<erlang>', '<julia>', '<f#>', '<lisp>']

# Line colour for each tag, as it is stored in the Tag column ('<name>').
lowest_popular_answered_line_colors = {
    '<crystal-lang>': 'mediumblue',
    '<apl>': 'darkgreen',
    '<sas>': 'tomato',
    '<ocaml>': 'darkcyan',
    '<cobol>': 'pink',
    '<fortran>': 'gold',
    '<erlang>': 'dimgray',
    '<julia>': 'darkorange',
    '<f#>': 'lightgray',
    '<lisp>': 'firebrick',
}

lowest_popular_questions_with_answers_pandas_df = lowest_popular_questions_with_answers_through_the_years_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
plt.ticklabel_format(axis='y', style='plain')

# One loop replaces ten copy-pasted ax.plot calls; it follows the order of the
# tag list, so lines and legend entries are drawn in the same order as before.
for tag in top_10_the_lowest_popular_technologies_based_on_the_survey_posts:
    tag_rows = lowest_popular_questions_with_answers_pandas_df[lowest_popular_questions_with_answers_pandas_df['Tag'] == tag]
    ax.plot(
        tag_rows['CreationYear'],
        tag_rows['count'],
        label=tag.strip('<>'),  # legend shows the bare technology name
        color=lowest_popular_answered_line_colors[tag],
        marker='v',
    )

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość pytań z odpowiedziami')
plt.title('Ilość pytań z udzielonymi odpowiedziami w najmniej popularnych \n technologiach w ciągu ostatnich lat według danych z portalu stackoverflow.com')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
plt.savefig('lowest_popular_questions_with_answers_through_the_years.png', facecolor='white')
plt.show()

## What is the correlation between the number of Accepted answers through the years in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Per Tag and creation year, count the questions that have an accepted answer,
# first for the most popular technologies and then for the least popular ones.
for questions_with_tag_df in (
    most_popular_questions_with_unique_tag_df,
    lowest_popular_questions_with_unique_tag_df,
):
    (
        questions_with_tag_df
        .where(col('AcceptedAnswerId').isNotNull())
        .withColumn('CreationYear', year(col('CreationDate')))
        .groupBy('Tag', 'CreationYear')
        .count()
        .orderBy(col('CreationYear').desc(), col('count').desc())
        .show()
    )

## What is the correlation between the number of unanswered questions in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Per Tag, count the questions without an accepted answer, first for the most
# popular technologies and then for the least popular ones.
for questions_with_tag_df in (
    most_popular_questions_with_unique_tag_df,
    lowest_popular_questions_with_unique_tag_df,
):
    (
        questions_with_tag_df
        .where(col('AcceptedAnswerId').isNull())
        .groupBy('Tag')
        .count()
        .orderBy(col('count').desc())
        .show()
    )

## What is the correlation between the number of unanswered questions through the years in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Per Tag and creation year, count the questions without an accepted answer.
# Parenthesised chains replace backslash continuations: the second chain was
# missing the closing parenthesis of withColumn(...), which was a SyntaxError.
(
    most_popular_questions_with_unique_tag_df
    .filter(col('AcceptedAnswerId').isNull())
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy(col('Tag'), col('CreationYear'))
    .count()
    .sort(desc('CreationYear'), desc('count'))
    .show()
)

(
    lowest_popular_questions_with_unique_tag_df
    .filter(col('AcceptedAnswerId').isNull())
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy(col('Tag'), col('CreationYear'))
    .count()
    .sort(desc('CreationYear'), desc('count'))
    .show()
)

## What is the correlation between the number of question views in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Per Tag and creation year, total the ViewCount of the questions.
# GroupedData.sum('ViewCount') names its result column 'sum(ViewCount)'; the
# sort previously referenced the non-existent 'sum(_ViewCount)' and failed.
(
    most_popular_questions_with_unique_tag_df
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy(col('Tag'), col('CreationYear')).sum('ViewCount')
    .sort(desc('CreationYear'), desc('sum(ViewCount)'))
    .show()
)

(
    lowest_popular_questions_with_unique_tag_df
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy(col('Tag'), col('CreationYear')).sum('ViewCount')
    .sort(desc('CreationYear'), desc('sum(ViewCount)'))
    .show()
)

## What is the correlation between the number of views and Accepted Answers in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# count() skips nulls, so count(when(cond, True)) counts the rows where cond
# holds. Questions are split by whether AcceptedAnswerId is set.
answered_questions_count = count(
    when(col('AcceptedAnswerId').isNotNull(), True)
).alias('NumberOfAnsweredQuestions')
unanswered_questions_count = count(
    when(col('AcceptedAnswerId').isNull(), True)
).alias('NumberOfUnansweredQuestions')

# Same per-Tag summary for the most and then the least popular technologies.
for questions_with_tag_df in (
    most_popular_questions_with_unique_tag_df,
    lowest_popular_questions_with_unique_tag_df,
):
    (
        questions_with_tag_df
        .groupBy('Tag')
        .agg(answered_questions_count, unanswered_questions_count)
        .orderBy(col('NumberOfAnsweredQuestions').desc(), col('NumberOfUnansweredQuestions').desc())
        .show()
    )

## What is the correlation between getting Accepted Answers and the fastest time needed in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
def _accepted_answer_response_times(questions_df, accepted_answers_df):
    """Pair each question with its accepted answer and measure the answer delay.

    Joins ``questions_df`` to ``accepted_answers_df`` on
    ``AcceptedAnswerId == Id`` and returns one row per matched pair with the
    question tag, both creation dates and the delay expressed in seconds,
    minutes, hours and days (the last three rounded), longest delay first.
    """
    # Casting to long is assumed to yield epoch seconds, i.e. the CreationDate
    # columns are presumably timestamps -- TODO confirm against the loaded schema.
    response_seconds = col('AcceptedAnswerResponseTimeSeconds')
    return (
        questions_df.alias('q')
        .join(accepted_answers_df.alias('a'), col('q.AcceptedAnswerId') == col('a.Id'))
        .select(
            col('q.Tag'),
            col('q.CreationDate').alias('QuestionCreationDate'),
            col('a.CreationDate').alias('AcceptedAnswerCreationDate'),
        )
        .withColumn(
            'AcceptedAnswerResponseTimeSeconds',
            col('AcceptedAnswerCreationDate').cast('long') - col('QuestionCreationDate').cast('long'),
        )
        .withColumn('AcceptedAnswerResponseTimeInMinutes', round(response_seconds / 60))
        .withColumn('AcceptedAnswerResponseTimeInHours', round(response_seconds / 3600))
        .withColumn('AcceptedAnswerResponseTimeInDays', round(response_seconds / (24 * 3600)))
        .orderBy(
            col('AcceptedAnswerResponseTimeSeconds').desc(),
            col('AcceptedAnswerResponseTimeInMinutes').desc(),
            col('AcceptedAnswerResponseTimeInHours').desc(),
            col('AcceptedAnswerResponseTimeInDays').desc(),
        )
    )


# Most popular technologies: full table, then the fastest accepted answer per tag.
top_most_popular_questions_accepted_answer_response_time_df = _accepted_answer_response_times(
    most_popular_questions_with_unique_tag_df,
    most_popular_accepted_answers_with_unique_tag_df,
)

top_most_popular_questions_accepted_answer_response_time_df.show()

(
    top_most_popular_questions_accepted_answer_response_time_df
    .groupBy('Tag')
    .min('AcceptedAnswerResponseTimeSeconds')
    .orderBy(col('min(AcceptedAnswerResponseTimeSeconds)').asc())
    .show()
)

# Least popular technologies: same two views.
top_lowest_popular_questions_accepted_answer_response_time_df = _accepted_answer_response_times(
    lowest_popular_questions_with_unique_tag_df,
    lowest_popular_accepted_answers_with_unique_tag_df,
)

top_lowest_popular_questions_accepted_answer_response_time_df.show()

(
    top_lowest_popular_questions_accepted_answer_response_time_df
    .groupBy('Tag')
    .min('AcceptedAnswerResponseTimeSeconds')
    .orderBy(col('min(AcceptedAnswerResponseTimeSeconds)').asc())
    .show()
)

## What is the correlation between getting Accepted Answers and the slowest time needed in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Longest wait (in rounded days) for an accepted answer, per technology,
# slowest technology first.
(
    top_most_popular_questions_accepted_answer_response_time_df
    .groupBy('Tag')
    .max('AcceptedAnswerResponseTimeInDays')
    .orderBy(col('max(AcceptedAnswerResponseTimeInDays)').desc())
    .show()
)

# Same ranking for the least popular technologies.
(
    top_lowest_popular_questions_accepted_answer_response_time_df
    .groupBy('Tag')
    .max('AcceptedAnswerResponseTimeInDays')
    .orderBy(col('max(AcceptedAnswerResponseTimeInDays)').desc())
    .show()
)

## What are the highest-scored Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Highest answer score per technology, best-scored technology first.
# GroupedData.max('Score') names its result column 'max(Score)'; the sort must
# use that exact name (the previous 'max(_Score)' is not a column of the
# aggregated frame).
(
    most_popular_answers_with_unique_tag_df
    .groupBy('Tag')
    .max('Score')
    .sort(desc('max(Score)'))
    .show()
)

# Same ranking for the least popular technologies.
(
    lowest_popular_answers_with_unique_tag_df
    .groupBy('Tag')
    .max('Score')
    .sort(desc('max(Score)'))
    .show()
)

## What are the lowest-scored Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Lowest answer score per technology, worst-scored technology first.
# GroupedData.min('Score') names its result column 'min(Score)'; the sort must
# use that exact name (the previous 'min(_Score)' is not a column of the
# aggregated frame).
(
    most_popular_answers_with_unique_tag_df
    .groupBy('Tag')
    .min('Score')
    .sort(asc('min(Score)'))
    .show()
)

# Same ranking for the least popular technologies.
(
    lowest_popular_answers_with_unique_tag_df
    .groupBy('Tag')
    .min('Score')
    .sort(asc('min(Score)'))
    .show()
)

## What are the average scored Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Mean answer score per technology, rounded to one decimal place and ordered
# from the lowest to the highest average.
average_score_most_popular_answers_with_unique_tag_df = (
    most_popular_answers_with_unique_tag_df
    .groupBy('Tag')
    .agg(round(avg('Score'), 1).alias('AverageScore'))
    .orderBy(col('AverageScore').asc())
)

# Same aggregation for the least popular technologies.
average_score_lowest_popular_answers_with_unique_tag_df = (
    lowest_popular_answers_with_unique_tag_df
    .groupBy('Tag')
    .agg(round(avg('Score'), 1).alias('AverageScore'))
    .orderBy(col('AverageScore').asc())
)

## What is the correlation between the number of Accepted answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions? - PLOT

In [None]:
# Bring the per-technology averages to the driver so matplotlib can draw them.
average_score_most_popular_answers_with_unique_tag_pandas_df = (
    average_score_most_popular_answers_with_unique_tag_df.toPandas()
)

# Horizontal bar chart: one bar per technology, bar length = average score.
fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    average_score_most_popular_answers_with_unique_tag_pandas_df['Tag'],
    average_score_most_popular_answers_with_unique_tag_pandas_df['AverageScore'],
)
# Plain tick labels on the value axis (no scientific notation / offset).
ax.ticklabel_format(axis='x', style='plain')

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość punktów', labelpad=20.0)
ax.set_title('Średnia ilość punktów za udzielone odpowiedzi w najpopularniejszych \n technologiach według danych z portalu stackoverflow.com')
fig.savefig('average_score_most_popular_answers.png', facecolor='white')
plt.show()

In [None]:
# Bring the per-technology averages to the driver so matplotlib can draw them.
average_score_lowest_popular_answers_with_unique_tag_pandas_df = (
    average_score_lowest_popular_answers_with_unique_tag_df.toPandas()
)

# Horizontal bar chart: one bar per technology, bar length = average score.
fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    average_score_lowest_popular_answers_with_unique_tag_pandas_df['Tag'],
    average_score_lowest_popular_answers_with_unique_tag_pandas_df['AverageScore'],
)
# Plain tick labels on the value axis (no scientific notation / offset).
ax.ticklabel_format(axis='x', style='plain')

# Fixed integer ticks 0..4 on the score axis.
ax.set_xticks(np.arange(0, 5, step=1))
ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość punktów', labelpad=20.0)
ax.set_title('Średnia ilość punktów za udzielone odpowiedzi w najmniej popularnych \n technologiach według danych z portalu stackoverflow.com')
fig.savefig('average_score_lowest_popular_answers.png', facecolor='white')
plt.show()

## What are the highest-scored Accepted Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Highest accepted-answer score per technology, best-scored technology first.
# GroupedData.max('Score') names its result column 'max(Score)'; the sort must
# use that exact name (the previous 'max(_Score)' is not a column of the
# aggregated frame).
(
    most_popular_accepted_answers_with_unique_tag_df
    .groupBy('Tag')
    .max('Score')
    .sort(desc('max(Score)'))
    .show()
)

# Same ranking for the least popular technologies.
(
    lowest_popular_accepted_answers_with_unique_tag_df
    .groupBy('Tag')
    .max('Score')
    .sort(desc('max(Score)'))
    .show()
)

## What are the lowest-scored Accepted Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Lowest accepted-answer score per technology, worst-scored technology first.
# GroupedData.min('Score') names its result column 'min(Score)'; the sort must
# use that exact name (the previous 'min(_Score)' is not a column of the
# aggregated frame).
(
    most_popular_accepted_answers_with_unique_tag_df
    .groupBy('Tag')
    .min('Score')
    .sort(asc('min(Score)'))
    .show()
)

# Same ranking for the least popular technologies.
(
    lowest_popular_accepted_answers_with_unique_tag_df
    .groupBy('Tag')
    .min('Score')
    .sort(asc('min(Score)'))
    .show()
)

## What are the average scored Accepted Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Mean accepted-answer score per technology (one decimal place), highest
# average first.
(
    most_popular_accepted_answers_with_unique_tag_df
    .groupBy('Tag')
    .agg(round(avg('Score'), 1).alias('AverageScore'))
    .orderBy(col('AverageScore').desc())
    .show()
)

# Same ranking for the least popular technologies.
(
    lowest_popular_accepted_answers_with_unique_tag_df
    .groupBy('Tag')
    .agg(round(avg('Score'), 1).alias('AverageScore'))
    .orderBy(col('AverageScore').desc())
    .show()
)

## How do the highest-scored Answers correlate with the number of views in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# For every technology: the highest accepted-answer score together with the
# view count of the question that answer belongs to.
#
# The pair is aggregated as a single struct so both values come from the SAME
# joined row: structs compare field by field, so max() picks the row with the
# highest Score (ties broken by the higher ViewCount). Using first(ViewCount)
# next to max(Score) would return the view count of an arbitrary row of the
# group, unrelated to the highest-scored answer.
(
    most_popular_accepted_answers_with_unique_tag_df.alias('a')
    .join(most_popular_questions_with_unique_tag_df.alias('q'), col('a.ParentId') == col('q.Id'))
    .groupBy(col('a.Tag').alias('AcceptedAnswersTags'))
    .agg(
        max(
            struct(col('a.Score').alias('Score'), col('q.ViewCount').alias('ViewCount'))
        ).alias('HighestScoredAnswer')
    )
    .select(
        col('AcceptedAnswersTags'),
        col('HighestScoredAnswer.Score').alias('HighestScore'),
        col('HighestScoredAnswer.ViewCount').alias('CorrelatedQuestionViewCount'),
    )
    .sort(desc('HighestScore'))
    .show()
)

# Same report for the least popular technologies.
(
    lowest_popular_accepted_answers_with_unique_tag_df.alias('a')
    .join(lowest_popular_questions_with_unique_tag_df.alias('q'), col('a.ParentId') == col('q.Id'))
    .groupBy(col('a.Tag').alias('AcceptedAnswersTags'))
    .agg(
        max(
            struct(col('a.Score').alias('Score'), col('q.ViewCount').alias('ViewCount'))
        ).alias('HighestScoredAnswer')
    )
    .select(
        col('AcceptedAnswersTags'),
        col('HighestScoredAnswer.Score').alias('HighestScore'),
        col('HighestScoredAnswer.ViewCount').alias('CorrelatedQuestionViewCount'),
    )
    .sort(desc('HighestScore'))
    .show()
)

## How do the lowest-scored Answers correlate with the number of views in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# For every technology: the lowest accepted-answer score together with the
# view count of the question that answer belongs to.
#
# The pair is aggregated as a single struct so both values come from the SAME
# joined row: structs compare field by field, so min() picks the row with the
# lowest Score (ties broken by the lower ViewCount). Using first(ViewCount)
# next to min(Score) would return the view count of an arbitrary row of the
# group, unrelated to the lowest-scored answer.
# The when() guard turns rows with a NULL Score into NULL structs, which min()
# skips -- the same NULL handling a plain min('a.Score') has.
(
    most_popular_accepted_answers_with_unique_tag_df.alias('a')
    .join(most_popular_questions_with_unique_tag_df.alias('q'), col('a.ParentId') == col('q.Id'))
    .groupBy(col('a.Tag').alias('AcceptedAnswersTags'))
    .agg(
        min(
            when(
                col('a.Score').isNotNull(),
                struct(col('a.Score').alias('Score'), col('q.ViewCount').alias('ViewCount')),
            )
        ).alias('LowestScoredAnswer')
    )
    .select(
        col('AcceptedAnswersTags'),
        col('LowestScoredAnswer.Score').alias('LowestScore'),
        col('LowestScoredAnswer.ViewCount').alias('CorrelatedQuestionViewCount'),
    )
    .sort(asc('LowestScore'))
    .show()
)


# Same report for the least popular technologies.
(
    lowest_popular_accepted_answers_with_unique_tag_df.alias('a')
    .join(lowest_popular_questions_with_unique_tag_df.alias('q'), col('a.ParentId') == col('q.Id'))
    .groupBy(col('a.Tag').alias('AcceptedAnswersTags'))
    .agg(
        min(
            when(
                col('a.Score').isNotNull(),
                struct(col('a.Score').alias('Score'), col('q.ViewCount').alias('ViewCount')),
            )
        ).alias('LowestScoredAnswer')
    )
    .select(
        col('AcceptedAnswersTags'),
        col('LowestScoredAnswer.Score').alias('LowestScore'),
        col('LowestScoredAnswer.ViewCount').alias('CorrelatedQuestionViewCount'),
    )
    .sort(asc('LowestScore'))
    .show()
)

## What is the average score of Accepted Answers in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Mean accepted-answer score per technology (one decimal place) next to a
# question view count, highest average first.
# NOTE(review): first() takes the ViewCount of whichever joined row comes first
# in each group, so CorrelatedQuestionViewCount is not tied to the average and
# may differ between runs -- confirm which view-count statistic is intended.
(
    most_popular_accepted_answers_with_unique_tag_df.alias('a')
    .join(
        most_popular_questions_with_unique_tag_df.alias('q'),
        col('a.ParentId') == col('q.Id'),
    )
    .groupBy(col('a.Tag').alias('AcceptedAnswersTags'))
    .agg(
        round(avg('a.Score'), 1).alias('AverageScore'),
        first(col('q.ViewCount')).alias('CorrelatedQuestionViewCount'),
    )
    .orderBy(col('AverageScore').desc())
    .show()
)


# Same report for the least popular technologies.
(
    lowest_popular_accepted_answers_with_unique_tag_df.alias('a')
    .join(
        lowest_popular_questions_with_unique_tag_df.alias('q'),
        col('a.ParentId') == col('q.Id'),
    )
    .groupBy(col('a.Tag').alias('AcceptedAnswersTags'))
    .agg(
        round(avg('a.Score'), 1).alias('AverageScore'),
        first(col('q.ViewCount')).alias('CorrelatedQuestionViewCount'),
    )
    .orderBy(col('AverageScore').desc())
    .show()
)

## What is the correlation between Users in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Number of rows in the users/questions frame per technology, largest first.
(
    users_questions_with_most_popular_tags_df
    .groupBy('Tag')
    .count()
    .orderBy(col('count').desc())
    .show()
)

# Same count for the least popular technologies.
(
    users_questions_with_lowest_popular_tags_df
    .groupBy('Tag')
    .count()
    .orderBy(col('count').desc())
    .show()
)

## What is the correlation between Users through the years in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Columns whose combination identifies one row to keep when de-duplicating.
filter_based_duplicated_column = ['Id', 'Tag']


def _count_unique_users_per_tag_and_year(users_posts_df):
    """Count de-duplicated (Id, Tag) rows per technology and creation year.

    Rows are de-duplicated on the sorted array built from the columns listed
    in ``filter_based_duplicated_column``; the helper column used for that is
    dropped again before aggregating. The result has the columns ``Tag``,
    ``CreationYear`` and ``count`` and is ordered by newest year, then by
    highest count.
    """
    return (
        users_posts_df
        .withColumn('Filter_Id_Tag', array_sort(array(*filter_based_duplicated_column)))
        .drop_duplicates(['Filter_Id_Tag'])
        .drop('Filter_Id_Tag')
        .withColumn('CreationYear', year(col('CreationDate')))
        .groupBy(col('Tag'), col('CreationYear'))
        .count()
        .sort(desc('CreationYear'), desc('count'))
    )


number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags_df = \
    _count_unique_users_per_tag_and_year(users_questions_and_answers_with_most_popular_tags_df)

number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags_df = \
    _count_unique_users_per_tag_and_year(users_questions_and_answers_with_lowest_popular_tags_df)

# Each table is announced by exactly one header printed right before it.
# (Previously the lowest-popular table got two different headers, one of them
# printed before the frame was even built.)
print('ALL QUESTIONS AND ANSWERS UNIQUE USERS THROUGH THE YEARS IN MOST POPULAR TECHNOLOGIES')
number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags_df.show()


print('ALL QUESTIONS AND ANSWERS UNIQUE USERS THROUGH THE YEARS IN LOWEST POPULAR TECHNOLOGIES')
number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags_df.show()



## What is the correlation between Users through the years in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions? - PLOT

In [None]:
top_10_the_most_popular_technologies_based_on_the_survey_posts = ['<javascript>', '<html>', '<sql>', '<python>', '<typescript>', '<java>', '<c#>', '<bash>', '<php>', '<c++>']

# One line colour per technology, in the same order as the tag list above.
most_popular_technologies_line_colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'gray', 'brown']

number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags_pandas_df = number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
# Plain tick labels on the count axis (no scientific notation / offset).
ax.ticklabel_format(axis='y', style='plain')

# One line per technology: x = creation year, y = number of users.
# The legend label is the tag without its surrounding angle brackets.
for tag, line_color in zip(top_10_the_most_popular_technologies_based_on_the_survey_posts, most_popular_technologies_line_colors):
    tag_rows = number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags_pandas_df[
        number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags_pandas_df['Tag'] == tag
    ]
    ax.plot(tag_rows['CreationYear'], tag_rows['count'], label=tag.strip('<>'), color=line_color, marker='v')

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość użytkowników')
ax.set_title('Sumaryczna ilość użytkowników w najpopularniejszych technologiach \n w ciągu ostatnich lat według danych z portalu stackoverflow.com')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# The legend sits outside the axes; bbox_inches='tight' grows the saved canvas
# to include it instead of cropping it at the figure edge.
fig.savefig('number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_most_popular_tags.png', facecolor='white', bbox_inches='tight')
plt.show()

In [None]:
top_10_the_lowest_popular_technologies_based_on_the_survey_posts = ['<crystal-lang>', '<apl>', '<sas>', '<ocaml>', '<cobol>', '<fortran>', '<erlang>', '<julia>', '<f#>', '<lisp>']

# One line colour per technology, in the same order as the tag list above.
lowest_popular_technologies_line_colors = ['mediumblue', 'darkgreen', 'tomato', 'darkcyan', 'pink', 'gold', 'dimgray', 'darkorange', 'lightgray', 'firebrick']

number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags_pandas_df = number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
# Plain tick labels on the count axis (no scientific notation / offset).
ax.ticklabel_format(axis='y', style='plain')

# One line per technology: x = creation year, y = number of users.
# The legend label is the tag without its surrounding angle brackets.
for tag, line_color in zip(top_10_the_lowest_popular_technologies_based_on_the_survey_posts, lowest_popular_technologies_line_colors):
    tag_rows = number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags_pandas_df[
        number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags_pandas_df['Tag'] == tag
    ]
    ax.plot(tag_rows['CreationYear'], tag_rows['count'], label=tag.strip('<>'), color=line_color, marker='v')

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość użytkowników')
ax.set_title('Sumaryczna ilość użytkowników w najmniej popularnych technologiach \n w ciągu ostatnich lat według danych z portalu stackoverflow.com')
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# The legend sits outside the axes; bbox_inches='tight' grows the saved canvas
# to include it instead of cropping it at the figure edge.
fig.savefig('number_of_unique_users_involved_in_questions_and_answers_through_the_years_in_lowest_popular_tags.png', facecolor='white', bbox_inches='tight')
plt.show()

## What is the correlation between Users in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Accepted Answers?

In [None]:
# Number of rows in the users/accepted-answers frame per technology, largest first.
(
    users_accepted_answers_with_most_popular_tags_df
    .groupBy('Tag')
    .count()
    .orderBy(col('count').desc())
    .show()
)

# Same count for the least popular technologies.
(
    users_accepted_answers_with_lowest_popular_tags_df
    .groupBy('Tag')
    .count()
    .orderBy(col('count').desc())
    .show()
)

## What is the correlation between Users through the years in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Accepted Answers?

In [None]:
# Number of rows in the users/accepted-answers frame per technology and
# creation year, newest year first and largest count first within a year.
(
    users_accepted_answers_with_most_popular_tags_df
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy('Tag', 'CreationYear')
    .count()
    .orderBy(col('CreationYear').desc(), col('count').desc())
    .show()
)

# Same breakdown for the least popular technologies.
(
    users_accepted_answers_with_lowest_popular_tags_df
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy('Tag', 'CreationYear')
    .count()
    .orderBy(col('CreationYear').desc(), col('count').desc())
    .show()
)

In [None]:
# Columns whose combination identifies one row to keep when de-duplicating.
filter_based_duplicated_column = ['Id', 'Tag']


def _count_unique_users_per_tag(users_posts_df):
    """Count de-duplicated (Id, Tag) rows per technology, smallest count first.

    Rows are de-duplicated on the sorted array built from the columns listed
    in ``filter_based_duplicated_column``; the helper column used for that is
    dropped again before the per-tag count.
    """
    return (
        users_posts_df
        .withColumn('Filter_Id_Tag', array_sort(array(*filter_based_duplicated_column)))
        .dropDuplicates(['Filter_Id_Tag'])
        .drop('Filter_Id_Tag')
        .groupBy('Tag')
        .count()
        .orderBy(col('count').asc())
    )


number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_df = \
    _count_unique_users_per_tag(users_questions_and_answers_with_most_popular_tags_df)

number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df = \
    _count_unique_users_per_tag(users_questions_and_answers_with_lowest_popular_tags_df)

## What is the correlation between Users through the years in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Accepted Answers? - PLOT

In [None]:
# Bring the per-technology user counts to the driver so matplotlib can draw them.
number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_pandas_df = (
    number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_df.toPandas()
)

# Horizontal bar chart: one bar per technology, bar length = user count.
fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_pandas_df['Tag'],
    number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags_pandas_df['count'],
)
# Plain tick labels on the count axis (no scientific notation / offset).
ax.ticklabel_format(axis='x', style='plain')

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość unikalnych użytkowników', labelpad=20.0)
ax.set_title('Ilość zaangażowanych użytkowników w najpopularniejszych \n technologiach według danych z portalu stackoverflow.com ')
fig.savefig('number_of_unique_users_involved_in_questions_and_answers_in_most_popular_tags.png', facecolor='white')
plt.show()

In [None]:
# Bring the per-technology user counts to the driver so matplotlib can draw them.
number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df_pandas_df = (
    number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df.toPandas()
)

# Horizontal bar chart: one bar per technology, bar length = user count.
fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
ax.barh(
    number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df_pandas_df['Tag'],
    number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags_df_pandas_df['count'],
)
# Plain tick labels on the count axis (no scientific notation / offset).
ax.ticklabel_format(axis='x', style='plain')

ax.set_ylabel('Technologie')
ax.set_xlabel('Ilość unikalnych użytkowników', labelpad=20.0)
ax.set_title('Ilość zaangażowanych użytkowników w najmniej popularnych \n technologiach według danych z portalu stackoverflow.com ')
fig.savefig('number_of_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags.png', facecolor='white')
plt.show()

## What is the correlation between Users Reputation sum in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Number of rows in the users/questions frame per technology, largest first.
# NOTE(review): the heading above this cell mentions a reputation sum, but the
# cell only counts rows -- confirm which one is intended.
(
    users_questions_with_most_popular_tags_df
    .groupBy('Tag')
    .count()
    .orderBy(col('count').desc())
    .show()
)

# Same count for the least popular technologies.
(
    users_questions_with_lowest_popular_tags_df
    .groupBy('Tag')
    .count()
    .orderBy(col('count').desc())
    .show()
)

## What is the correlation between Users Reputation sum in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Sum of the Reputation column per technology (cast to a 64-bit integer),
# highest total first.
(
    users_questions_with_most_popular_tags_df
    .groupBy('Tag')
    .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
    .orderBy(col('SumOfUsersReputations').desc())
    .show()
)

# Same total for the least popular technologies.
(
    users_questions_with_lowest_popular_tags_df
    .groupBy('Tag')
    .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
    .orderBy(col('SumOfUsersReputations').desc())
    .show()
)

## What is the correlation between Users Reputation sum through the years in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Questions?

In [None]:
# Sum of the Reputation column per technology and creation year (cast to a
# 64-bit integer), newest year first and highest total first within a year.
(
    users_questions_with_most_popular_tags_df
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy('Tag', 'CreationYear')
    .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
    .orderBy(col('CreationYear').desc(), col('SumOfUsersReputations').desc())
    .show()
)

# Same breakdown for the least popular technologies.
(
    users_questions_with_lowest_popular_tags_df
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy('Tag', 'CreationYear')
    .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
    .orderBy(col('CreationYear').desc(), col('SumOfUsersReputations').desc())
    .show()
)

## What is the correlation between Users Reputation sum in the top 10th most and lowest popular Technologies due to StackOverflow Professionals Survey based on StackOverflow Accepted Answers?

In [None]:
# Sum of the Reputation column per technology for the users/accepted-answers
# frame (cast to a 64-bit integer), highest total first.
(
    users_accepted_answers_with_most_popular_tags_df
    .groupBy('Tag')
    .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
    .orderBy(col('SumOfUsersReputations').desc())
    .show()
)

# Same total for the least popular technologies.
(
    users_accepted_answers_with_lowest_popular_tags_df
    .groupBy('Tag')
    .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
    .orderBy(col('SumOfUsersReputations').desc())
    .show()
)

## What is the correlation between Users Reputation sum through the years in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Accepted Answers?

In [None]:
# Print the summed `Reputation` column per (`Tag`, year of `CreationDate`),
# newest year first and, within a year, largest total first - for the
# accepted-answers frames of the most popular and then the lowest popular
# technologies.
# `sum` is pyspark.sql.functions.sum (star import at the top of the notebook).
for accepted_answers_users_df in (
    users_accepted_answers_with_most_popular_tags_df,
    users_accepted_answers_with_lowest_popular_tags_df,
):
    (
        accepted_answers_users_df
        .withColumn('CreationYear', year('CreationDate'))
        .groupBy('Tag', 'CreationYear')
        .agg(sum('Reputation').cast('long').alias('SumOfUsersReputations'))
        .orderBy(col('CreationYear').desc(), col('SumOfUsersReputations').desc())
        .show()
    )

In [None]:
# Columns whose combination must be unique before reputations are summed.
# NOTE(review): `Id` is presumably the user id (the result names say
# "unique users") - confirm against the cell that builds the source frames.
filter_based_duplicated_column = ['Id', 'Tag']

# Yearly reputation totals per tag for the most popular technologies.
# Duplicates are removed directly on the (Id, Tag) subset; no helper array
# column is needed, and `Id` is not coerced to a string to share an array
# with `Tag`.
# Which row of a duplicated pair survives is not specified by Spark, so the
# `CreationDate` (and therefore `CreationYear`) of a pair comes from an
# arbitrary one of its rows.
# `sum` is pyspark.sql.functions.sum (star import at the top of the notebook).
sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_most_popular_tags_df = (
    users_answers_with_most_popular_tags_df
    .dropDuplicates(filter_based_duplicated_column)
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy(col('Tag'), col('CreationYear'))
    .agg(sum(col('Reputation')).cast(LongType()).alias('SumOfUsersReputations'))
    .sort(desc('CreationYear'), asc('SumOfUsersReputations'))
)

# Same aggregation for the lowest popular technologies.
sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_lowest_popular_tags_df = (
    users_answers_with_lowest_popular_tags_df
    .dropDuplicates(filter_based_duplicated_column)
    .withColumn('CreationYear', year(col('CreationDate')))
    .groupBy(col('Tag'), col('CreationYear'))
    .agg(sum(col('Reputation')).cast(LongType()).alias('SumOfUsersReputations'))
    .sort(desc('CreationYear'), asc('SumOfUsersReputations'))
)

## What is the correlation between Users Reputation sum through the years in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Answers? - PLOT

In [None]:
# Tags to plot, in legend order.
top_10_the_most_popular_technologies_based_on_the_survey_posts = ['<javascript>', '<html>', '<sql>', '<python>', '<typescript>', '<java>', '<c#>', '<bash>', '<php>', '<c++>']
# Line colour for each tag above (same order).
most_popular_technologies_line_colors = ['blue', 'green', 'red', 'cyan', 'magenta', 'yellow', 'black', 'orange', 'gray', 'brown']

# Collect the aggregated Spark result on the driver for plotting.
sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_most_popular_tags_pandas_df = sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_most_popular_tags_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
# Show full reputation numbers on the y axis instead of scientific notation.
plt.ticklabel_format(axis='y', style='plain')

# One line per technology: x = year, y = summed reputation for that tag.
for tag, line_color in zip(top_10_the_most_popular_technologies_based_on_the_survey_posts, most_popular_technologies_line_colors):
    tag_rows = sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_most_popular_tags_pandas_df[
        sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_most_popular_tags_pandas_df['Tag'] == tag
    ]
    # The legend label is the tag without its surrounding angle brackets.
    ax.plot(tag_rows['CreationYear'], tag_rows['SumOfUsersReputations'], label=tag.strip('<>'), color=line_color, marker='v')

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość punktów reputacji')
plt.title('Sumaryczna reputacja użytkowników udzielający odpowiedzi \n w najpopularniejszych technologiach w ciągu ostatnich lat \n według danych z portalu stackoverflow.com')
# The legend sits outside the axes, to the right of the plot area.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# bbox_inches='tight' grows the saved canvas to include the outside legend,
# which would otherwise be cut off at the figure edge.
plt.savefig('sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_most_popular_tags.png', facecolor='white', bbox_inches='tight')
plt.show()


In [None]:
# Tags to plot, in legend order.
top_10_the_lowest_popular_technologies_based_on_the_survey_posts = ['<crystal-lang>', '<apl>', '<sas>', '<ocaml>', '<cobol>', '<fortran>', '<erlang>', '<julia>', '<f#>', '<lisp>']
# Line colour for each tag above (same order).
lowest_popular_technologies_line_colors = ['mediumblue', 'darkgreen', 'tomato', 'darkcyan', 'pink', 'gold', 'dimgray', 'darkorange', 'lightgray', 'firebrick']

# Collect the aggregated Spark result on the driver for plotting.
sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_lowest_popular_tags_pandas_df = sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_lowest_popular_tags_df.toPandas()

fig, ax = plt.subplots(figsize=(10, 10), facecolor='white', dpi=100)
# Show full reputation numbers on the y axis instead of scientific notation.
plt.ticklabel_format(axis='y', style='plain')

# One line per technology: x = year, y = summed reputation for that tag.
for tag, line_color in zip(top_10_the_lowest_popular_technologies_based_on_the_survey_posts, lowest_popular_technologies_line_colors):
    tag_rows = sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_lowest_popular_tags_pandas_df[
        sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_lowest_popular_tags_pandas_df['Tag'] == tag
    ]
    # The legend label is the tag without its surrounding angle brackets.
    ax.plot(tag_rows['CreationYear'], tag_rows['SumOfUsersReputations'], label=tag.strip('<>'), color=line_color, marker='v')

ax.set_xlabel('Poszczególne lata')
ax.set_ylabel('Ilość punktów reputacji')
plt.title('Sumaryczna reputacja użytkowników udzielający odpowiedzi \n w najmniej popularnych technologiach w ciągu ostatnich lat \n według danych z portalu stackoverflow.com')
# The legend sits outside the axes, to the right of the plot area.
ax.legend(loc='center left', bbox_to_anchor=(1, 0.5))
# bbox_inches='tight' grows the saved canvas to include the outside legend,
# which would otherwise be cut off at the figure edge.
plt.savefig('sum_of_reputation_all_unique_users_involved_in_answers_through_the_years_in_lowest_popular_tags.png', facecolor='white', bbox_inches='tight')
plt.show()


## What is the correlation between Users Reputation in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Answers?

In [None]:
# Columns whose combination must be unique before reputations are summed.
# NOTE(review): `Id` is presumably the user id (the result names say
# "unique users") - confirm against the cell that builds the source frames.
filter_based_duplicated_column = ['Id', 'Tag']

# Reputation total per tag for the most popular technologies, smallest total
# first. Duplicates are removed directly on the (Id, Tag) subset; no helper
# array column is needed, and `Id` is not coerced to a string to share an
# array with `Tag`.
# `sum` is pyspark.sql.functions.sum (star import at the top of the notebook).
sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_df = (
    users_answers_with_most_popular_tags_df
    .dropDuplicates(filter_based_duplicated_column)
    .groupBy(col('Tag'))
    .agg(sum(col('Reputation')).cast(LongType()).alias('SumOfUsersReputations'))
    .sort(asc('SumOfUsersReputations'))
)

# Same aggregation for the lowest popular technologies.
sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_df = (
    users_answers_with_lowest_popular_tags_df
    .dropDuplicates(filter_based_duplicated_column)
    .groupBy(col('Tag'))
    .agg(sum(col('Reputation')).cast(LongType()).alias('SumOfUsersReputations'))
    .sort(asc('SumOfUsersReputations'))
)

## What is the correlation between Users Reputation in the top 10 most and least popular Technologies according to the StackOverflow Professionals Survey based on StackOverflow Answers? - PLOT


In [None]:
# Collect the per-tag reputation totals on the driver for plotting.
sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_pandas_df = (
    sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_df.toPandas()
)

# Horizontal bar chart: one bar per tag, bar length = summed reputation.
fig, ax = plt.subplots(figsize=(10,10), facecolor='white', dpi=100)
ax.barh(
    sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_pandas_df['Tag'],
    sum_of_reputation_all_unique_users_involved_in_answers_in_most_popular_tags_pandas_df['SumOfUsersReputations'],
)
# Show full numbers on the x axis instead of scientific notation.
ax.ticklabel_format(axis='x', style='plain')

plt.xticks(rotation = 45)
ax.set_ylabel('Technologie')
ax.set_xlabel('Suma punktów reputacji unikalnych użytkowników', labelpad= 20.0)
ax.set_title('Sumaryczna ilość punktów reputacji zaangażowanych użytkowników \n w udzielaniu odpowiedzi w najpopularniejszych \n technologiach według danych z portalu stackoverflow.com')
# NOTE(review): the file name says "questions_and_answers" although the
# plotted frame is the answers aggregation - confirm before renaming.
fig.savefig('sum_of_reputation_all_unique_users_involved_in_questions_and_answers_in_most_popular_tags.png', facecolor='white')
plt.show()

In [None]:
# Collect the per-tag reputation totals on the driver for plotting.
sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_pandas_df = sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_df.toPandas()

# Fixed x-axis tick positions: 0, 20M, ... up to (but excluding) the limit.
# NOTE(review): these bounds are hard-coded; totals at or above the limit
# would get no tick - confirm they still fit the data.
reputation_tick_limit = 100000000
reputation_tick_step = 20000000

# Horizontal bar chart: one bar per tag, bar length = summed reputation.
fig, ax = plt.subplots(figsize=(10,10), facecolor='white', dpi=100)
plt.barh(sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_pandas_df['Tag'], sum_of_reputation_all_unique_users_involved_in_answers_in_lowest_popular_tags_pandas_df['SumOfUsersReputations'])
# Show full numbers on the x axis instead of scientific notation.
plt.ticklabel_format(axis='x', style='plain')

# A single call sets both the tick positions and the 45-degree label rotation;
# a second rotation-only call would be redundant.
plt.xticks(np.arange(0, reputation_tick_limit, step=reputation_tick_step), rotation = 45)
plt.ylabel('Technologie')
plt.xlabel('Suma punktów reputacji unikalnych użytkowników', labelpad= 20.0)
plt.title('Sumaryczna ilość punktów reputacji zaangażowanych użytkowników \n w udzielaniu odpowiedzi w najmniej popularnych \n technologiach według danych z portalu stackoverflow.com')
# NOTE(review): the file name says "questions_and_answers" although the
# plotted frame is the answers aggregation - confirm before renaming.
plt.savefig('sum_of_reputation_all_unique_users_involved_in_questions_and_answers_in_lowest_popular_tags.png', facecolor='white')
plt.show()