# Exploratory data analysis - job offers from justjoinit

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

## Configure pandas globally to display all rows and columns (no truncation)

In [None]:
# Lift pandas' display limits so every row and every column is rendered.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

## Loading and displaying data

### Find all files with data

In [None]:
import glob

In [None]:
# First pass at discovering the data files: every *.csv in the AI & ML folder.
csv_search_pattern = "./Dane/AI & ML/*.csv"
csv_files = glob.glob(csv_search_pattern)

In [None]:
import os
import re

folder = "./Dane/AI & ML"
# Files whose name marks them as relational tables are not offer exports and are skipped.
wykluczenia = re.compile(r"tabela_relacyjna|relational_table", re.IGNORECASE)

# Keep only *.csv entries: os.listdir also returns sub-folders (e.g. notebook
# checkpoints) and any other file type, all of which would break pd.read_csv later.
csv_files = [
    os.path.join(folder, f)
    for f in os.listdir(folder)
    if f.lower().endswith(".csv") and not wykluczenia.search(f)
]
len(csv_files)

In [None]:
# for file in csv_files:
#     print(file)

### Concatenate all files to one dataframe

In [None]:
# Parse every discovered file into its own DataFrame (merged in a later cell).
df_list = list(map(pd.read_csv, csv_files))

In [None]:
import pandas as pd

# Stack all per-file frames into a single table; columns missing from a file become NaN.
# ignore_index=True gives the result a fresh 0..n-1 index - otherwise every file
# contributes its own 0-based labels and the combined frame has duplicate row labels.
# Shape noted by the author for the full data set: (1199, 308); first five files: (595, 269).
df_all_offers = pd.concat(df_list, ignore_index=True)

In [None]:
# df_all_offers.to_csv(r"dane_zlaczone.csv", index=False)  # raw-string prefix (r) before the path

In [None]:
df_all_offers.shape

In [None]:
job_offers = df_all_offers.copy(deep=True)
job_offers.shape

### Feature engineering - replace NaN with 0

In [None]:
job_offers.isna().sum()

In [None]:
job_offers[job_offers.isna().any(axis=1)].head(3)

In [None]:
# Missing-value counts restricted to the descriptive (non-technology) columns.
descriptive_columns = [
    "offer_id",
    "title",
    "company",
    "location",
    "salary",
    "link",
    "type_of_work",
    "experience",
    "employment_type",
    "operating_mode",
]
job_offers[descriptive_columns].isna().sum()

In [None]:
# job_offers[job_offers.isna().any(axis=1)].fillna(0) # will not update original dataframe!

In [None]:
# Replace every NaN in the table with 0 (applies to all columns, not only the skill flags).
job_offers = job_offers.fillna(0)

In [None]:
job_offers.isna().sum().sort_values(ascending=True)

In [None]:
job_offers[job_offers.isna().any(axis=1)]

In [None]:
# Renumber the rows 0..n-1 and discard the old index labels.
job_offers = job_offers.reset_index(drop=True)
job_offers

In [None]:
# job_offers = pd.read_csv("./Dane/AI & ML/16_03_2025.csv")
# job_offers.head(50)

In [None]:
job_offers.loc[:, "title"]

## Initial Data Exploration

### Dataframe Shape (rows, columns)

In [None]:
job_offers.shape

### Dataframe columns names

In [None]:
job_offers.columns

In [None]:
job_offers.columns[0:11]

### Dataframe data types

In [None]:
job_offers.dtypes[0:11]

### Find missing data

In [None]:
sum((job_offers.isna().mean() * 100).round(1))

In [None]:
job_offers.isna().sum().sum()

### Descriptive statistics

In [None]:
job_offers.describe()

In [None]:
job_offers.info()

### Count how often each skill appears in job offers

In [None]:
job_offers.iloc[:, 11:].sum().sort_values(ascending=False)

### Find "missing" values (with missing keyword)

In [None]:
def _mentions_missing(column):
    """Return a boolean Series marking cells whose text contains the word 'missing'."""
    return column.astype(str).str.contains(r'\bmissing\b', case=False, na=False)


# Per-column count of cells containing the "missing" keyword, largest first.
job_offers.apply(_mentions_missing).sum().sort_values(ascending=False)

### Delete unnecessary columns

In [None]:
job_offers.columns[0:11]

In [None]:
# Earlier variants also removed "scraped_at":
# job_offers = job_offers.drop(columns=["link", "scraped_at"])
# job_offers = job_offers.drop(columns=["link", "scraped_at"], errors="ignore")
# Only the offer URL is removed now; the guard makes re-running the cell a no-op.
if "link" in job_offers.columns:
    job_offers = job_offers.drop(columns="link")

In [None]:
job_offers

### Group data

#### Group by company name

In [None]:
job_offers.groupby("company")["company"].count().sort_values(ascending=False)

In [None]:
job_offers.groupby(['company', 'type_of_work']).size().reset_index(name='count')

In [None]:
job_offers.groupby(['company', 'type_of_work']).size().reset_index(name='count')["company"].value_counts()

In [None]:
job_offers.groupby("company")["type_of_work"].nunique() > 1

In [None]:
job_offers.groupby("company")["type_of_work"].nunique().reset_index(name="unique_type_of_work")

##### Delete currency and change dtype to int

In [None]:
# Keep only offers whose salary text does not contain "missing".
# .copy() makes the subset an independent frame, so the column assignments in the
# following cells modify it unambiguously instead of writing to a slice of
# job_offers (which triggers SettingWithCopyWarning).
has_salary = ~job_offers["salary"].str.contains("missing", case=False, na=False)
jobs_with_salary = job_offers[has_salary].copy()
jobs_with_salary["salary"].value_counts().head()

In [None]:
# Remove the unit suffixes from the salary text, in the same order as before:
# monthly first, then hourly, then yearly.
salary_text = jobs_with_salary["salary"]
for unit_suffix in ("PLN/month", "PLN/h", "PLN/year"):
    salary_text = salary_text.str.replace(unit_suffix, "")
jobs_with_salary["salary"] = salary_text
jobs_with_salary

##### Split the salary column into min_salary and max_salary columns (work in progress)

In [None]:
jobs_with_salary["salary"].value_counts()

In [None]:
# Add "min_salary" (the text before the first "-") as the fifth column, unless a
# previous run of this cell already created it.
if "min_salary" in jobs_with_salary.columns:
    print("Kolumna 'min_salary' już istnieje.")
else:
    lower_bounds = jobs_with_salary["salary"].str.split("-").apply(lambda parts: parts[0])
    jobs_with_salary.insert(4, "min_salary", lower_bounds)


In [None]:
jobs_with_salary

In [None]:
# jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values

In [None]:
# pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values) < 1000

In [None]:
(pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values) < 1000).sum()

In [None]:
jobs_with_salary.loc[pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values) < 1000, "min_salary"] 

In [None]:
jobs_with_salary.loc[pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values) < 1000, "min_salary"].str.strip().values

In [None]:
jobs_with_salary.head()

In [None]:
pd.to_numeric(jobs_with_salary.loc[pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values) < 1000, "min_salary"].str.strip().values) * 168

In [None]:
jobs_with_salary.head()

In [None]:
jobs_with_salary.loc[pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x:x[0].strip().replace(" ", "")).values) < 1000, "min_salary"].index

In [None]:
# Rebuild the salary subset from job_offers: drop "missing" salaries, strip the
# unit suffixes, then add min_salary (text before the first "-") as the fifth column.
# .copy() detaches the subset from job_offers so the assignments below act on an
# independent frame rather than on a slice (avoids SettingWithCopyWarning).
has_salary = ~job_offers["salary"].str.contains("missing", case=False, na=False)
jobs_with_salary = job_offers[has_salary].copy()
jobs_with_salary["salary"] = jobs_with_salary["salary"].str.replace("PLN/month", "").str.replace("PLN/h", "").str.replace("PLN/year", "")
jobs_with_salary.insert(4, "min_salary", jobs_with_salary["salary"].str.split("-").apply(lambda x: x[0]))

In [None]:
jobs_with_salary

In [None]:
# Row labels of offers whose lower salary bound is below 1000.
# NOTE(review): such values are presumably hourly rates - confirm against the raw units.
lower_bound_digits = jobs_with_salary["salary"].str.split("-").apply(lambda parts: parts[0].strip().replace(" ", ""))
below_threshold = pd.to_numeric(lower_bound_digits.values) < 1000
indexes = jobs_with_salary.loc[below_threshold, "min_salary"].index

In [None]:
indexes

In [None]:
jobs_with_salary.loc[indexes, "min_salary"]

In [None]:
# Multiply the selected lower bounds by 168 after removing the thousands spacing.
# NOTE(review): 168 looks like an hours-per-month factor - confirm.
selected_min = jobs_with_salary.loc[indexes, "min_salary"]
jobs_with_salary.loc[indexes, "min_salary"] = selected_min.str.replace(" ", "").astype(int) * 168

In [None]:
jobs_with_salary

In [None]:
jobs_with_salary["salary"].str.split("-").apply(lambda x: len(x))

In [None]:
(jobs_with_salary["salary"].str.split("-").apply(lambda x: len(x)) < 2).sum()

##### Find missing max values of salary (when single value instead of range)

In [None]:
# Snapshot kept for the experiments in the "Testing section".
test = jobs_with_salary.copy()
# Offers whose salary has fewer than two "-"-separated parts, i.e. a single value
# rather than a range.
salary_part_counts = jobs_with_salary["salary"].str.split("-").apply(lambda parts: len(parts))
missing_max_salary_indexes = jobs_with_salary[salary_part_counts < 2].index
missing_max_salary_indexes

In [None]:
jobs_with_salary.loc[missing_max_salary_indexes, "salary"]

In [None]:
# jobs_with_salary.loc[missing_max_salary_indexes, "salary"] = int(jobs_with_salary.loc[14, "salary"]) * 168 # DO weryfikacji czy potrzebne

In [None]:
jobs_with_salary.loc[missing_max_salary_indexes, "salary"]

In [None]:
jobs_with_salary.loc[missing_max_salary_indexes, "salary"].values

In [None]:
# jobs_with_salary.loc[14, ["min_salary", "salary"]] # Do weryfikacji czy potrzebne

In [None]:
# Convert each single-value salary to text, row by row.
# The previous version assigned str(values[0]) - the FIRST matching row's salary -
# to every selected row, overwriting the others, and raised IndexError when no
# single-value salaries existed.
jobs_with_salary.loc[missing_max_salary_indexes, "salary"] = (
    jobs_with_salary.loc[missing_max_salary_indexes, "salary"].astype(str)
)

In [None]:
jobs_with_salary.loc[missing_max_salary_indexes, "salary"]

In [None]:
# Turn a single value "X" into the range "X - X" so it can be split like the others.
single_values = jobs_with_salary.loc[missing_max_salary_indexes, "salary"]
jobs_with_salary.loc[missing_max_salary_indexes, "salary"] = single_values + " - " + single_values
jobs_with_salary.loc[missing_max_salary_indexes, "salary"]

In [None]:
jobs_with_salary.head(3)

###### MAX Salary

In [None]:
# Upper bound of every range: the text after the first "-", with all spaces removed.
max_salaries = (
    jobs_with_salary["salary"]
    .str.split("-")
    .apply(lambda bounds: bounds[1].strip().replace(" ", ""))
)

In [None]:
jobs_with_salary.insert(6, "max_salary", max_salaries)

In [None]:
# jobs_with_salary[pd.to_numeric(jobs_with_salary.loc[:, "max_salary"]) < 1000]
jobs_with_salary.head(3)

In [None]:
jobs_with_salary.loc[19]

In [None]:
jobs_with_salary["max_salary"]

In [None]:
jobs_with_salary["max_salary"].astype(int) < 1000

In [None]:
jobs_with_salary[(jobs_with_salary["max_salary"].astype(int) < 1000) == True]

In [None]:
# Upper bounds below 1000 are multiplied by 168, mirroring the min_salary conversion.
small_max_mask = jobs_with_salary["max_salary"].astype(int) < 1000
jobs_with_salary.loc[small_max_mask, "max_salary"] = pd.to_numeric(jobs_with_salary.loc[small_max_mask, "max_salary"]) * 168

In [None]:
# jobs_with_salary.drop(columns="max_salary", inplace=True)
jobs_with_salary.head(100)

##### Update min and max salaries to be numbers (int)

In [None]:
pd.to_numeric("22 500" , errors="coerce")

In [None]:
# type(jobs_with_salary["min_salary"][0])

In [None]:
# type(jobs_with_salary["max_salary"][0])

In [None]:
jobs_with_salary["min_salary"]
# jobs_with_salary["min_salary"].str.strip()

In [None]:
# Clean the salary strings so they contain no spaces (leading, trailing or inner).
for salary_column in ("min_salary", "max_salary"):
    compact_text = jobs_with_salary.loc[:, salary_column].astype(str).str.strip().str.replace(" ", "")
    jobs_with_salary.loc[:, salary_column] = compact_text
jobs_with_salary.loc[:, ["min_salary", "max_salary"]]

##### Update salary (include when PLN/h instead of PLN/month) (WORK IN PROGRESS)

In [None]:
# jobs_with_salary["salary"].str.split("-").apply(lambda x: x[0]).str.strip().str.replace(" ", "").values

In [None]:
# pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x: x[0]).str.strip().str.replace(" ", "").values) < 1000

In [None]:
# Salary text of the offers whose lower bound is still below 1000.
lower_bound_text = jobs_with_salary["salary"].str.split("-").apply(lambda parts: parts[0]).str.strip().str.replace(" ", "")
still_below_threshold = pd.to_numeric(lower_bound_text.values) < 1000
salary_before_update = jobs_with_salary[still_below_threshold]["salary"]
salary_before_update

In [None]:
jobs_with_salary["min_salary"].head()

In [None]:
# type(jobs_with_salary["max_salary"][0])

##### Change min and max salary columns to int

In [None]:
# min_salaries_num = pd.to_numeric(jobs_with_salary["min_salary"], errors="coerce")
# max_salaries_num = pd.to_numeric(jobs_with_salary["max_salary"], errors="coerce")
# jobs_with_salary.loc[:, "min_salary"] = min_salaries_num.astype(int)
# jobs_with_salary.loc[:, "max_salary"] = max_salaries_num.astype(int)

In [None]:
#test if works
# print(jobs_with_salary["min_salary"][0] + jobs_with_salary["min_salary"][1])
# print(jobs_with_salary["max_salary"][0] + jobs_with_salary["max_salary"][1])

In [None]:
# TESTING for ERRORS
jobs_with_salary.dtypes

In [None]:
# Convert both salary bounds to integers: values that cannot be parsed become NaN
# (errors="coerce") and are then replaced by 0 before the cast.
for salary_column in ("min_salary", "max_salary"):
    jobs_with_salary.loc[:, salary_column] = pd.to_numeric(jobs_with_salary.loc[:, salary_column], errors="coerce")
    jobs_with_salary[salary_column] = jobs_with_salary[salary_column].fillna(0).astype(int)
print(jobs_with_salary.dtypes)

In [None]:
# Rewrite the salary text as "<min> - <max>" from the cleaned integer columns.
min_text = jobs_with_salary.loc[:, "min_salary"].astype(str)
max_text = jobs_with_salary.loc[:, "max_salary"].astype(str)
jobs_with_salary.loc[:, "salary"] = min_text + " - " + max_text
jobs_with_salary.loc[:, "salary"].head()

In [None]:
jobs_with_salary

In [None]:
# After update
jobs_with_salary[pd.to_numeric(jobs_with_salary["salary"].str.split("-").apply(lambda x: x[0]).str.strip().str.replace(" ", "").values) < 1000]["salary"]

In [None]:
# Offers that state a salary, including the derived min/max columns.
jobs_with_salary.to_csv(r"dane_zlaczone_z_min_max_salary.csv", index=False)
# "z_i_bez_salary" means "with and without salary": export the full offer table.
# The previous version wrote the salary-only frame to this file as well, producing
# two identical files.
# NOTE(review): confirm job_offers is the intended source for this file.
job_offers.to_csv(r"dane_zlaczone_z_i_bez_salary.csv", index=False)

##### Testing section

In [None]:
# test.loc[14, "salary"].strip()

In [None]:
# pd.to_numeric(test.loc[14, "salary"].strip())

In [None]:
# pd.to_numeric(test.loc[14, "salary"].strip()) * 168

In [None]:
# test.loc[14, "salary"] = pd.to_numeric(test.loc[14, "salary"].strip()) * 168
# test.loc[14, "salary"]

In [None]:
# test.loc[14, "salary"] = 25200 
# int(test.loc[14, "salary"])
# test.loc[14, "salary"] = str(test.loc[14, "salary"])
# test.loc[14, "salary"] = test.loc[14, "salary"] + " - " + test.loc[14, "salary"]
# type(test.loc[14, "salary"])
# test.loc[14, "salary"]

In [None]:
# test.loc[[14]]

In [None]:
# test.head()

In [None]:
# test.drop(columns=["max_salary"], inplace=True)

In [None]:
# test["salary"].str.split("-").apply(lambda x: x[1])
# test.insert(5, "max_salary", test["salary"].str.split("-").apply(lambda x: x[1]))

In [None]:
# test.loc[missing_max_salary_indexes, "salary"] 

In [None]:
# test.loc[14]

In [None]:
# test[["min_salary", "max_salary", "salary"]]

#### Group by "experience", "operating_mode", "employment_type"

In [None]:
job_offers.head()

In [None]:
job_offers.groupby(["experience", "operating_mode"])[["employment_type"]].value_counts()

In [None]:
job_offers.groupby(["experience", "operating_mode"])[["employment_type"]].value_counts().reset_index(name="unique_employment_type")

In [None]:
job_offers.groupby(["experience", "operating_mode", "employment_type"])["salary"].value_counts()

In [None]:
jobs_with_salary.groupby(["experience", "operating_mode", "employment_type"])["salary"].value_counts()

In [None]:
jobs_with_salary.groupby(["experience", "operating_mode", "employment_type"])["salary"].value_counts().reset_index()

## Analysis of individual features

#### Salary distribution

In [None]:
# Draw both salary bounds against the offer's position in the table.
for bound_column in ("min_salary", "max_salary"):
    bound_values = jobs_with_salary[bound_column]
    plt.plot(range(len(bound_values)), bound_values)
plt.show()

In [None]:
jobs_with_salary.columns[:11]

In [None]:
jobs_with_salary.groupby("experience")[["min_salary", "max_salary"]].value_counts()

In [None]:
# Print the salary bounds of each experience level under its own banner.
offers_by_experience = jobs_with_salary.groupby("experience")
for level in ("Junior", "Mid", "Senior"):
    print(f"===={level}====")
    print(offers_by_experience.get_group(level)[["min_salary", "max_salary"]])

In [None]:
# One frame of salary bounds per experience level, taken from a single grouping.
grouped_by_experience = jobs_with_salary.groupby("experience")
salary_bounds = ["min_salary", "max_salary"]
junior_salaries = grouped_by_experience.get_group("Junior")[salary_bounds]
mid_salaries = grouped_by_experience.get_group("Mid")[salary_bounds]
senior_salaries = grouped_by_experience.get_group("Senior")[salary_bounds]
junior_salaries["min_salary"]

In [None]:
# Scatter of the lower salary bound, one colour per experience level.
plt.figure(figsize=(14, 8))
plt.title("Job offers with salary")
plt.xlabel("Offers count")
plt.ylabel("Salary")

level_series = (
    (junior_salaries, "red", "Junior"),
    (mid_salaries, "#973678", "Mid"),
    (senior_salaries, "green", "Senior"),
)
for level_salaries, point_color, level_label in level_series:
    plt.scatter(range(len(level_salaries)), level_salaries["min_salary"], color=point_color, label=level_label)

plt.legend()

In [None]:
jobs_with_salary.head()

In [None]:

# Count how often each lower salary bound occurs per
# (experience, operating mode, employment type) combination.
filtered_offers = (
    jobs_with_salary
    .groupby(["experience", "operating_mode", "employment_type"])["min_salary"]
    .value_counts()
    .reset_index()
)
print(filtered_offers.head())
print("===============")

# Senior + Remote offers whose employment type mentions "Permanent".
senior_remote_permanent = (
    (filtered_offers["experience"] == "Senior")
    & (filtered_offers["operating_mode"] == "Remote")
    & (filtered_offers["employment_type"].str.contains("Permanent"))
)
print(filtered_offers[senior_remote_permanent])

In [None]:
filtered_offers[(filtered_offers["experience"]  == "Senior") & (filtered_offers["operating_mode"]  == "Remote") & (filtered_offers["employment_type"].str.contains("Permanent"))]

In [None]:
filtered_offers[(filtered_offers["experience"]  == "Mid") & (filtered_offers["operating_mode"]  == "Remote") & (filtered_offers["employment_type"].str.contains("Permanent"))]

In [None]:
filtered_offers[(filtered_offers["experience"]  == "Mid") & (filtered_offers["operating_mode"]  == "Remote") & (filtered_offers["employment_type"].str.contains("B2B"))]

#### Popularity of technology

In [None]:
job_offers.head()

In [None]:
# Select the technology columns by name instead of by position. The fixed offset 11
# was chosen while "link" was still present; after that column is dropped, every
# later column moves one position left, so iloc[:, 11:] would skip one column.
# NOTE(review): assumes the names below are all the descriptive columns - confirm
# against job_offers.columns.
non_technology_columns = [
    "offer_id",
    "title",
    "company",
    "location",
    "salary",
    "link",
    "type_of_work",
    "experience",
    "employment_type",
    "operating_mode",
    "scraped_at",
]
technologies_cols = job_offers.drop(columns=non_technology_columns, errors="ignore")

In [None]:
technologies_cols.sum()

In [None]:
ten_most_popular_techs = technologies_cols.sum().sort_values(ascending=False).iloc[0:10]
# ten_most_popular_techs.index

In [None]:
plt.bar(ten_most_popular_techs.index, ten_most_popular_techs.values)
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.barh(ten_most_popular_techs.index, ten_most_popular_techs.values)
plt.show()

#### Cities with most offers

In [None]:
# The ten locations with the most offers, largest first.
offers_per_location = job_offers.groupby("location")["location"].count()
cities_offers = offers_per_location.sort_values(ascending=False).head(10)

In [None]:
plt.bar(cities_offers.index, cities_offers.values)
plt.xticks(rotation=45)
plt.show()

In [None]:
plt.barh(cities_offers.index, cities_offers.values)
plt.show()

#### Type of work

In [None]:
job_offers.groupby(["location", "type_of_work"])["type_of_work"].value_counts()

In [None]:
# Offers per type of work across all offers.
# Grouping by "type_of_work" and then counting the same column yields a two-level
# index of (type, type) tuples, which cannot serve as bar labels; a plain
# value_counts() gives the same counts with one label per bar.
jobs_by_type_of_work_without_salary = job_offers["type_of_work"].value_counts()
plt.bar(jobs_by_type_of_work_without_salary.index, jobs_by_type_of_work_without_salary.values)

In [None]:
jobs_with_salary.groupby(["location", "type_of_work"])["type_of_work"].value_counts()

In [None]:
# Offers per type of work among the offers that state a salary.
# A plain value_counts() keeps a flat index (one label per bar); grouping by the
# same column that is being counted would produce (type, type) tuple labels.
jobs_by_type_of_work = jobs_with_salary["type_of_work"].value_counts()
plt.bar(jobs_by_type_of_work.index, jobs_by_type_of_work.values)

In [None]:
# Side-by-side bar charts of offers per type of work: left uses the counts from
# job_offers, right uses the counts from jobs_with_salary.
fig, axes = plt.subplots(nrows=1, ncols=2, figsize=(14, 6))

# (axes, counts, title, extra bar styling) for each panel; the unused marker and
# colour lists from the earlier version were removed.
panels = [
    (axes[0], jobs_by_type_of_work_without_salary, "Jobs types with missing salaries", {}),
    (axes[1], jobs_by_type_of_work, "Jobs types without missing salaries", {"color": "green"}),
]
for panel_ax, type_counts, panel_title, bar_style in panels:
    panel_ax.bar(type_counts.index, type_counts.values, **bar_style)
    panel_ax.set_title(panel_title)
    panel_ax.set_xlabel("Jobs types")
    panel_ax.set_ylabel("Amount")

### Technology correlation

In [None]:
# How many 50-column chunks the columns from position 11 onwards split into.
num_columns = len(job_offers.columns[11:])
full_chunks, leftover_columns = divmod(num_columns, 50)
print(num_columns)
print(num_columns / 50)
print(full_chunks)
print(leftover_columns)

In [None]:
import math

chunk = 50          # technology columns per heatmap
cols_per_row = 2    # heatmaps per grid row

# Pick technology columns by name: the former fixed offset of 11 predates the
# removal of "link", after which all later columns sit one position further left.
# NOTE(review): assumes these names are all the descriptive columns - confirm.
metadata_columns = {
    "offer_id", "title", "company", "location", "salary", "link",
    "type_of_work", "experience", "employment_type", "operating_mode", "scraped_at",
}
all_tech_columns = [col for col in job_offers.columns if col not in metadata_columns]

num_columns = len(all_tech_columns)
num_plots = math.ceil(num_columns / chunk)
print(num_plots)

# At least one grid row, so plt.subplots never receives a zero dimension.
rows = max(1, math.ceil(num_plots / cols_per_row))

# squeeze=False always returns a 2-D array of axes, whatever the grid size.
fig, axes = plt.subplots(rows, cols_per_row, figsize=(cols_per_row * 12, rows * 10), squeeze=False)
axes = axes.flatten()

for i in range(num_plots):
    first = i * chunk
    tech_columns = all_tech_columns[first: first + chunk]
    corr_matrix = job_offers[tech_columns].corr()

    sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5, ax=axes[i])
    # Name the column range so the panels can be told apart.
    axes[i].set_title(
        f"Technology correlation in job offers (columns {first + 1}-{first + len(tech_columns)})"
    )

# Hide the grid cells that received no heatmap.
for j in range(num_plots, len(axes)):
    axes[j].axis("off")

plt.tight_layout()
plt.show()


In [None]:
import math

chunk = 50
start_cols = 11
# NOTE(review): offset 11 is positional; "link" is dropped earlier in the notebook,
# which moves later columns one position left - confirm the offset still matches.
num_columns = len(job_offers.columns[start_cols:])
rest_int_cols = num_columns / chunk
cols_num = math.ceil(rest_int_cols)
print(rest_int_cols)
print(cols_num)

# squeeze=False keeps `axes` a 2-D array even when only one heatmap is needed;
# without it a single Axes object is returned and indexing it fails.
fig, axes = plt.subplots(1, cols_num, figsize=(cols_num * 12, 10), squeeze=False)

# One heatmap per consecutive 50-column chunk, left to right.
for panel_ax in axes[0]:
    tech_columns = job_offers.columns[start_cols: start_cols + chunk]
    tech_df = job_offers[tech_columns]
    corr_matrix = tech_df.corr()
    sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5, ax=panel_ax)
    start_cols = start_cols + chunk

In [None]:
tech_columns = job_offers.columns[11:] 
tech_df = job_offers[tech_columns]
corr_matrix = tech_df.corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5)
plt.title("Technology correlation in job offers")
plt.show()

In [None]:
# Correlation heatmap of the 50 columns with the largest column sums.
tech_columns = job_offers.columns[11:]
column_totals = job_offers[tech_columns].sum().sort_values(ascending=False)
tech_df = column_totals[0:50].index
corr_matrix = job_offers[tech_df].corr()
plt.figure(figsize=(12, 10))
heatmap_ax = sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5)
heatmap_ax.set_title("Technology correlation in job offers")
plt.show()

In [None]:
tech_columns = job_offers.columns[11:] 
tech_df = job_offers[tech_columns].sum().sort_values(ascending=False)[50:100].index
# tech_df = job_offers[tech_columns].sum().sort_values(ascending=False).head(50).index
corr_matrix = job_offers[tech_df].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5)
plt.title("Technology correlation in job offers")
plt.show()

In [None]:
# Correlation heatmap of the columns ranked 151-200 by column sum (largest first).
# NOTE(review): the neighbouring cells plot ranks 1-50 and 51-100; ranks 101-150
# are not plotted anywhere in this notebook - confirm whether [100:150] was intended.
tech_columns = job_offers.columns[11:] 
tech_df = job_offers[tech_columns].sum().sort_values(ascending=False)[150:200].index
# tech_df = job_offers[tech_columns].sum().sort_values(ascending=False).head(50).index
corr_matrix = job_offers[tech_df].corr()
plt.figure(figsize=(12, 10))
sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5)
plt.title("Technology correlation in job offers")
plt.show()

In [None]:
len(job_offers.columns[11:])

In [None]:
tech_columns = job_offers.columns[11:] 
tech_columns
tech_df = job_offers[tech_columns].sum().sort_values(ascending=False)[150:200].index
tech_df
corr_matrix = job_offers[tech_df].corr()
corr_matrix
# plt.figure(figsize=(12, 10))
# sns.heatmap(corr_matrix, cmap="coolwarm", annot=False, linewidths=0.5)
# plt.title("Technology correlation in job offers")
# plt.show()

In [None]:
# Flatten the correlation matrix into one row per (var1, var2) pair.
corr_pairs = corr_matrix.unstack().reset_index()
corr_pairs.columns = ['var1', 'var2', 'corr_value']
# The matrix is symmetric, so every pair appears twice ((A, B) and (B, A)).
# Keeping only var1 < var2 removes the self-pairs and the mirrored duplicates;
# filtering on var1 != var2 alone left each pair listed twice in the top 20.
corr_pairs = corr_pairs[corr_pairs['var1'] < corr_pairs['var2']]
corr_pairs = corr_pairs.sort_values(by='corr_value', ascending=False)
print(corr_pairs.head(20))

In [None]:
corr_pairs[corr_pairs["corr_value"] > 0]

### Write data to csv files

In [None]:
jobs_with_salary.head(10)

In [None]:
len(jobs_with_salary)

In [None]:
len(job_offers)

In [None]:
job_offers.shape

In [None]:
job_offers[:3]

In [None]:
# Write the full offer table to CSV without the index column.
job_offers.to_csv(r"wszystkie_dane_zlaczone_z_i_bez_salary.csv", index=False)  # raw-string prefix (r) before the path

In [None]:
# Write the jobs_with_salary frame to CSV without the index column.
jobs_with_salary.to_csv(r"wszystkie_dane_tylko_z_salary.csv", index=False)  # raw-string prefix (r) before the path