# **Exploratory Data Analysis**

Roles to Explore:

In [None]:
# Importing Libraries
import ast
import pandas as pd
import seaborn as sns
from datasets import load_dataset
import matplotlib.pyplot as plt
import plotly.express as px
import plotly.graph_objects as go 
from plotly.subplots import make_subplots

# Load the job-postings dataset and turn its 'train' split into a DataFrame
dataset = load_dataset('lukebarousse/data_jobs')
df = dataset['train'].to_pandas()


def _parse_skill_list(raw_skills):
    """Evaluate a string-encoded Python literal; missing values pass through unchanged.

    The column presumably stores lists of skill names as text -- verify against the data.
    """
    if pd.isna(raw_skills):
        return raw_skills
    return ast.literal_eval(raw_skills)


# Data cleanup: real datetimes for the posting date, parsed literals for the skills
df['job_posted_date'] = pd.to_datetime(df['job_posted_date'])
df['job_skills'] = df['job_skills'].apply(_parse_skill_list)

Filter for Germany Data Analyst roles

In [None]:
# Keep only Data Analyst postings located in Germany.
# .copy() makes the subset an independent DataFrame, so columns added to it in
# later cells do not trigger pandas' SettingWithCopyWarning.
is_germany = df['job_country'] == 'Germany'
is_data_analyst = df['job_title_short'] == 'Data Analyst'
df_DA_Germany = df[is_germany & is_data_analyst].copy()
df_DA_Germany.head(10)

In [None]:
# Five most frequent raw job locations (value_counts is sorted by frequency)
df_DA_Germany['job_location'].value_counts().iloc[:5]

In [None]:
# Ten most frequent raw job locations as a two-column frame ('job_location', 'count')
df_location_plot = (
    df_DA_Germany['job_location']
    .value_counts()
    .head(10)
    .reset_index()
)

sns.set_theme(style='ticks')
# Passing `palette` without `hue` is deprecated since seaborn 0.13; the supported
# equivalent is to assign the y variable to `hue` and hide the redundant legend.
sns.barplot(
    data=df_location_plot,
    x='count',
    y='job_location',
    hue='job_location',
    palette='dark:g_r',
    legend=False,
)
plt.title('Counts of Job Locations for Data Analyst in Germany')
plt.xlabel('Number of Jobs')
plt.ylabel('')
plt.show()

Cleaning Job Locations

In [None]:
# Rows whose location is exactly the string 'Germany' (not merely containing it)
exact_germany = df_DA_Germany[df_DA_Germany['job_location'] == 'Germany']

# Check whether every exact-'Germany' row is remote; if so, the location
# could simply be renamed to 'Anywhere'
if exact_germany.empty:
    print("No rows with exact 'Germany' in job_location")
else:
    all_remote = exact_germany['job_work_from_home'].all()

    if all_remote:
        print("✓ All exact 'Germany' rows have work_from_home = True")
    else:
        print("✗ Some 'Germany' rows don't have work_from_home = True")
        print("Problematic rows:")
        # Show only the rows that are not flagged as remote
        print(exact_germany[~exact_germany['job_work_from_home']])

Checking the percentage of rows whose job location is exactly 'Germany' and that are not remote (`job_work_from_home` is False)

In [None]:
# Postings whose location is exactly 'Germany' and that are not remote
is_exact_germany = df_DA_Germany['job_location'] == 'Germany'
is_not_remote = ~df_DA_Germany['job_work_from_home']
df_exact_germany_no_remote = df_DA_Germany[is_exact_germany & is_not_remote]

total_rows = len(df_DA_Germany)
# Guard against an empty selection so the cell cannot fail with ZeroDivisionError
germany_no_remote_percentage = (
    (len(df_exact_germany_no_remote) / total_rows) * 100 if total_rows else 0.0
)

print(f"Rows with exact 'Germany' and no remote work: {len(df_exact_germany_no_remote):,}")
print(f"Total rows in dataset Germany - Data Analyst: {total_rows:,}")
print(f"Percentage of jobs with location Germany and no remote work: {germany_no_remote_percentage:.2f}%")

In [None]:
# Masks are taken from the raw columns before the cleaned column is added
is_exact_germany = df_DA_Germany['job_location'] == 'Germany'
is_remote = df_DA_Germany['job_work_from_home']

# .assign returns a new DataFrame, so the column is not written into a filtered
# slice of `df` (which would raise pandas' SettingWithCopyWarning).
# The cleaned location is the text before the first comma of 'job_location'.
df_DA_Germany = df_DA_Germany.assign(
    location_cleaned=df_DA_Germany['job_location'].str.split(',').str[0]
)

# Updating based on the work from home status: a bare 'Germany' location is
# relabelled 'Anywhere' when remote and 'Multiple locations' otherwise
df_DA_Germany.loc[is_exact_germany & is_remote, 'location_cleaned'] = 'Anywhere'
df_DA_Germany.loc[is_exact_germany & ~is_remote, 'location_cleaned'] = 'Multiple locations'

print(df_DA_Germany['location_cleaned'].value_counts().head(10))



In [None]:
# Preview the first ten rows, now including the cleaned location column
df_DA_Germany.iloc[:10]

Plotting with cleaned location data

In [None]:
# Ten most frequent cleaned locations as a ('location_cleaned', 'count') frame
location_counts = df_DA_Germany['location_cleaned'].value_counts()
df_location_plot = location_counts.head(10).reset_index()

# Horizontal bars, shaded and labelled by their count
fig = px.bar(
    df_location_plot,
    y='location_cleaned',
    x='count',
    text='count',
    color='count',
    color_continuous_scale='Greens',
    orientation='h',
    title='Counts of Job Locations for Data Analyst in Germany',
)

# Largest bar on top; axis titles intentionally blank
fig.update_layout(
    xaxis_title='',
    yaxis_title='',
    yaxis=dict(categoryorder='total ascending'),
)

# Print each count just outside the end of its bar
fig.update_traces(textfont={'size': 12}, textposition='outside')

fig.show()

Mapping contract types

In [None]:
# Frequency of every raw schedule-type string before simplification
df_DA_Germany.loc[:, 'job_schedule_type'].value_counts()

In [None]:
# Work on an independent (deep) copy so the new columns below are added to this frame only
df_DA_Germany = df_DA_Germany.copy(deep=True)

# Simplifying schedule types to the main contract categories
def simplify_schedule_type(schedule):
    """Collapse a raw schedule-type value into one contract category.

    The value is lower-cased and checked for keywords in priority order:
    'full-time', then 'part-time', then 'internship', then 'contractor' or
    'temp'. The first match decides the label, so a value that names several
    types gets the highest-priority one.

    Args:
        schedule: Raw schedule-type value; may be missing.

    Returns:
        'Full-time', 'Part-time', 'Internship', 'Contract/Temp' or 'Other';
        None when the input is missing.
    """
    if pd.isna(schedule):
        return None

    lowered = str(schedule).lower()

    # (keywords, label) pairs, listed in matching priority
    keyword_labels = (
        (('full-time',), 'Full-time'),
        (('part-time',), 'Part-time'),
        (('internship',), 'Internship'),
        (('contractor', 'temp'), 'Contract/Temp'),
    )

    for keywords, label in keyword_labels:
        if any(keyword in lowered for keyword in keywords):
            return label

    return 'Other'

# Map every raw schedule type to its simplified category
schedule_labels = df_DA_Germany['job_schedule_type'].apply(simplify_schedule_type)
df_DA_Germany['schedule_simplified'] = schedule_labels

# Report how many postings fall into each simplified category
print(
    "Distribution after simplification:",
    df_DA_Germany['schedule_simplified'].value_counts(),
    sep="\n",
)

In [None]:
# Inspect the raw schedule types that ended up in the 'Other' bucket
is_other = df_DA_Germany['schedule_simplified'].eq('Other')
columns_to_show = ['job_schedule_type', 'schedule_simplified']
filtered_df = df_DA_Germany.loc[is_other, columns_to_show]

print(filtered_df)

In [None]:
# Dropping the 'Other' contract type for Germany since it looks like a data-entry mistake.
# .copy() keeps the result an independent DataFrame, so columns added to it in
# later cells do not trigger pandas' SettingWithCopyWarning.
df_DA_Germany = df_DA_Germany[df_DA_Germany['schedule_simplified'] != 'Other'].copy()


In [None]:
# Creating visualization dataframe
df_viz = (
    df_DA_Germany['schedule_simplified']
    .value_counts()
    .reset_index()
)

fig = px.pie(df_viz,
            values= 'count',
            names= 'schedule_simplified',
            title= 'Type of Contracts in Germany',
            color_discrete_sequence=px.colors.qualitative.Set3
)

fig.update_traces(textposition= 'outside',
                  textinfo= 'percent+label'
)

fig.show()

Checking the number of null values for salaries

In [None]:
# How many postings have no yearly salary figure, next to the total row count
salary_nan = df_DA_Germany['salary_year_avg'].isnull().sum()
print(
    f"Total rows: {len(df_DA_Germany)}",
    f"Null values: {salary_nan}",
    sep="\n",
)

In [None]:
# Adding a boolean column that says whether the job offer states a salary.
# .notna() is the vectorized form of the per-element pd.notna check, and .assign
# returns a new DataFrame, so nothing is written into a filtered slice
# (which would raise pandas' SettingWithCopyWarning).
df_DA_Germany = df_DA_Germany.assign(
    salary_mentioned=df_DA_Germany['salary_year_avg'].notna()
)
df_DA_Germany['salary_mentioned'].value_counts()

In [None]:
# List every column name currently in the frame
print(list(df_DA_Germany.columns))

In [None]:
# Boolean flag columns to summarise, mapped to the title of their donut chart.
# NOTE(review): 'job_no_degree_mention' appears to be True when the posting does
# NOT mention a degree, so its title is phrased to match a 'Yes' = True reading.
dict_column = {
    'job_work_from_home': 'Work From Home',
    'job_no_degree_mention': 'No Degree Mentioned',
    'salary_mentioned': 'Salary Specified'
}

# One pie-type subplot per flag, side by side in a single row
fig = make_subplots(
    rows=1,
    cols=len(dict_column),
    specs=[[{'type': 'pie'} for _ in dict_column]],
    subplot_titles=list(dict_column.values()),
)

for i, column in enumerate(dict_column, 1):
    # value_counts() orders by frequency, so its order cannot be paired with
    # fixed labels. Reindexing pins the order to False, True (filling a missing
    # value with 0) so 'No'/'Yes', the colours and `pull` always line up.
    value_counts = (
        df_DA_Germany[column]
        .value_counts()
        .reindex([False, True], fill_value=0)
    )

    fig.add_trace(
        go.Pie(
            labels=['No', 'Yes'],
            values=value_counts.values,
            hole=0.4,
            textinfo='percent+label',
            textposition='outside',
            marker=dict(
                colors=['#FF6B6B', '#4ECDC4'],
                line=dict(color='white', width=2)
            ),
            pull=[0.05, 0],  # nudge the 'No' slice out of the ring
            rotation=90
        ),
        row=1, col=i
    )

# Style the subplot titles (they are annotations) and lift them above the charts
fig.update_annotations(
    dict(
        font=dict(
            size=16,
            family="Arial Black",
            color="#2E86AB"
        ),
        y=1.07
    )
)

fig.show()

In [None]:
# Ten companies with the most postings as a ('company_name', 'count') frame
company_counts = df_DA_Germany['company_name'].value_counts()
df_company_plot = company_counts.head(10).reset_index()

# Horizontal bars, shaded and labelled by their count
fig = px.bar(
    df_company_plot,
    y='company_name',
    x='count',
    text='count',
    color='count',
    color_continuous_scale='bluyl',
    orientation='h',
    title='Top 10 Companies Hiring Data Analysts in Germany',
)

# Largest bar on top; axis titles intentionally blank
fig.update_layout(
    xaxis_title='',
    yaxis_title='',
    yaxis=dict(categoryorder='total ascending'),
)

# Print each count just outside the end of its bar
fig.update_traces(textfont={'size': 12}, textposition='outside')

fig.show()