## Job Location Analysis

#### **Geographical Trends:** Identify which locations have the highest number of job postings and the most lucrative salaries. What are the top cities for job seekers?


In [2]:
import pandas as pd

# Load the datasets.
# Paths are built with forward slashes: the previous 'updated_data\<file>' form
# relied on unrecognized backslash escapes (e.g. '\p', '\c', '\s'), which newer
# Python versions warn about, and it only resolved to a directory on Windows.
# Forward slashes are accepted on Windows, Linux and macOS alike.
DATA_DIR = 'updated_data'

postings = pd.read_csv(f'{DATA_DIR}/postings.csv')
companies = pd.read_csv(f'{DATA_DIR}/company.csv')
company_industries = pd.read_csv(f'{DATA_DIR}/company_industries.csv')
company_specialities = pd.read_csv(f'{DATA_DIR}/company_specialities.csv')
employee_counts = pd.read_csv(f'{DATA_DIR}/employee_counts.csv')
salaries = pd.read_csv(f'{DATA_DIR}/salaries.csv')
job_skills = pd.read_csv(f'{DATA_DIR}/Job_skills.csv')
job_industries = pd.read_csv(f'{DATA_DIR}/modified_job_industries.csv')


In [8]:
# Cast the postings columns to explicit dtypes in a single astype call.
# Columns are grouped by their target dtype and merged into one mapping.
text_columns = [
    'company_name', 'title', 'description', 'pay_period', 'location',
    'formatted_work_type', 'job_posting_url', 'work_type', 'zip_code',
]
int32_columns = ['job_id', 'company_id', 'views', 'applies', 'fips']

# 'string' is pandas' dedicated text dtype (comparable to VARCHAR).
column_dtypes = {column: 'string' for column in text_columns}
column_dtypes.update({column: 'int32' for column in int32_columns})
column_dtypes['normalized_salary'] = 'float'

postings = postings.astype(column_dtypes)

In [10]:
# Per-location summary: number of postings and the highest normalized salary.
location_groups = postings.groupby('location')
salary_job_stats_by_location = location_groups.agg(**{
    'job_count': pd.NamedAgg(column='job_id', aggfunc='count'),
    'max_salary': pd.NamedAgg(column='normalized_salary', aggfunc='max'),
})
salary_job_stats_by_location

Unnamed: 0_level_0,job_count,max_salary
location,Unnamed: 1_level_1,Unnamed: 2_level_1
"Aberdeen Proving Ground, MD",2,125000.0
"Aberdeen, MD",1,132550.0
"Aberdeen, WA",3,108160.0
"Abilene, TX",8,78821.6
"Abingdon, MD",1,77562.5
...,...,...
"Yuma, AZ",3,123950.0
"Zanesville, OH",2,50835.2
"Zebulon, NC",2,110364.8
"Zelienople, PA",1,97500.0


In [21]:
# Show floats with exactly two decimals (no scientific notation) in pandas output
pd.options.display.float_format = '{:.2f}'.format

In [19]:
# Round the per-location maximum salary to cents and write it back to the column.
rounded_max_salary = salary_job_stats_by_location['max_salary'].round(decimals=2)
salary_job_stats_by_location['max_salary'] = rounded_max_salary
salary_job_stats_by_location

Unnamed: 0_level_0,job_count,max_salary
location,Unnamed: 1_level_1,Unnamed: 2_level_1
"Aberdeen Proving Ground, MD",2,125000.00
"Aberdeen, MD",1,132550.00
"Aberdeen, WA",3,108160.00
"Abilene, TX",8,78821.60
"Abingdon, MD",1,77562.50
...,...,...
"Yuma, AZ",3,123950.00
"Zanesville, OH",2,50835.20
"Zebulon, NC",2,110364.80
"Zelienople, PA",1,97500.00


In [22]:
# Order locations from most to fewest postings.
salary_job_stats_by_location_sorted = salary_job_stats_by_location.sort_values(
    'job_count',
    ascending=False,
)

salary_job_stats_by_location_sorted


Unnamed: 0_level_0,job_count,max_salary
location,Unnamed: 1_level_1,Unnamed: 2_level_1
United States,2556,6900400.00
"New York, NY",1499,925000.00
"Los Angeles, CA",522,415000.00
"Chicago, IL",459,750000.00
New York City Metropolitan Area,446,875000.00
...,...,...
"Wytheville, VA",1,71468.80
"Yadkinville, NC",1,115000.00
"Yolo County, CA",1,57336.00
"York County, PA",1,140000.00


In [26]:
import plotly.express as px
import plotly.graph_objects as go

# Horizontal bar chart of the top 10 locations, with a dropdown that switches
# the ranking metric between maximum salary and number of postings.
# Expects 'salary_job_stats_by_location_sorted' (indexed by location, with
# 'max_salary' and 'job_count' columns) to exist from the cells above.
top_10_salary = salary_job_stats_by_location_sorted.nlargest(10, 'max_salary')
top_10_job_count = salary_job_stats_by_location_sorted.nlargest(10, 'job_count')

# Hover text for each metric
hover_template_salary = '<b>Location</b>: %{y}<br><b>Max Salary</b>: $%{x:.2f}<extra></extra>'
hover_template_job_count = '<b>Location</b>: %{y}<br><b>Job Count</b>: %{x}<extra></extra>'

# Initial view: max salary
fig = px.bar(
    top_10_salary,
    x='max_salary',
    y=top_10_salary.index,
    orientation='h',
    labels={'max_salary': 'Max Salary', 'index': 'Location'},
)

# One dropdown entry per metric: (button label, source frame, column, hover text)
metric_views = [
    ("Max Salary", top_10_salary, 'max_salary', hover_template_salary),
    ("Job Count", top_10_job_count, 'job_count', hover_template_job_count),
]

# With method="update" the first args element restyles the trace and the second
# relayouts the figure, so the x-axis title follows the selected metric.
# Values are converted to plain lists so the button payload is JSON-friendly.
metric_buttons = [
    dict(
        args=[
            {
                'x': [frame[column].tolist()],
                'y': [frame.index.tolist()],
                'type': 'bar',
                'hovertemplate': hover_template,
            },
            {'xaxis.title.text': label},
        ],
        label=label,
        method="update",
    )
    for label, frame, column, hover_template in metric_views
]

fig.update_layout(
    updatemenus=[
        dict(
            buttons=metric_buttons,
            direction="down",
            pad={"r": 10, "t": 20, "b": 40},  # Padding around the dropdown
            showactive=True,
            x=0.17,
            xanchor="left",
            y=1.1,  # Dropdown sits just above the plot area
            yanchor="top"
        ),
    ]
)

# Style the bars and apply the salary hover text to the initial view as well;
# previously the custom hover only appeared after using the dropdown.
fig.update_traces(
    marker_color='indianred',
    marker_line_color='black',
    marker_line_width=1.5,
    hovertemplate=hover_template_salary,
)
fig.update_layout(
    xaxis_title="Max Salary",  # Matches the initially selected metric
    yaxis_title="Location",
    title="Top 10 Locations by Max Salary or Job Count",
    yaxis=dict(categoryorder='total ascending'),
    autosize=False,
    width=900,
    height=700,  # Extra height leaves room for the dropdown
)

# Show the figure
fig.show()


#### **Remote Work Opportunities:** Analyze the growth of remote job postings. How has the percentage of remote jobs changed over time?

In [28]:
# Preview the first five postings (head() defaults to 5 rows)
postings.head()

Unnamed: 0,job_id,company_name,title,description,pay_period,location,company_id,views,formatted_work_type,applies,original_listed_time,remote_allowed,job_posting_url,expiry,listed_time,work_type,normalized_salary,zip_code,fips
0,921716,Corcoran Sawyer Smith,Marketing Coordinator,Job descriptionA leading real estate firm in N...,HOURLY,"Princeton, NJ",2774458,20,Full-time,2,2024-04-17 23:45:08,False,https://www.linkedin.com/jobs/view/921716/?trk...,2024-05-17 23:45:08,2024-04-17 23:45:08,FULL_TIME,38480.0,8540.0,34021
1,10998357,The National Exemplar,Assitant Restaurant Manager,The National Exemplar is accepting application...,YEARLY,"Cincinnati, OH",64896719,8,Full-time,0,2024-04-16 14:26:54,False,https://www.linkedin.com/jobs/view/10998357/?t...,2024-05-16 14:26:54,2024-04-16 14:26:54,FULL_TIME,55000.0,45202.0,39061
2,23221523,"Abrams Fensterman, LLP",Senior Elder Law / Trusts and Estates Associat...,Senior Associate Attorney - Elder Law / Trusts...,YEARLY,"New Hyde Park, NY",766262,16,Full-time,0,2024-04-12 04:23:32,False,https://www.linkedin.com/jobs/view/23221523/?t...,2024-05-12 04:23:32,2024-04-12 04:23:32,FULL_TIME,157500.0,11040.0,36059
3,91700727,Downtown Raleigh Alliance,Economic Development and Planning Intern,Job summary:The Economic Development & Plannin...,HOURLY,"Raleigh, NC",1481176,9,Internship,4,2024-04-18 16:01:39,False,https://www.linkedin.com/jobs/view/91700727/?t...,2024-05-18 16:01:39,2024-04-18 16:01:39,INTERNSHIP,35360.0,27601.0,37183
4,103254301,Raw Cereal,Producer,Company DescriptionRaw Cereal is a creative de...,YEARLY,United States,81942316,7,Contract,1,2024-04-11 18:43:39,True,https://www.linkedin.com/jobs/view/103254301/?...,2024-05-11 18:43:39,2024-04-11 18:43:39,CONTRACT,180000.0,Unknown,0


In [30]:
# Tally postings by their remote_allowed flag
remote_flags = postings['remote_allowed']
remote_allowed_counts = remote_flags.value_counts()
remote_allowed_counts

remote_allowed
False    30809
True      4755
Name: count, dtype: int64

In [54]:
import plotly.express as px
import plotly.graph_objects as go

# Horizontal bar chart of the top 10 locations (dropdown switches between max
# salary and job count), plus two badges showing onsite vs. remote totals.
# Expects 'salary_job_stats_by_location_sorted' and 'postings' from the cells above.
# NOTE(review): the chart construction repeats the earlier location-analysis
# cell; consider factoring it into a shared helper function.
top_10_salary = salary_job_stats_by_location_sorted.nlargest(10, 'max_salary')
top_10_job_count = salary_job_stats_by_location_sorted.nlargest(10, 'job_count')

# Onsite / remote totals are derived from the data instead of being typed in by
# hand, so the badges stay correct when the postings file changes.
# Rows where remote_allowed is missing are counted in neither bucket.
remote_flags = postings['remote_allowed']
remote_counts = {
    'remote_allowed': [False, True],
    'count': [int(remote_flags.eq(False).sum()), int(remote_flags.eq(True).sum())],
}
remote_labels = ['Onsite', 'Remote']
remote_values = [remote_counts['count'][0], remote_counts['count'][1]]

# Hover text for each metric
hover_template_salary = '<b>Location</b>: %{y}<br><b>Max Salary</b>: $%{x:.2f}<extra></extra>'
hover_template_job_count = '<b>Location</b>: %{y}<br><b>Job Count</b>: %{x}<extra></extra>'

# Initial view: max salary
fig = px.bar(
    top_10_salary,
    x='max_salary',
    y=top_10_salary.index,
    orientation='h',
    labels={'max_salary': 'Max Salary', 'index': 'Location'},
)

# One dropdown entry per metric: (button label, source frame, column, hover text)
metric_views = [
    ("Max Salary", top_10_salary, 'max_salary', hover_template_salary),
    ("Job Count", top_10_job_count, 'job_count', hover_template_job_count),
]

# With method="update" the first args element restyles the trace and the second
# relayouts the figure, so the x-axis title follows the selected metric.
# Values are converted to plain lists so the button payload is JSON-friendly.
metric_buttons = [
    dict(
        args=[
            {
                'x': [frame[column].tolist()],
                'y': [frame.index.tolist()],
                'type': 'bar',
                'hovertemplate': hover_template,
            },
            {'xaxis.title.text': label},
        ],
        label=label,
        method="update",
    )
    for label, frame, column, hover_template in metric_views
]

fig.update_layout(
    updatemenus=[dict(
        buttons=metric_buttons,
        direction="down",
        pad={"r": 70, "t": 80, "b": 100},  # Padding around the dropdown
        showactive=True,
        x=0.17,
        xanchor="left",
        y=1.3,  # Dropdown sits above the plot area
        yanchor="top"
    )]
)

# Style the bars and apply the salary hover text to the initial view as well;
# previously the custom hover only appeared after using the dropdown.
fig.update_traces(
    marker_color='indianred',
    marker_line_color='black',
    marker_line_width=1.5,
    hovertemplate=hover_template_salary,
)
fig.update_layout(
    xaxis_title="Max Salary",  # Matches the initially selected metric
    yaxis_title="Location",
    title="Top 10 Locations by Max Salary or Job Count",
    yaxis=dict(categoryorder='total ascending'),
    autosize=False,
    width=900,
    height=700,  # Extra height leaves room for the dropdown
)

# Badge placement and fill colour, in paper coordinates (0 = left, 1 = right).
# Same positions the earlier base-plus-spacing arithmetic produced.
badge_style = {
    'Onsite': dict(x=0.70, color='green'),
    'Remote': dict(x=0.95, color='blue'),
}

# Add one styled annotation box per work arrangement
for label, value in zip(remote_labels, remote_values):
    fig.add_annotation(
        xref='paper',
        yref='paper',
        x=badge_style[label]['x'],
        y=1.2,  # Above the plot area, level with the dropdown
        text=f"<b>{label}</b>: {value}",
        showarrow=False,  # Plain box, no pointer arrow
        bgcolor=badge_style[label]['color'],
        bordercolor='black',
        borderwidth=2,
        borderpad=10,
        font=dict(color='black', size=14, family='Arial'),
        align='center',
        width=100,  # Width of the box
        height=40  # Height of the box
    )

# Enlarge the top margin so the badges and dropdown are not clipped
fig.update_layout(margin=dict(t=120, l=50, b=50, r=50))

# Show the figure
fig.show()
