In [2]:
import pandas as pd 

In [3]:
# Load the job-postings CSV; the path is relative to the current working directory.
df = pd.read_csv('./india_job_market_dataset.csv')
# Print a summary of the columns (names, non-null counts, dtypes) as a first sanity check.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000 entries, 0 to 19999
Data columns (total 15 columns):
 #   Column                 Non-Null Count  Dtype 
---  ------                 --------------  ----- 
 0   Job ID                 20000 non-null  object
 1   Job Title              20000 non-null  object
 2   Company Name           20000 non-null  object
 3   Job Location           20000 non-null  object
 4   Job Type               20000 non-null  object
 5   Salary Range           20000 non-null  object
 6   Experience Required    20000 non-null  object
 7   Posted Date            20000 non-null  object
 8   Application Deadline   20000 non-null  object
 9   Job Portal             20000 non-null  object
 10  Number of Applicants   20000 non-null  int64 
 11  Education Requirement  20000 non-null  object
 12  Skills Required        20000 non-null  object
 13  Remote/Onsite          20000 non-null  object
 14  Company Size           20000 non-null  object
dtypes: int64(1), object

In [4]:
import pandas as pd
import numpy as np
from datetime import datetime
import re

def clean_salary_range(salary):
    """Turn a free-text salary range into a single representative number.

    The value is stringified and scanned for (optionally decimal) numbers.

    Returns:
        * the midpoint of the first two numbers when at least two are found,
        * the number itself when exactly one is found,
        * ``np.nan`` when the text contains no digits at all.
    """
    # Only the first two numeric tokens ever influence the result.
    tokens = re.findall(r'(\d+(?:\.\d+)?)', str(salary))[:2]
    bounds = [float(token) for token in tokens]

    if not bounds:
        return np.nan
    if len(bounds) == 1:
        return bounds[0]

    lower, upper = bounds
    return (lower + upper) / 2

def analyze_job_market(df):
    """Run every job-market analysis on ``df`` and collect the results.

    The input frame is left untouched: derived columns (``Average_Salary``,
    ``Posted_Date``, ``Application_Deadline``) are added to a separate frame
    that is then handed to each analysis function.

    Args:
        df: DataFrame with at least the ``Salary Range``, ``Posted Date`` and
            ``Application Deadline`` columns, plus whatever columns the
            individual analysis functions read.

    Returns:
        dict mapping an analysis name (``top_paying_jobs``, ``job_trends``,
        ``location_analysis``, ``risky_jobs``, ``skill_demand``) to the object
        returned by the corresponding analysis function.
    """
    # assign() builds a new frame, so the caller's DataFrame is not modified.
    prepared = df.assign(
        Average_Salary=df['Salary Range'].apply(clean_salary_range),
        Posted_Date=pd.to_datetime(df['Posted Date']),
        Application_Deadline=pd.to_datetime(df['Application Deadline']),
    )

    # (result key, analysis function) pairs, evaluated in this order.
    analyses = (
        ('top_paying_jobs', analyze_top_paying_jobs),
        ('job_trends', analyze_job_trends),
        ('location_analysis', analyze_locations),
        ('risky_jobs', analyze_risky_jobs),
        ('skill_demand', analyze_skill_demand),
    )
    return {key: analysis(prepared) for key, analysis in analyses}

def analyze_top_paying_jobs(df):
    """Rank job titles by mean salary, ignoring the rarest titles.

    Args:
        df: DataFrame with ``Job Title``, ``Average_Salary`` and ``Job ID``
            columns.

    Returns:
        DataFrame with columns ``Job Title``, ``Average_Salary`` (mean per
        title) and ``Job ID`` (number of postings per title), limited to the
        ten best-paid titles whose posting count is at or above the 10th
        percentile of posting counts, sorted by salary descending.
    """
    aggregations = {'Average_Salary': 'mean', 'Job ID': 'count'}
    per_title = df.groupby('Job Title').agg(aggregations).reset_index()

    # Titles below the 10th percentile of posting counts are treated as
    # having too few openings to rank.
    openings = per_title['Job ID']
    has_enough_openings = openings >= openings.quantile(0.1)

    ranked = per_title.loc[has_enough_openings].sort_values(
        'Average_Salary', ascending=False
    )
    return ranked.head(10)

def analyze_job_trends(df):
    """Compute the percentage growth in postings per job title.

    Postings are counted per (calendar month, job title). Growth is measured
    between the first and the last month present in the data:
    ``(last - first) / first * 100``.

    Args:
        df: DataFrame with a datetime ``Posted_Date`` column and a
            ``Job Title`` column.

    Returns:
        Series indexed by ``Job Title`` holding the growth percentage, sorted
        descending. Titles with no postings in the first month have no
        defined growth rate and get ``NaN`` (sorted last) instead of ``inf``.
        An empty Series is returned when ``df`` has no rows.

    NOTE(review): the first and last months in the data may be only partially
    covered, which would distort this comparison -- confirm against the
    dataset's date range before drawing conclusions.
    """
    if df.empty:
        # Nothing to pivot; avoid indexing into a frame without columns.
        return pd.Series(dtype=float)

    monthly_counts = (
        df.groupby([df['Posted_Date'].dt.to_period('M'), 'Job Title'])
        .size()
        .reset_index(name='count')
    )

    # Rows: job titles, columns: months (chronological); a missing
    # combination means zero postings that month.
    trends = monthly_counts.pivot(
        index='Job Title',
        columns='Posted_Date',
        values='count'
    ).fillna(0)

    if trends.shape[1] == 0:
        # Every Posted_Date was missing, so there are no months to compare.
        return pd.Series(dtype=float)

    first_month = trends.iloc[:, 0]
    last_month = trends.iloc[:, -1]

    # A zero baseline would produce +/-inf and push the title to the top of
    # the ranking; mark the rate as undefined instead.
    baseline = first_month.replace(0, np.nan)
    growth_rate = (last_month - first_month) / baseline * 100

    return growth_rate.sort_values(ascending=False)

def analyze_locations(df):
    """Summarise salary, volume and competition per job location.

    Args:
        df: DataFrame with ``Job Location``, ``Average_Salary`` and
            ``Number of Applicants`` columns.

    Returns:
        DataFrame indexed by ``Job Location`` with columns ``Avg_Salary``
        (mean salary), ``Job_Count`` (number of non-null salary values) and
        ``Avg_Applicants`` (mean applicants), rounded to two decimals and
        sorted by ``Job_Count`` descending.
    """
    per_location = df.groupby('Job Location').agg(
        Avg_Salary=('Average_Salary', 'mean'),
        # Counted on the salary column, so rows with a missing salary are
        # not included in the count.
        Job_Count=('Average_Salary', 'count'),
        Avg_Applicants=('Number of Applicants', 'mean'),
    )
    rounded = per_location.round(2)
    return rounded.sort_values('Job_Count', ascending=False)

def analyze_risky_jobs(df):
    """Score job titles by how crowded they are relative to their pay.

    Args:
        df: DataFrame with ``Job Title``, ``Number of Applicants`` and
            ``Average_Salary`` columns.

    Returns:
        DataFrame indexed by ``Job Title`` with columns ``Avg_Applicants``,
        ``Job_Count``, ``Avg_Salary`` (each rounded to two decimals) and
        ``Competition_Score``, sorted by score descending. A higher score is
        treated as a riskier title.
    """
    summary = df.groupby('Job Title').agg(
        Avg_Applicants=('Number of Applicants', 'mean'),
        Job_Count=('Number of Applicants', 'count'),
        Avg_Salary=('Average_Salary', 'mean'),
    ).round(2)

    # Score = (mean applicants / number of postings) / mean salary, computed
    # from the already-rounded aggregates.
    applicants_per_posting = summary['Avg_Applicants'] / summary['Job_Count']
    summary['Competition_Score'] = applicants_per_posting / summary['Avg_Salary']

    return summary.sort_values('Competition_Score', ascending=False)

def analyze_skill_demand(df, top_n=20):
    """Count how often each skill appears across all postings.

    ``Skills Required`` holds comma-separated skill lists. Each entry is
    split on commas and every token is stripped of surrounding whitespace, so
    that ``"C++, SQL"`` and ``"SQL, C++"`` contribute to the same ``SQL`` and
    ``C++`` counts rather than to separate ``" SQL"`` / ``"SQL"`` buckets.

    Args:
        df: DataFrame with a ``Skills Required`` column.
        top_n: Maximum number of skills to return (default 20).

    Returns:
        Series mapping skill name to its number of occurrences, most frequent
        first, limited to ``top_n`` entries. Missing values and empty tokens
        (e.g. from a trailing comma) are ignored.
    """
    skills = (
        df['Skills Required']
        .dropna()
        .astype(str)
        .str.split(',')
        .explode()
        .str.strip()
    )
    # Drop blanks produced by trailing or doubled commas.
    skills = skills[skills != '']

    return skills.value_counts().head(top_n)

# Main analysis
def main():
    """Load the job-market CSV, run all analyses and print each result.

    Reads ``./india_job_market_dataset.csv`` relative to the current working
    directory and writes five titled sections to standard output.
    """
    results = analyze_job_market(pd.read_csv('./india_job_market_dataset.csv'))

    # Show the per-title posting count under a reader-friendly column name.
    top_paying = results['top_paying_jobs'][['Job Title', 'Average_Salary', 'Job ID']]
    top_paying = top_paying.rename(columns={'Job ID': 'Number of Openings'})

    # (heading, table) pairs in the order they are printed.
    sections = [
        ("\n=== Top 10 Highest Paying Jobs ===", top_paying),
        ("\n=== Fastest Growing Job Roles ===", results['job_trends'].head(10)),
        ("\n=== Top Job Locations by Opportunity ===", results['location_analysis'].head(10)),
        ("\n=== Jobs to Approach with Caution (High Competition) ===", results['risky_jobs'].head(10)),
        ("\n=== Most In-Demand Skills ===", results['skill_demand']),
    ]
    for heading, table in sections:
        print(heading)
        print(table)

# Run the analysis only when this code is executed directly (not when imported).
if __name__ == "__main__":
    main()


=== Top 10 Highest Paying Jobs ===
                Job Title  Average_Salary  Number of Openings
0        Business Analyst       11.472097                1989
6     Marketing Executive       11.384086                2036
7         Product Manager       11.351446                2006
5              HR Manager       11.347518                1974
3       Financial Analyst       11.311006                2008
2          Data Scientist       11.215840                1976
4        Graphic Designer       11.208644                2013
1  Cyber Security Analyst       11.201199                2085
9       Software Engineer       11.081431                1971

=== Fastest Growing Job Roles ===
Job Title
HR Manager                163.535912
Sales Representative      160.966543
Data Scientist            151.601423
Graphic Designer          151.308901
Product Manager           145.266781
Financial Analyst         141.496599
Marketing Executive       139.899833
Software Engineer         139.827586
Cyb

In [7]:
# Preview the first two rows of the loaded data.
df.head(2)

Unnamed: 0,Job ID,Job Title,Company Name,Job Location,Job Type,Salary Range,Experience Required,Posted Date,Application Deadline,Job Portal,Number of Applicants,Education Requirement,Skills Required,Remote/Onsite,Company Size
0,JOB1,Software Engineer,Amazon,Ahmedabad,Full-time,5-8 LPA,2-5 years,2025-01-16,2025-01-25,LinkedIn,23,PhD,"C++, SQL, Python",Remote,Small (1-50)
1,JOB2,Marketing Executive,Infosys,Ahmedabad,Internship,5-8 LPA,2-5 years,2024-12-25,2025-01-19,Indeed,462,MBA,"SQL, C++, Python",Remote,Large (500+)
