In [None]:
import pandas as pd

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so the
# dataset stored under MyDrive can be read by path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Load the JobStreet vacancies CSV (lowongan_jobstreet.csv) from the mounted Drive.
df = pd.read_csv('/content/drive/MyDrive/lowongan_jobstreet.csv')

In [None]:
# Summarize columns, non-null counts and dtypes to see where values are missing.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 555 entries, 0 to 554
Data columns (total 10 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   posisi             555 non-null    object 
 1   perusahaan         555 non-null    object 
 2   kota               535 non-null    object 
 3   provinsi           544 non-null    object 
 4   gaji               77 non-null     float64
 5   tools              555 non-null    object 
 6   pendidikan         419 non-null    object 
 7   pengalaman         377 non-null    float64
 8   deskripsi_lengkap  555 non-null    object 
 9   level              377 non-null    object 
dtypes: float64(2), object(8)
memory usage: 43.5+ KB


In [None]:
# Preview the first rows of the loaded data.
df.head()

Unnamed: 0,posisi,perusahaan,kota,provinsi,gaji,tools,pendidikan,pengalaman,deskripsi_lengkap,level
0,Data Analyst,PT IEKIMTIE MEDIKA WEST,Genteng,Jawa Timur,,SQL,"Industrial Engineering, Informatics Engineering",0.0,Kualifikasi\n:\nS1 Teknik Industri / Informati...,Entry Level
1,Data Analyst/Intelligence,PT Susu Life Indonesia,Jakarta Barat,Jakarta Raya,,"Power BI, Tableau, Excel","Accounting, Economics, Finance",3.0,"Key Responsibilities\nBuild, maintain, and aut...",Mid Level
2,DATA ANALYST,PT Venindo Jaya Abadi,Jakarta Utara,Jakarta Raya,9500000.0,"SPSS, SQL, Python, Power BI, Oracle, Excel",,,Job Desk Data Analyst\nBerikut merupakan Job D...,
3,Data Analyst Associate,PT Solusi Transportasi Indonesia,Jakarta Lainnya,Jakarta Raya,,"SQL, Tableau, Python, Power BI, Oracle","Business Analytics, Computer Science, Data Ana...",5.0,Company Description\nAbout Grab and Our Workpl...,Senior Level
4,Data Analyst Officer for Finance Divison,PT Global Jet Express (Cabang Jakarta),Jakarta Utara,Jakarta Raya,,Excel,,3.0,Responsibilities:\nSupporting Accounting Divis...,Mid Level


# STANDARDISASI TEKS

Standardisasi teks dilakukan untuk memastikan konsistensi kategori pada kolom berbasis string seperti posisi dan lokasi. Proses ini meliputi konversi tipe data, penghapusan spasi berlebih, dan penyamaan format huruf agar tidak terjadi duplikat kategori semu saat analisis frekuensi.

In [None]:
# Standardize the free-text category columns: trim surrounding whitespace and
# title-case them so the same category is not counted under several spellings.
text_columns = ['posisi', 'perusahaan', 'kota', 'provinsi', 'level']
for col in text_columns:
  # Only rewrite cells that actually hold a value. Casting the whole column
  # with astype(str) turns missing values into the literal text "nan"
  # (title-cased to "Nan"), which then shows up as a bogus category in
  # value_counts()/groupby() and is exported to CSV as if it were real data.
  has_value = df[col].notna()
  df.loc[has_value, col] = df.loc[has_value, col].astype(str).str.strip().str.title()
text_columns

['posisi', 'perusahaan', 'kota', 'provinsi', 'level']

# NORMALISASI JOB TITLE

Menyatukan penulisan judul pekerjaan yang sebenarnya sama tetapi ditulis berbeda, contohnya (data analyst, data analyst/intelligence, data analyst associate) yang sebenarnya masih dalam role data analyst. Tujuannya agar perbedaan penulisan tidak memecah satu posisi pekerjaan menjadi beberapa kategori.

In [None]:
# Derive an upper-cased job title and fold known spellings of the same role
# into one canonical label. Only exact, whole-title matches are replaced.
title_aliases = {
    'DATA ANALYST/INTELLIGENCE': 'DATA ANALYST',
    'DATA ANALYST ASSOCIATE': 'DATA ANALYST',
}
df['posisi_clean'] = df['posisi'].str.upper().replace(title_aliases)


In [None]:
# Ten most frequent normalized job titles.
df['posisi_clean'].value_counts().head(10)

Unnamed: 0_level_0,count
posisi_clean,Unnamed: 1_level_1
DATA ANALYST,45
BUSINESS ANALYST,29
SYSTEM ANALYST,12
IT BUSINESS ANALYST,11
DATA ANALYST STAFF,6
DATA ANALYST INTERN,5
CREDIT ANALYST,5
BUSINESS SYSTEM ANALYST,5
SENIOR DATA ANALYST,4
FINANCE ANALYST,4


# HANDLING MISSING VALUES (KOLOM GAJI)

Nilai gaji yang kosong tidak diisi karena mencerminkan kurangnya transparansi perusahaan. Informasi ini dipertahankan untuk menganalisis pola keterbukaan gaji berdasarkan role dan level pekerjaan.

In [None]:
# Count the postings whose salary (gaji) is missing.
df['gaji'].isna().sum()

np.int64(478)

In [None]:
# Flag the rows whose salary (gaji) is NaN.
df['gaji_missing']=df['gaji'].isna()

In [None]:
# Number of rows with and without a salary value.
# True = salary not stated, False = salary stated
df['gaji_missing'].value_counts()

Unnamed: 0_level_0,count
gaji_missing,Unnamed: 1_level_1
True,478
False,77


In [None]:
# Share of postings without a salary per normalized job title, highest first.
df.groupby('posisi_clean')['gaji_missing'].mean().sort_values(ascending=False)
# 1.0 = no posting with that title states a salary; 0.0 = every posting states one

Unnamed: 0_level_0,gaji_missing
posisi_clean,Unnamed: 1_level_1
[6 MONTHS CONTRACT] DATA ANALYST,1.0
(SENIOR) SYSLOG SERVER ANALYST,1.0
2026 ASIA ANALYST DEVELOPMENT PROGRAM - SUMMER ANALYST - INDONESIA,1.0
2026 ASIA ANALYST DEVELOPMENT PROGRAM – SUMMER ANALYST - INDONESIA,1.0
23. MARKETING ANALYST STAFF,1.0
...,...
BUSSINES ANALYST,0.0
BISNIS ANALIS TEKNOLOGI INFORMASI,0.0
BUSINESS ANALYST (MODULE EPROC),0.0
SYSTEM ANALYST AS400 ( INTEGRATION ),0.0


In [None]:
# Show the seniority level next to the missing-salary flag.
df[['level','gaji_missing']]

Unnamed: 0,level,gaji_missing
0,Entry Level,True
1,Mid Level,True
2,Nan,False
3,Senior Level,True
4,Mid Level,True
...,...,...
550,Mid Level,True
551,Nan,True
552,Mid Level,True
553,Entry Level,True


In [None]:
# Fraction of postings without a stated salary for each level.
df.groupby('level')['gaji_missing'].mean()

Unnamed: 0_level_0,gaji_missing
level,Unnamed: 1_level_1
Entry Level,0.893617
Mid Level,0.861925
Nan,0.848315
Senior Level,0.840909


In [None]:
# Percentage of postings per level that do NOT state a salary, highest first.
# NOTE(review): despite the variable name, a higher value here means LESS
# salary transparency (gaji_missing is True when the salary is absent).
missing_share_by_level = df.groupby('level')['gaji_missing'].mean()
tranparansi_level = missing_share_by_level.sort_values(ascending=False) * 100
tranparansi_level

Unnamed: 0_level_0,gaji_missing
level,Unnamed: 1_level_1
Entry Level,89.361702
Mid Level,86.192469
Nan,84.831461
Senior Level,84.090909


Hasil analisis menunjukkan bahwa lowongan Entry Level memiliki tingkat ketidaktransparanan gaji tertinggi (sekitar 89% tidak mencantumkan gaji), sementara Senior Level relatif lebih transparan (sekitar 84% tidak mencantumkan gaji). Meskipun demikian, secara umum transparansi gaji pada lowongan data dan analytics di Indonesia masih tergolong rendah.

# HANDLING MISSING VALUES (PENGALAMAN)

Nilai NaN pada kolom pengalaman akan diisi dengan 0, yang berarti tidak dispesifikasikan / entry-friendly.

In [None]:
# Replace missing years of experience with 0.
# NOTE(review): after this step a posting that stated no requirement can no
# longer be told apart from one that explicitly asks for 0 years.
df['pengalaman'] = df['pengalaman'].fillna(0)
df['pengalaman']

Unnamed: 0,pengalaman
0,0.0
1,3.0
2,0.0
3,5.0
4,3.0
...,...
550,3.0
551,0.0
552,4.0
553,0.0


nilai kosong pada kolom pengalaman diisi dengan 0 untuk merepresentasikan lowongan yang tidak secara eksplisit menyebutkan pengalaman kerja atau bersifat entry-friendly

In [None]:
# Confirm that no missing values remain in pengalaman.
df['pengalaman'].isna().sum()

np.int64(0)

In [None]:
# Percentage of postings per required years of experience, most common first.
# Only the last expression of a cell is displayed, so the cell holds just the
# one table it actually shows.
(df['pengalaman'].value_counts(normalize=True) * 100).round(2)

Unnamed: 0_level_0,proportion
pengalaman,Unnamed: 1_level_1
0.0,38.74
2.0,20.9
3.0,18.74
1.0,10.27
5.0,6.67
4.0,3.42
6.0,0.72
7.0,0.54


Distribusi pengalaman menunjukkan bahwa mayoritas lowongan data dan analytics di Indonesia bersifat entry-friendly (39%, termasuk lowongan yang tidak menyebutkan pengalaman dan diisi dengan 0) dan junior-mid level (2-3 tahun, sekitar 40%). Hal ini mengindikasikan bahwa pasar tenaga kerja data masih terbuka bagi kandidat dengan pengalaman awal.

# HANDLING MISSING VALUES (PENDIDIKAN)

Karena kolom pendidikan memuat jurusan yang beragam, jurusan tersebut akan dikelompokkan ke dalam beberapa kategori.

In [None]:
import re

# Make sure no NaN remains: label missing education values with an explicit
# placeholder text, which map_jurusan below recognizes.
df['pendidikan'] = df['pendidikan'].fillna('TIDAK DISEBUTKAN')

def map_jurusan(text):
    """Map a free-text education/major requirement to one broad category.

    The text is upper-cased and checked against keyword groups in a fixed
    priority order; the first group that matches wins, so a value listing
    several majors is assigned to the earliest matching category.

    Args:
        text: Raw value of the ``pendidikan`` column. Any object is accepted
            and converted with ``str()``.

    Returns:
        One of 'Not Specified', 'IT / Computer Science', 'Engineering',
        'Business / Economics', 'Statistics / Math' or 'Other / General'.
    """
    t = str(text).upper()

    if 'TIDAK DISEBUTKAN' in t:
        return 'Not Specified'
    # "IT" has to match as a whole word: as a bare substring it also hits
    # DIGITAL, ARCHITECTURE, HOSPITALITY, QUANTITATIVE, ... and would label
    # those majors as IT.
    if re.search(r'INFORMATICS|COMPUTER|DATA|INFORMATION SYSTEM|INFORMATION TECHNOLOGY|\bIT\b', t):
        return 'IT / Computer Science'
    if re.search(r'ENGINEERING|INDUSTRIAL|ELECTRICAL|MECHANICAL|CIVIL', t):
        return 'Engineering'
    if re.search(r'ACCOUNTING|ECONOMICS|FINANCE|BUSINESS|MANAGEMENT', t):
        return 'Business / Economics'
    # 'MATH' already covers 'MATHEMATICS'.
    if re.search(r'STATISTICS|MATH', t):
        return 'Statistics / Math'
    # Everything else, including the explicit 'JURUSAN LAINNYA' value,
    # falls into the general bucket.
    return 'Other / General'

# Categorize every posting's education requirement with map_jurusan.
df['kategori_jurusan'] = df['pendidikan'].apply(map_jurusan)


In [None]:
# Number of postings per education category.
df['kategori_jurusan'].value_counts()

Unnamed: 0_level_0,count
kategori_jurusan,Unnamed: 1_level_1
IT / Computer Science,223
Not Specified,136
Business / Economics,90
Other / General,64
Engineering,33
Statistics / Math,9


Hasil analisis menunjukkan bahwa jurusan IT/Computer Science merupakan latar belakang paling dominan.

In [None]:
# Display the DataFrame with the derived columns added so far
# (posisi_clean, gaji_missing, kategori_jurusan).
df

Unnamed: 0,posisi,perusahaan,kota,provinsi,gaji,tools,pendidikan,pengalaman,deskripsi_lengkap,level,posisi_clean,gaji_missing,kategori_jurusan
0,Data Analyst,Pt Iekimtie Medika West,Genteng,Jawa Timur,,SQL,"Industrial Engineering, Informatics Engineering",0.0,Kualifikasi\n:\nS1 Teknik Industri / Informati...,Entry Level,DATA ANALYST,True,IT / Computer Science
1,Data Analyst/Intelligence,Pt Susu Life Indonesia,Jakarta Barat,Jakarta Raya,,"Power BI, Tableau, Excel","Accounting, Economics, Finance",3.0,"Key Responsibilities\nBuild, maintain, and aut...",Mid Level,DATA ANALYST,True,Business / Economics
2,Data Analyst,Pt Venindo Jaya Abadi,Jakarta Utara,Jakarta Raya,9500000.0,"SPSS, SQL, Python, Power BI, Oracle, Excel",TIDAK DISEBUTKAN,0.0,Job Desk Data Analyst\nBerikut merupakan Job D...,Nan,DATA ANALYST,False,Not Specified
3,Data Analyst Associate,Pt Solusi Transportasi Indonesia,Jakarta Lainnya,Jakarta Raya,,"SQL, Tableau, Python, Power BI, Oracle","Business Analytics, Computer Science, Data Ana...",5.0,Company Description\nAbout Grab and Our Workpl...,Senior Level,DATA ANALYST,True,IT / Computer Science
4,Data Analyst Officer For Finance Divison,Pt Global Jet Express (Cabang Jakarta),Jakarta Utara,Jakarta Raya,,Excel,TIDAK DISEBUTKAN,3.0,Responsibilities:\nSupporting Accounting Divis...,Mid Level,DATA ANALYST OFFICER FOR FINANCE DIVISON,True,Not Specified
...,...,...,...,...,...,...,...,...,...,...,...,...,...
550,Business Analyst Supervisor Di Jakarta (Wfo),Pt Msbu Konsultan Indonesia,Jakarta Selatan,Jakarta Raya,,Tidak ada tools spesifik,TIDAK DISEBUTKAN,3.0,Tanggung Jawab Utama:1. Melakukan analisis keb...,Mid Level,BUSINESS ANALYST SUPERVISOR DI JAKARTA (WFO),True,Not Specified
551,Lending Operations Analyst,Pt Sinar Digital Terdepan,Jakarta Lainnya,Jakarta Raya,,Tidak ada tools spesifik,TIDAK DISEBUTKAN,0.0,Skip to content\nJob Application\nBack to All ...,Nan,LENDING OPERATIONS ANALYST,True,Not Specified
552,Senior Business Analyst,Igloo,Jakarta Lainnya,Jakarta Raya,,SQL,Jurusan Lainnya,4.0,About Igloo\nIgloo (formerly Axinan) is the fi...,Mid Level,SENIOR BUSINESS ANALYST,True,Other / General
553,Customer Solutions Analyst,Jendela Group,Jakarta Barat,Jakarta Raya,,Tidak ada tools spesifik,Jurusan Lainnya,0.0,Job Description\nServe as the primary point of...,Entry Level,CUSTOMER SOLUTIONS ANALYST,True,Other / General


# TOOLS yang paling dicari

In [None]:
# Build one row per tool mention: take the non-null tool lists in upper case,
# unify the '|' and ';' separators to ',', then split and trim each entry.
tools_series = df['tools'].dropna().str.upper()
for separator in ('|', ';'):
    tools_series = tools_series.str.replace(separator, ',', regex=False)
tools_exploded = tools_series.str.split(',').explode().str.strip()

In [None]:
# Share (%) of each tool among all tool mentions; show the ten most common.
# NOTE(review): placeholder entries in the tools column (e.g. the
# "no specific tools" text) are counted like real tools here — confirm
# whether they should be excluded from the ranking.
tools_counts = tools_exploded.value_counts(normalize=True)*100
tools_counts.head(10)

Unnamed: 0_level_0,proportion
tools,Unnamed: 1_level_1
EXCEL,18.105616
TIDAK ADA TOOLS SPESIFIK,17.770327
SQL,13.579212
POWER BI,8.549874
TABLEAU,7.963118
PYTHON,6.454317
R,3.688181
LOOKER,2.849958
API,1.676446
ORACLE,1.508801


Analisis di atas menunjukkan bahwa Excel dan SQL merupakan tools yang paling banyak diminta.

# LOKASI

In [None]:
# Clean the city column: trim whitespace and title-case the names.
# NOTE(review): 'kota' is already part of text_columns in the earlier
# standardization loop, so this step looks redundant — confirm.
df['kota'] = df['kota'].str.strip().str.title()

In [None]:
# Percentage of postings per city; show the ten most common.
kota_counts = df['kota'].value_counts(normalize=True)*100
kota_counts.head(10)

Unnamed: 0_level_0,proportion
kota,Unnamed: 1_level_1
Jakarta Lainnya,29.90991
Jakarta Selatan,16.396396
Jakarta Pusat,9.72973
Jakarta Utara,7.927928
Tangerang,5.945946
Jakarta Barat,5.945946
Nan,3.603604
Bandung,2.702703
Jakarta Timur,1.981982
Surabaya,1.801802


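Sebagian besar lowongan data dan analytics terkonsentrasi di wilayah Jakarta: kategori "Jakarta Lainnya" saja mencakup hampir 30%, dan bila seluruh wilayah Jakarta pada tabel di atas digabung porsinya sekitar 72%. Hal ini menunjukkan tingginya sentralisasi industri data di Indonesia. Kota lain seperti Bandung dan Surabaya memiliki peluang, namun masih terbatas.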

In [None]:
# Save the cleaned data to CSV without writing the index column.
df.to_csv("jobstreet_cleaned.csv",index=False)

In [None]:
# Trigger a browser download of the exported CSV from the Colab runtime.
from google.colab import files
files.download("jobstreet_cleaned.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>