# **Data Cleaning**

## **Import Library**

In [71]:
import pandas as pd
import ast


## **Data Reviewing**

In [72]:
magang_opportunities = pd.read_csv('magang_opportunities.csv')

magang_opportunities.head()

Unnamed: 0,meta,id,name,description,mitra_id,start_period,months_duration,activity_type,location,fields_of_study,...,web_portal,show_salary,mobilization,accommodation,is_applied,wishlist,is_external,external_platform_name,external_platform_logo_url,salary
0,,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer,Kualifikasi : Wajib memiliki laptop sendiri.Ma...,c7746ff1-87de-47bb-a43f-6d63f521e07c,2024-03-01T00:00:00+07:00,3,WFH,Kota Surabaya,[],...,,False,False,False,False,False,False,,,
1,,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern,To help architect team on demonstration of an ...,af00bd20-3ac2-44a4-877f-af4307f9b599,2024-03-01T00:00:00+07:00,6,WFO,Kota Jakarta Selatan,['Teknik Informatika dan Ilmu Komputer'],...,,False,False,False,False,False,False,,,
2,,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern,- Design and develop sharepoint page to suppor...,af00bd20-3ac2-44a4-877f-af4307f9b599,2024-03-01T00:00:00+07:00,6,WFO,Kota Jakarta Selatan,['Teknik Informatika dan Ilmu Komputer'],...,,False,False,False,False,False,False,,,
3,,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance,Pemahaman Sistem Keuangan Sekolah: Anda akan d...,0bf5a4d1-9980-48ae-ad65-48e3b239f15c,2024-03-01T00:00:00+07:00,4,WFO,Kota Banda Aceh,[],...,,False,False,False,False,False,False,,,
4,,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro,WFH dari Seluruh Indonesia.Penelitian Pasar: M...,6bbc729e-c3ee-46a1-93be-2b9f3b5f478d,2024-03-01T00:00:00+07:00,4,WFH,Kota Banda Aceh,[],...,https://forms.gle/7SSUSeRFa6UQT2Fe6,False,False,False,False,False,False,,,


In [77]:
data = pd.DataFrame({
  'id': list(magang_opportunities['id'].values),
  'name': list(magang_opportunities['name'].values),
  'description': list(magang_opportunities['description'].values),
  'skills': list(magang_opportunities['skills'].values)
})

# Convert the "skills" column from string to list for non-null values
data['skills'] = data['skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

# Join the skills into a single string
data['skills'] = data['skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Combine "name," "description," and "skills" into a new column
data['combined_text'] = data['name'] + ' ' + data['description'] + ' ' + data['skills'].astype(str)

data

Unnamed: 0,id,name,description,skills,combined_text
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer,Kualifikasi : Wajib memiliki laptop sendiri.Ma...,"Komunikasi, Kreatif, Pemahaman Industri, Manaj...",Website Developer Kualifikasi : Wajib memiliki...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern,To help architect team on demonstration of an ...,"Komunikasi, Kreatif, Desain Grafis, Coding",Software Architect Intern To help architect te...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern,- Design and develop sharepoint page to suppor...,"Kreatif, Analisis Data, Pemahaman Lingkungan B...",Product Marketing Intern - Design and develop ...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance,Pemahaman Sistem Keuangan Sekolah: Anda akan d...,"Administrasi, Operasional, Komunikasi, Kreatif",Asisten Finance Pemahaman Sistem Keuangan Seko...
4,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro,WFH dari Seluruh Indonesia.Penelitian Pasar: M...,"Operasional, Kreatif, Administrasi, Komunikasi",Marketing Asuransi Mini Mikro WFH dari Seluru...
...,...,...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,Marketing Creative Intern,Responsibilities \n\nSupport Marketing and Bra...,,Marketing Creative Intern Responsibilities \n\...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,Graphic Design (Intern),Kami Idea merupakan sebuah usaha atau bisnis y...,,Graphic Design (Intern) Kami Idea merupakan se...
679,5a704786-6865-40c9-9086-a2bd85dc5098,Influencer Management Intern,- Last semester student from reputable univers...,,Influencer Management Intern - Last semester s...
680,780902fd-e51a-463c-a012-e7a0941da47e,New Business Development Internship (Jakarta U...,- Final year student or fresh graduate \n- Pas...,,New Business Development Internship (Jakarta U...


In [78]:
data['combined_text'][1]

'Software Architect Intern To help architect team on demonstration of an improvement idea in which work is focused on determining whether an idea can be turned into a reality.The improvement might include:- Test Optimization- Test Automation- Framework usage e.g Test framework or design Komunikasi, Kreatif, Desain Grafis, Coding'

## **Case Folding**

In [112]:
result_case_folding_data = data['combined_text'].apply(lambda x: x.lower() if x == 'object' else x)
result_case_folding_data

0      Website Developer Kualifikasi : Wajib memiliki...
1      Software Architect Intern To help architect te...
2      Product Marketing Intern - Design and develop ...
3      Asisten Finance Pemahaman Sistem Keuangan Seko...
4      Marketing Asuransi Mini Mikro  WFH dari Seluru...
                             ...                        
677    Marketing Creative Intern Responsibilities \n\...
678    Graphic Design (Intern) Kami Idea merupakan se...
679    Influencer Management Intern - Last semester s...
680    New Business Development Internship (Jakarta U...
681    Finance & Accounting Intern Kualifikasi :\n- M...
Name: combined_text, Length: 682, dtype: object