# **Data Cleaning**

## **Data Reviewing**

In [1]:
import pandas as pd

In [2]:
magang_opportunities = pd.read_csv('magang_opportunities.csv')

magang_opportunities.head()

Unnamed: 0,meta,id,name,description,mitra_id,start_period,months_duration,activity_type,location,fields_of_study,...,show_salary,mobilization,accommodation,is_applied,wishlist,is_external,external_platform_name,external_platform_logo_url,mitra_name,salary
0,,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer,<p><strong>Kualifikasi :</strong>&nbsp;</p><ul...,c7746ff1-87de-47bb-a43f-6d63f521e07c,2024-03-01T00:00:00+07:00,3,WFH,Kota Surabaya,[],...,False,False,False,False,False,False,,,CV. APPAREL BERKAH SELALU,
1,,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern,<p>To help architect team on demonstration of ...,af00bd20-3ac2-44a4-877f-af4307f9b599,2024-03-01T00:00:00+07:00,6,WFO,Kota Jakarta Selatan,['Teknik Informatika dan Ilmu Komputer'],...,False,False,False,False,False,False,,,PT. Idemia Technologies Indonesia,
2,,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern,<p>- Design and develop sharepoint page to sup...,af00bd20-3ac2-44a4-877f-af4307f9b599,2024-03-01T00:00:00+07:00,6,WFO,Kota Jakarta Selatan,['Teknik Informatika dan Ilmu Komputer'],...,False,False,False,False,False,False,,,PT. Idemia Technologies Indonesia,
3,,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance,<ol><li>Pemahaman Sistem Keuangan Sekolah: And...,0bf5a4d1-9980-48ae-ad65-48e3b239f15c,2024-03-01T00:00:00+07:00,4,WFO,Kota Banda Aceh,[],...,False,False,False,False,False,False,,,SMAN 14 Banda Aceh,
4,,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro,<p>WFH dari Seluruh Indonesia.</p><p><br></p><...,6bbc729e-c3ee-46a1-93be-2b9f3b5f478d,2024-03-01T00:00:00+07:00,4,WFH,Kota Banda Aceh,[],...,False,False,False,False,False,False,,,Natural Aceh,


In [3]:
data = pd.DataFrame({
  'id': list(magang_opportunities['id'].values),
  'name': list(magang_opportunities['name'].values),
  'description': list(magang_opportunities['description'].values),
  'skills': list(magang_opportunities['skills'].values)
})

data

Unnamed: 0,id,name,description,skills
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer,<p><strong>Kualifikasi :</strong>&nbsp;</p><ul...,"['Komunikasi', 'Kreatif', 'Pemahaman Industri'..."
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern,<p>To help architect team on demonstration of ...,"['Komunikasi', 'Kreatif', 'Desain Grafis', 'Co..."
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern,<p>- Design and develop sharepoint page to sup...,"['Kreatif', 'Analisis Data', 'Pemahaman Lingku..."
3,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance,<ol><li>Pemahaman Sistem Keuangan Sekolah: And...,"['Administrasi', 'Operasional', 'Komunikasi', ..."
4,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro,<p>WFH dari Seluruh Indonesia.</p><p><br></p><...,"['Operasional', 'Kreatif', 'Administrasi', 'Ko..."
...,...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,Marketing Creative Intern,Responsibilities \n\nSupport Marketing and Bra...,
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,Graphic Design (Intern),Kami Idea merupakan sebuah usaha atau bisnis y...,
679,5a704786-6865-40c9-9086-a2bd85dc5098,Influencer Management Intern,- Last semester student from reputable univers...,
680,780902fd-e51a-463c-a012-e7a0941da47e,New Business Development Internship (Jakarta U...,- Final year student or fresh graduate \n- Pas...,


## **Combine Text**

In [4]:
import ast

In [5]:
# Convert the "skills" column from string to list for non-null values
data['skills'] = data['skills'].apply(lambda x: ast.literal_eval(x) if pd.notna(x) else x)

# Join the skills into a single string
data['skills'] = data['skills'].apply(lambda x: ', '.join(x) if isinstance(x, list) else x)

# Combine "name," "description," and "skills" into a new column
data['combined_text'] = data['name'] + ' ' + data['description'] + ' ' + data['skills'].astype(str)

data['combined_text']

0      Website Developer <p><strong>Kualifikasi :</st...
1      Software Architect Intern <p>To help architect...
2      Product Marketing Intern <p>- Design and devel...
3      Asisten Finance <ol><li>Pemahaman Sistem Keuan...
4      Marketing Asuransi Mini Mikro  <p>WFH dari Sel...
                             ...                        
677    Marketing Creative Intern Responsibilities \n\...
678    Graphic Design (Intern) Kami Idea merupakan se...
679    Influencer Management Intern - Last semester s...
680    New Business Development Internship (Jakarta U...
681    Finance & Accounting Intern Kualifikasi :\n- M...
Name: combined_text, Length: 682, dtype: object

## **Case Folding**

In [6]:
data['result_case_folding_data'] = data['combined_text'].apply(lambda x: x.casefold())
data[['id', 'combined_text','result_case_folding_data']]

Unnamed: 0,id,combined_text,result_case_folding_data
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,Website Developer <p><strong>Kualifikasi :</st...,website developer <p><strong>kualifikasi :</st...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,Software Architect Intern <p>To help architect...,software architect intern <p>to help architect...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,Product Marketing Intern <p>- Design and devel...,product marketing intern <p>- design and devel...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,Asisten Finance <ol><li>Pemahaman Sistem Keuan...,asisten finance <ol><li>pemahaman sistem keuan...
4,b6062e51-4764-474c-a2b7-617affd56959,Marketing Asuransi Mini Mikro <p>WFH dari Sel...,marketing asuransi mini mikro <p>wfh dari sel...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,Marketing Creative Intern Responsibilities \n\...,marketing creative intern responsibilities \n\...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,Graphic Design (Intern) Kami Idea merupakan se...,graphic design (intern) kami idea merupakan se...
679,5a704786-6865-40c9-9086-a2bd85dc5098,Influencer Management Intern - Last semester s...,influencer management intern - last semester s...
680,780902fd-e51a-463c-a012-e7a0941da47e,New Business Development Internship (Jakarta U...,new business development internship (jakarta u...


## **Remove Html Tags**

In [7]:
from lxml import etree

In [10]:
def remove_html_tags(text):
    parser = etree.HTMLParser()
    tree = etree.fromstring(text, parser)
    return etree.tostring(tree, encoding='unicode', method='text')

data['result_remove_html_tags'] = data['result_case_folding_data'].apply(remove_html_tags)
data[['id', 'result_case_folding_data' ,'result_remove_html_tags']]

Unnamed: 0,id,result_case_folding_data,result_remove_html_tags
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer <p><strong>kualifikasi :</st...,website developer kualifikasi : wajib memiliki...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern <p>to help architect...,software architect intern to help architect te...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern <p>- design and devel...,product marketing intern - design and develop ...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance <ol><li>pemahaman sistem keuan...,asisten finance pemahaman sistem keuangan seko...
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro <p>wfh dari sel...,marketing asuransi mini mikro wfh dari seluru...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities \n\...,marketing creative intern responsibilities \n\...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design (intern) kami idea merupakan se...,graphic design (intern) kami idea merupakan se...
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern - last semester s...,influencer management intern - last semester s...
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship (jakarta u...,new business development internship (jakarta u...


## **Remove Unicode Character**

In [11]:
import re

In [18]:
data['result_remove_unicode_character'] = data['result_remove_html_tags'].apply(lambda x: re.sub(r"(@\[A-Za-z0-9]+)|([^0-9A-Za-z \t])|(\w+:\/\/\S+)|^rt|http.+?", "", x))
data[['id', 'result_remove_html_tags', 'result_remove_unicode_character']]

Unnamed: 0,id,result_remove_html_tags,result_remove_unicode_character
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer kualifikasi : wajib memiliki...,website developer kualifikasi wajib memiliki l...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern to help architect te...,software architect intern to help architect te...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern - design and develop ...,product marketing intern design and develop s...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance pemahaman sistem keuangan seko...,asisten finance pemahaman sistem keuangan seko...
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro wfh dari seluru...,marketing asuransi mini mikro wfh dari seluru...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities \n\...,marketing creative intern responsibilities sup...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design (intern) kami idea merupakan se...,graphic design intern kami idea merupakan sebu...
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern - last semester s...,influencer management intern last semester st...
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship (jakarta u...,new business development internship jakarta ut...


## **Remove Stopwords**

In [16]:
import nltk.corpus
nltk.download('stopwords')
from nltk.corpus import stopwords

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\aszay\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


In [19]:
indo_stopwords = stopwords.words('indonesian')
eng_stopwords = stopwords.words('english')

list_stopwords = indo_stopwords + eng_stopwords

In [20]:
def remove_stopwords(text):
    text = [word for word in text.split() if word not in list_stopwords]
    return " ".join(text)

data['result_remove_stopwords'] = data['result_remove_unicode_character'].apply(remove_stopwords)
data[['id', 'result_remove_unicode_character', 'result_remove_stopwords']]

Unnamed: 0,id,result_remove_unicode_character,result_remove_stopwords
0,b794e1a1-ecd6-449d-9ed9-0118a2e02626,website developer kualifikasi wajib memiliki l...,website developer kualifikasi wajib memiliki l...
1,7c8623ef-5f9b-470d-bc7a-73b1dd706c8b,software architect intern to help architect te...,software architect intern help architect team ...
2,bd95c730-b157-4fc5-8791-9dc9ea5e0e32,product marketing intern design and develop s...,product marketing intern design develop sharep...
3,fc01895c-0d74-4874-a8f2-d1d46956d697,asisten finance pemahaman sistem keuangan seko...,asisten finance pemahaman sistem keuangan seko...
4,b6062e51-4764-474c-a2b7-617affd56959,marketing asuransi mini mikro wfh dari seluru...,marketing asuransi mini mikro wfh indonesiapen...
...,...,...,...
677,7a2f6ef1-844d-4c70-99bc-03be7d5aac88,marketing creative intern responsibilities sup...,marketing creative intern responsibilities sup...
678,404ccd37-0c1e-43ab-bdfd-7b9a6fee551c,graphic design intern kami idea merupakan sebu...,graphic design intern idea usaha bisnis berger...
679,5a704786-6865-40c9-9086-a2bd85dc5098,influencer management intern last semester st...,influencer management intern last semester stu...
680,780902fd-e51a-463c-a012-e7a0941da47e,new business development internship jakarta ut...,new business development internship jakarta ut...
