# Import Libraries



In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer


# Download the NLTK packages this notebook requests, in the same order as before.
for _nltk_package in ('stopwords', 'wordnet', 'punkt'):
    nltk.download(_nltk_package)

from google.colab import drive

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


# Load Dataset


In [2]:
# Mount Google Drive at /content/drive so the CSV path below can be read.
drive.mount('/content/drive')
file_path = '/content/drive/MyDrive/data/Capgemini_Employee_Reviews_from_AmbitionBox.csv'

# Read the CSV at `file_path` into the DataFrame `df`.
df = pd.read_csv(file_path)

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [3]:
# Do not truncate long cell values when a DataFrame is printed.
pd.set_option('display.max_colwidth', None)

# Print the first 10 rows of the 'Likes' and 'Dislikes' columns.
print(df.loc[:, ['Likes', 'Dislikes']].head(10))


                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Likes  \
0                                                                                                                                                                                                                                                                                                                                        Deserved candidates are promoted promptly.\nUnbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

# Text Preprocessing


## Text Cleaning


### Lowercase


In [4]:
# Work on a copy so the original DataFrame `df` is left unchanged.
df_processed = df.copy()

# Lower-case every string value; non-string values pass through unchanged.
for _col in ('Likes', 'Dislikes'):
    df_processed[f'{_col}_cleaned'] = df[_col].apply(
        lambda value: value if not isinstance(value, str) else value.lower()
    )

pd.set_option('display.max_colwidth', None)

# Print the result.
print(df_processed[['Likes_cleaned', 'Dislikes_cleaned']].head(10))


                                                                                                                                                                                                                                                                                                                                                                                                                                                                       Likes_cleaned  \
0                                                                                                                                                                                                                                                                                                                                        deserved candidates are promoted promptly.\nunbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

### Stop Words


In [5]:
import nltk
from nltk.corpus import stopwords

# Download the English stop-word list.
nltk.download('stopwords')

# English stop words, stored as a set for membership checks.
stop_words = set(stopwords.words('english'))

# Remove stop words from a piece of text.
def remove_stopwords(text, stopword_set=None):
    """Drop stop words from a whitespace-delimited string.

    A token is compared case-insensitively and with leading/trailing
    punctuation ignored, so "the." or "And," are recognised as stop words
    even though punctuation has not been stripped from the text yet.
    Tokens that are kept are returned exactly as they appeared.

    Args:
        text: The text to filter. Non-string values are returned unchanged.
        stopword_set: Optional collection of lower-case stop words. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The remaining tokens joined by single spaces, or ``text`` itself
        when it is not a string.
    """
    if not isinstance(text, str):
        return text  # Leave non-string values untouched.
    active_stopwords = stop_words if stopword_set is None else stopword_set
    kept_tokens = [
        token
        for token in text.split()
        # Strip surrounding punctuation only for the comparison.
        if token.lower().strip(string.punctuation) not in active_stopwords
    ]
    return ' '.join(kept_tokens)

# Make every value in the cleaned columns a string before filtering.
# Missing values are replaced with '' first: a bare astype(str) would turn NaN
# into the literal text "nan", which would then be treated as a real word.
for _col in ('Likes_cleaned', 'Dislikes_cleaned'):
    df_processed[_col] = df_processed[_col].fillna('').astype(str).apply(remove_stopwords)

# Print the result.
print(df_processed[['Likes','Dislikes','Likes_cleaned', 'Dislikes_cleaned']].head(10))





                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Likes  \
0                                                                                                                                                                                                                                                                                                                                        Deserved candidates are promoted promptly.\nUnbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


### Remove Punctuation (. & ,)

In [6]:

# Remove periods and commas from a piece of text.
def remove_punctuation(text):
    """Replace each '.' and ',' with a space, then collapse whitespace runs.

    Every run of whitespace (newlines included) becomes a single space.
    Leading or trailing whitespace is reduced to one space, not removed.
    """
    # Substitute a space rather than deleting so "a,b" does not become "ab".
    without_marks = re.sub(r'[.,]', ' ', text)
    return re.sub(r'\s+', ' ', without_marks)


# Apply remove_punctuation to both cleaned text columns of df_processed.
for _col in ('Likes_cleaned', 'Dislikes_cleaned'):
    df_processed[_col] = df_processed[_col].apply(remove_punctuation)

pd.set_option('display.max_colwidth', None)

# Print the result.
print(df_processed[['Likes','Dislikes','Likes_cleaned', 'Dislikes_cleaned']].head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Likes  \
0                                                                                                                                                                                                                                                                                                                                        Deserved candidates are promoted promptly.\nUnbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

### Remove Other Punctuation and Numbers


In [7]:

# Remove every character that is not a letter or whitespace.
def remove_non_alpha(text):
    """Strip ordinals, digits and symbols, leaving only letters and spaces.

    Ordinal tokens such as "1st", "2nd", "3rd" or "10th" are removed as a
    whole. Every remaining character that is not an ASCII letter or
    whitespace is deleted, and whitespace runs are collapsed to one space.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing whitespace is collapsed to a
        single space, not removed.
    """
    # Ordinals must be handled before digits are stripped; afterwards "1st"
    # has already become "st" and can no longer be recognised. Replace with a
    # space so the neighbouring words are not glued together.
    without_ordinals = re.sub(r'\b\d+(?:st|nd|rd|th)\b', ' ', text, flags=re.IGNORECASE)
    letters_only = re.sub(r'[^a-zA-Z\s]', '', without_ordinals)
    return re.sub(r'\s+', ' ', letters_only)

# Apply remove_non_alpha to both cleaned text columns of df_processed.
for _col in ('Likes_cleaned', 'Dislikes_cleaned'):
    df_processed[_col] = df_processed[_col].apply(remove_non_alpha)

# Print the result.
print(df_processed[['Likes','Dislikes','Likes_cleaned', 'Dislikes_cleaned']].head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Likes  \
0                                                                                                                                                                                                                                                                                                                                        Deserved candidates are promoted promptly.\nUnbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

### Slang Words, Abbreviations, and Misspellings


In [8]:
# Words to replace, as regex pattern -> replacement text.
# Each pattern is wrapped in \b so only whole words are matched.
# (The duplicated r'\bsallery\b' entry was removed; a repeated dict key is
# silently overridden by the later occurrence.)
replacements = {
    r'\bcg\b': 'capgemini',
    r'\bnanipulstive\b': 'manipulative',
    r'\bharashful\b': 'harshful',
    r'\bcanspeak\b': 'can speak',
    r'\bsallery\b': 'salary',
    r'\bemloyees\b': 'employees',
    r'\bharashment\b': 'harassment',

    # Add more pattern/replacement pairs here.
}

# Replace words according to a pattern -> replacement mapping.
def replace_words(text, replacements):
    """Apply each regex substitution in `replacements` to `text`, in order.

    Substitutions run sequentially in the mapping's iteration order, so a
    later pattern sees the output of the earlier ones.
    """
    result = text
    for pattern in replacements:
        result = re.sub(pattern, replacements[pattern], result)
    return result

# Apply replace_words, with the `replacements` mapping, to both cleaned columns.
for _col in ('Likes_cleaned', 'Dislikes_cleaned'):
    df_processed[_col] = df_processed[_col].apply(replace_words, args=(replacements,))

# Print the result.
print(df_processed[['Likes','Dislikes','Likes_cleaned', 'Dislikes_cleaned']].head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Likes  \
0                                                                                                                                                                                                                                                                                                                                        Deserved candidates are promoted promptly.\nUnbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

### Lemmatization


In [9]:
def custom_lemmatization(text, exclude_words=None):
    """Lemmatize each whitespace-delimited token of `text`.

    Tokens are passed to ``WordNetLemmatizer.lemmatize`` with no part-of-speech
    argument. Tokens whose lower-cased form is in `exclude_words` are kept
    exactly as written.

    Args:
        text: The string to lemmatize.
        exclude_words: Optional iterable of words to leave untouched. The
            comparison is case-insensitive. Defaults to no exclusions.

    Returns:
        The processed tokens joined by single spaces.
    """
    # Lower-case the exclusions once: each token is lower-cased before the
    # membership check, so a mixed-case exclusion would otherwise never match.
    # A set also makes each lookup constant-time.
    excluded = {word.lower() for word in exclude_words} if exclude_words else set()
    lemmatizer = WordNetLemmatizer()
    return ' '.join(
        token if token.lower() in excluded else lemmatizer.lemmatize(token)
        for token in text.split()
    )

# Apply custom_lemmatization (no exclusions) to both cleaned text columns.
for _col in ('Likes_cleaned', 'Dislikes_cleaned'):
    df_processed[_col] = df_processed[_col].apply(custom_lemmatization)

# Print the result.
print(df_processed[['Likes','Dislikes','Likes_cleaned', 'Dislikes_cleaned']].head(10))

                                                                                                                                                                                                                                                                                                                                                                                                                                                                               Likes  \
0                                                                                                                                                                                                                                                                                                                                        Deserved candidates are promoted promptly.\nUnbiased in providing opportunities to employees, regardless of their gender or any other thing   
1                                                       

In [10]:
# Save the processed DataFrame to a CSV file on the mounted Drive, without the index column.
df_processed.to_csv('/content/drive/MyDrive/preprocessing10.csv', index=False)