# Preparation

In [1]:
# Sources: 
# https://medium.com/@ksnugroho/dasar-text-preprocessing-dengan-python-a4fa52608ffe 
# https://www.datacamp.com/community/tutorials/text-analytics-beginners-nltk
# https://stackoverflow.com/questions/64719706/cleaning-twitter-data-pandas-python

import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [2]:
df = pd.read_csv('data_sesi29.csv')

In [3]:
df.head()

Unnamed: 0,umur,jenis_kelamin,pekerjaan,gaji,is_menikah,berat,tinggi,is_merokok,pendidikan,ever_stress
0,27.0,Perempuan,PNS,7957452.757,1.0,54.315053,170.428542,1.0,S1,1
1,53.0,Perempuan,PNS,7633002.755,1.0,72.873404,165.530097,0.0,S1,1
2,37.0,Perempuan,Pegawai swasta,6637624.864,1.0,46.321533,154.599388,0.0,S1,1
3,36.0,Perempuan,Pengangguran,3624871.391,1.0,51.539781,167.340481,1.0,SD,1
4,38.0,Laki-laki,Freelance,6031807.52,1.0,60.726909,165.514773,1.0,S2,0


In [4]:
df.columns

Index(['umur', 'jenis_kelamin', 'pekerjaan', 'gaji', 'is_menikah', 'berat',
       'tinggi', 'is_merokok', 'pendidikan', 'ever_stress'],
      dtype='object')

In [8]:
df = df.fillna(0)

# Feature Engineering

In [9]:
df['kualitas_gaji'] = df['gaji']/(df['ever_stress'] + 1)
df['gaji_per_umur'] = df['gaji']/(df['umur'] + 2)
df['ratio_tinggi_berat'] = df['tinggi']/(df['berat']+1)

In [10]:
from sklearn.preprocessing import PolynomialFeatures

In [11]:
polynom = PolynomialFeatures(degree=2)
df_transform = polynom.fit_transform(df.select_dtypes('number'))
df_transform = pd.DataFrame(df_transform,columns = polynom.get_feature_names())

In [12]:
df_transform

Unnamed: 0,1,x0,x1,x2,x3,x4,x5,x6,x7,x8,...,x6^2,x6 x7,x6 x8,x6 x9,x7^2,x7 x8,x7 x9,x8^2,x8 x9,x9^2
0,1.0,27.0,7.957453e+06,1.0,54.315053,170.428542,1.0,1.0,3.978726e+06,274394.922655,...,1.0,3.978726e+06,274394.922655,3.081052,1.583026e+13,1.091742e+12,1.225866e+07,7.529257e+10,845424.971029,9.492880
1,1.0,53.0,7.633003e+06,1.0,72.873404,165.530097,0.0,1.0,3.816501e+06,138781.868273,...,1.0,3.816501e+06,138781.868273,2.240727,1.456568e+13,5.296612e+11,8.551736e+06,1.926041e+10,310972.216717,5.020855
2,1.0,37.0,6.637625e+06,1.0,46.321533,154.599388,0.0,1.0,3.318812e+06,170195.509333,...,1.0,3.318812e+06,170195.509333,3.266999,1.101452e+13,5.648470e+11,1.084256e+07,2.896651e+10,556028.516448,10.673281
3,1.0,36.0,3.624871e+06,1.0,51.539781,167.340481,1.0,1.0,1.812436e+06,95391.352395,...,1.0,1.812436e+06,95391.352395,3.185024,3.284923e+12,1.728907e+11,5.772652e+06,9.099510e+09,303823.779503,10.144380
4,1.0,38.0,6.031808e+06,1.0,60.726909,165.514773,1.0,0.0,6.031808e+06,150795.188000,...,0.0,0.000000e+00,0.000000,0.000000,3.638270e+13,9.095675e+11,1.617371e+07,2.273919e+10,404342.801896,7.189927
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7912,1.0,32.0,6.024409e+06,1.0,44.432438,154.578859,0.0,1.0,3.012205e+06,177188.507294,...,1.0,3.012205e+06,177188.507294,3.402390,9.073377e+12,5.337280e+11,1.024869e+07,3.139577e+10,602864.347402,11.576255
7913,1.0,34.0,1.007043e+07,1.0,49.389914,158.782726,1.0,1.0,5.035213e+06,279734.050278,...,1.0,5.035213e+06,279734.050278,3.151082,2.535337e+13,1.408520e+12,1.586637e+07,7.825114e+10,881464.801678,9.929315
7914,1.0,58.0,8.010815e+06,1.0,54.351968,154.478087,1.0,1.0,4.005407e+06,133513.576900,...,1.0,4.005407e+06,133513.576900,2.790833,1.604329e+13,5.347763e+11,1.117842e+07,1.782588e+10,372614.068162,7.788748
7915,1.0,30.0,9.059906e+06,1.0,57.646930,163.377717,0.0,1.0,4.529953e+06,283122.066844,...,1.0,4.529953e+06,283122.066844,2.785785,2.052047e+13,1.282530e+12,1.261947e+07,8.015810e+10,788717.106917,7.760596


In [13]:
X = df.drop(['ever_stress'],1)
y = df['ever_stress']

# Train Test Split

In [14]:
from sklearn.model_selection import train_test_split
X_train,X_test,y_train,y_test = train_test_split(X,y,test_size = 0.3,stratify=y,random_state = 123)

In [15]:
y_train.value_counts()

1    3881
0    1660
Name: ever_stress, dtype: int64

# Imbalanced Dataset

In [16]:
pip install imblearn

Collecting imblearn
  Downloading imblearn-0.0-py2.py3-none-any.whl (1.9 kB)
Collecting imbalanced-learn
  Downloading imbalanced_learn-0.8.0-py3-none-any.whl (206 kB)
[K     |████████████████████████████████| 206 kB 1.3 MB/s eta 0:00:01
Collecting scikit-learn>=0.24
  Downloading scikit_learn-0.24.2-cp37-cp37m-macosx_10_13_x86_64.whl (7.2 MB)
[K     |████████████████████████████████| 7.2 MB 286 kB/s eta 0:00:01
Collecting threadpoolctl>=2.0.0
  Downloading threadpoolctl-2.2.0-py3-none-any.whl (12 kB)
Installing collected packages: threadpoolctl, scikit-learn, imbalanced-learn, imblearn
  Attempting uninstall: scikit-learn
    Found existing installation: scikit-learn 0.22.1
    Uninstalling scikit-learn-0.22.1:
      Successfully uninstalled scikit-learn-0.22.1
Successfully installed imbalanced-learn-0.8.0 imblearn-0.0 scikit-learn-0.24.2 threadpoolctl-2.2.0
Note: you may need to restart the kernel to use updated packages.


In [16]:
from imblearn.under_sampling import RandomUnderSampler
undersampling = RandomUnderSampler()
X_under, y_under = undersampling.fit_resample(X_train,y_train)


In [17]:
from imblearn.over_sampling import RandomOverSampler
oversampling = RandomOverSampler()
X_over, y_over = oversampling.fit_resample(X_train,y_train)


In [18]:
X_dummy = pd.get_dummies(X_train)
X_dummy = X_dummy.fillna(0)

In [19]:
from imblearn.over_sampling import SMOTE
smote = SMOTE()
X_smote, y_smote = smote.fit_resample(X_dummy,y_train)


In [20]:
y_under.value_counts()

1    1660
0    1660
Name: ever_stress, dtype: int64

In [21]:
y_over.value_counts()

1    3881
0    3881
Name: ever_stress, dtype: int64

In [22]:
y_smote.value_counts()

1    3881
0    3881
Name: ever_stress, dtype: int64

# Handling Text Data

## Cleaning

In [23]:
import nltk
nltk.download ('stopwords')
nltk.download ('punkt')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ahmadwaliradhi/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/ahmadwaliradhi/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [24]:
kalimat = "Berikut ini adalah 5 negara dengan pendidikan terbaik di dunia adalah Korea Selatan, Jepang, Singapura, Hong Kong, dan Finlandia."
lower_case = kalimat.lower()
print(lower_case)

berikut ini adalah 5 negara dengan pendidikan terbaik di dunia adalah korea selatan, jepang, singapura, hong kong, dan finlandia.


In [25]:
import re # impor modul regular expression
kalimat = "Berikut ini adalah 5 negara dengan pendidikan terbaik di dunia adalah Korea Selatan, Jepang, Singapura, Hong Kong, dan Finlandia."
hasil = re.sub(r"\d+", "", kalimat)
print(hasil)

Berikut ini adalah  negara dengan pendidikan terbaik di dunia adalah Korea Selatan, Jepang, Singapura, Hong Kong, dan Finlandia.


In [27]:
pip install inflect

Collecting inflect
  Downloading inflect-5.3.0-py3-none-any.whl (32 kB)
Installing collected packages: inflect
Successfully installed inflect-5.3.0
Note: you may need to restart the kernel to use updated packages.


In [28]:
import inflect
kalimat = "i have 3 shoes and 10 shirts"
p = inflect.engine()
hasil = []
for i in kalimat.split():
  if i.isnumeric():
    i = p.number_to_words(i)
  hasil.append(i)
hasil = " ".join(hasil)
print(hasil)

i have three shoes and ten shirts


In [29]:
import string
kalimat = "Ini &adalah [contoh] kalimat? {dengan} tanda. baca?!!"
hasil = kalimat.translate(str.maketrans("","",string.punctuation))
print(hasil)

Ini adalah contoh kalimat dengan tanda baca


In [30]:
kalimat = "halo @kawanku apa kabar? #pertemanan"
print(re.sub("#[A-Za-z0-9]+","",kalimat)) #tanpa hashtag
print(re.sub("@[A-Za-z0-9]+","",kalimat)) #tanpa mention

halo @kawanku apa kabar? 
halo  apa kabar? #pertemanan


In [31]:
kalimat = "bisa cek disini http://www.google.com gan"
print(re.sub(r"(?:\@|http?\://|https?\://|www)\S+", "", kalimat)) #Remove http links

bisa cek disini  gan


In [32]:
kalimat = " \t ini kalimat contoh\t "
hasil = kalimat.strip()
print(hasil)

ini kalimat contoh


## Tokenizing

In [33]:
kalimat = "rumah idaman adalah rumah yang bersih"
pisah = kalimat.split()
print(pisah)

['rumah', 'idaman', 'adalah', 'rumah', 'yang', 'bersih']


In [34]:
# import word_tokenize dari modul nltk
from nltk.tokenize import word_tokenize 
 
kalimat = "Andi kerap melakukan transaksi rutin secara daring atau online."
 
tokens = nltk.tokenize.word_tokenize(kalimat)
print(tokens)

['Andi', 'kerap', 'melakukan', 'transaksi', 'rutin', 'secara', 'daring', 'atau', 'online', '.']


In [35]:
# import sent_tokenize dari modul nltk
from nltk.tokenize import sent_tokenize
kalimat = "Andi kerap melakukan transaksi rutin secara daring atau online. Menurut Andi belanja online lebih praktis & murah."
 
tokens = nltk.tokenize.sent_tokenize(kalimat)
print(tokens)

['Andi kerap melakukan transaksi rutin secara daring atau online.', 'Menurut Andi belanja online lebih praktis & murah.']


In [36]:
from nltk.corpus import stopwords
 
kalimat = "Andi kerap melakukan transaksi rutin secara daring atau online. Menurut Andi belanja online lebih praktis & murah."
kalimat = kalimat.translate(str.maketrans('','',string.punctuation)).lower()
 
tokens = word_tokenize(kalimat)
listStopword =  set(stopwords.words('indonesian'))
 
removed = []
for t in tokens:
    if t not in listStopword:
        removed.append(t)
 
print(removed)


['andi', 'kerap', 'transaksi', 'rutin', 'daring', 'online', 'andi', 'belanja', 'online', 'praktis', 'murah']


In [37]:
list(listStopword)[10:20]

['seterusnya',
 'diantara',
 'pula',
 'serupa',
 'diberi',
 'siapakah',
 'ditanya',
 'setengah',
 'menyangkut',
 'kelihatannya']

In [41]:
pip install Sastrawi

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[K     |████████████████████████████████| 209 kB 1.4 MB/s eta 0:00:01
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1
Note: you may need to restart the kernel to use updated packages.


In [42]:
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()
 
kalimat = "Andi kerap melakukan transaksi rutin secara daring atau online. Menurut Andi belanja online lebih praktis & murah."
hasil = stemmer.stem(kalimat)
print(hasil)

andi kerap laku transaksi rutin cara daring atau online turut andi belanja online lebih praktis murah


#Vectorization (Bag of Words and TF-IDF)

In [43]:
dt = ["We love dogs! :*", 
      "We hate dogs and knitting :(", 
      "Knitting is our hobby and passion :)"]
dt = pd.DataFrame(dt, columns = ["text"])

In [44]:
dt

Unnamed: 0,text
0,We love dogs! :*
1,We hate dogs and knitting :(
2,Knitting is our hobby and passion :)


In [45]:
def cleaning_tokenizing(txt):
  txt = txt.lower() # menjadi huruf kecil semua
  txt = txt.translate(str.maketrans("","",string.punctuation)) #hapus symbol
  txt = nltk.tokenize.word_tokenize(txt) # tokenize
  return txt
dt["text"].apply(cleaning_tokenizing)

0                            [we, love, dogs]
1             [we, hate, dogs, and, knitting]
2    [knitting, is, our, hobby, and, passion]
Name: text, dtype: object

In [46]:
from sklearn.feature_extraction.text import CountVectorizer
cv = CountVectorizer()
text_counts= cv.fit_transform(dt["text"])
pd.DataFrame(text_counts.toarray(), columns = cv.get_feature_names())

Unnamed: 0,and,dogs,hate,hobby,is,knitting,love,our,passion,we
0,0,1,0,0,0,0,1,0,0,1
1,1,1,1,0,0,1,0,0,0,1
2,1,0,0,1,1,1,0,1,1,0


In [47]:
from sklearn.feature_extraction.text import TfidfVectorizer
cv = TfidfVectorizer()
text_counts= cv.fit_transform(dt["text"])
pd.DataFrame(text_counts.toarray(), columns = cv.get_feature_names())

Unnamed: 0,and,dogs,hate,hobby,is,knitting,love,our,passion,we
0,0.0,0.517856,0.0,0.0,0.0,0.0,0.680919,0.0,0.0,0.517856
1,0.417796,0.417796,0.549351,0.0,0.0,0.417796,0.0,0.0,0.0,0.417796
2,0.334907,0.0,0.0,0.440362,0.440362,0.334907,0.0,0.440362,0.440362,0.0
