In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


# PREPROCESSING DATA
1. Loading Data
2. Cleaning Data
3. Case Folding
4. Tokenization
5. Filtering / Stopword Removal
6. Steaming Data

## 1. Loading Data


In [6]:
import pandas as pd
import re
import string
import nltk


In [13]:
data = pd.read_csv('DATASET_TUTOR_YOUTUBE.csv', sep=';')
data.head()


Unnamed: 0,created_at,id_str,full_text,quote_count,reply_count,retweet_count,favorite_count,lang,user_id_str,conversation_id_str,username,tweet_url
0,Sun Oct 08 23:45:58 +0000 2023,1711166073662525852,Haaii aku jual netflix 1 bulan sebulan 20k aja...,0,0,0,0,in,1412947412260319234,1711166073662525852,mixcrowave,https://twitter.com/mixcrowave/status/17111660...
1,Sun Oct 08 19:26:17 +0000 2023,1711100720181862547,Terlepas apakah yang dilakukan Netflix dengan ...,0,0,0,0,in,44882453,1711100720181862547,ddhymaz,https://twitter.com/ddhymaz/status/17111007201...
2,Sun Oct 08 18:24:09 +0000 2023,1711085086664171913,"Watched Ice Cold documentary on Netflix, the m...",0,0,0,2,en,140255354,1711085086664171913,ShireenAmir,https://twitter.com/ShireenAmir/status/1711085...
3,Sun Oct 08 20:19:51 +0000 2023,1711114200817275321,Ice Cold on Netflix is so good,0,0,0,0,en,3070574938,1711114200817275321,jennyglozer,https://twitter.com/jennyglozer/status/1711114...
4,Sun Oct 08 23:32:25 +0000 2023,1711162664553173260,https://t.co/dBHtffcYLr People who have alread...,0,0,0,0,en,1082040168,1711162664553173260,moi_nisa,https://twitter.com/moi_nisa/status/1711162664...


In [14]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6386 entries, 0 to 6385
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   created_at           6386 non-null   object
 1   id_str               6386 non-null   object
 2   full_text            6386 non-null   object
 3   quote_count          6386 non-null   object
 4   reply_count          6386 non-null   object
 5   retweet_count        6386 non-null   object
 6   favorite_count       6386 non-null   object
 7   lang                 6386 non-null   object
 8   user_id_str          6386 non-null   object
 9   conversation_id_str  6386 non-null   object
 10  username             6386 non-null   object
 11  tweet_url            6386 non-null   object
dtypes: object(12)
memory usage: 598.8+ KB


In [15]:
data.drop_duplicates('full_text', keep='first', inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 5964 entries, 0 to 6385
Data columns (total 12 columns):
 #   Column               Non-Null Count  Dtype 
---  ------               --------------  ----- 
 0   created_at           5964 non-null   object
 1   id_str               5964 non-null   object
 2   full_text            5964 non-null   object
 3   quote_count          5964 non-null   object
 4   reply_count          5964 non-null   object
 5   retweet_count        5964 non-null   object
 6   favorite_count       5964 non-null   object
 7   lang                 5964 non-null   object
 8   user_id_str          5964 non-null   object
 9   conversation_id_str  5964 non-null   object
 10  username             5964 non-null   object
 11  tweet_url            5964 non-null   object
dtypes: object(12)
memory usage: 605.7+ KB


In [16]:
df = pd.DataFrame(data['full_text'].iloc[0:150])
df.head()

Unnamed: 0,full_text
0,Haaii aku jual netflix 1 bulan sebulan 20k aja...
1,Terlepas apakah yang dilakukan Netflix dengan ...
2,"Watched Ice Cold documentary on Netflix, the m..."
3,Ice Cold on Netflix is so good
4,https://t.co/dBHtffcYLr People who have alread...


## 2. Cleaning Data
- Cleaning URL
- Cleaning HTML Text
- Cleaning Emojies
- Cleaning Number
- Cleaning Symbols

In [17]:
def remove_URL(tweet):
  url = re.compile(r'https?://S+|www\.\S+')
  return url.sub(r'', tweet)

def remove_html(tweet):
  html = re.compile(r'<.*?>')
  return html.sub(r'', tweet)

def remove_emoji(tweet):
  emoji_pattern = re.compile("[]"
    u"\U0001F600-\U0001F64F"
    u"\U0001F300-\U0001F5FF"
    u"\U0001F680-\U0001F6FF"
    u"\U0001F1E0-\U0001F1FF"
                          "]+", flags=re.UNICODE)
  return emoji_pattern.sub(r'', tweet)

def remove_numbers(tweet):
  tweet = re.sub(r'\d+', '', tweet)
  return tweet

def remove_symbols(tweet):
  tweet = re.sub(r'[^a-zA-Z0-9\s]', '', tweet)
  return tweet

df['cleaning'] = df['full_text'].apply(lambda x: remove_URL(x))
df['cleaning'] = df['cleaning'].apply(lambda x: remove_html(x))
df['cleaning'] = df['cleaning'].apply(lambda x: remove_emoji(x))
df['cleaning'] = df['cleaning'].apply(lambda x: remove_numbers(x))
df['cleaning'] = df['cleaning'].apply(lambda x: remove_symbols(x))


df.head()

Unnamed: 0,full_text,cleaning
0,Haaii aku jual netflix 1 bulan sebulan 20k aja...,Haaii aku jual netflix bulan sebulan k aja la...
1,Terlepas apakah yang dilakukan Netflix dengan ...,Terlepas apakah yang dilakukan Netflix dengan ...
2,"Watched Ice Cold documentary on Netflix, the m...",Watched Ice Cold documentary on Netflix the mo...
3,Ice Cold on Netflix is so good,Ice Cold on Netflix is so good
4,https://t.co/dBHtffcYLr People who have alread...,httpstcodBHtffcYLr People who have already wat...


## 3. Case Folding
- lowercase()

In [18]:
def case_folding(text):
  if isinstance(text, str):
    lowercase_text = text.lower()
    return lowercase_text
  else:
    return text
df['case_folding'] = df['cleaning'].apply(case_folding)


df.head(30)

Unnamed: 0,full_text,cleaning,case_folding
0,Haaii aku jual netflix 1 bulan sebulan 20k aja...,Haaii aku jual netflix bulan sebulan k aja la...,haaii aku jual netflix bulan sebulan k aja la...
1,Terlepas apakah yang dilakukan Netflix dengan ...,Terlepas apakah yang dilakukan Netflix dengan ...,terlepas apakah yang dilakukan netflix dengan ...
2,"Watched Ice Cold documentary on Netflix, the m...",Watched Ice Cold documentary on Netflix the mo...,watched ice cold documentary on netflix the mo...
3,Ice Cold on Netflix is so good,Ice Cold on Netflix is so good,ice cold on netflix is so good
4,https://t.co/dBHtffcYLr People who have alread...,httpstcodBHtffcYLr People who have already wat...,httpstcodbhtffcylr people who have already wat...
5,lebih detailnya lagi setelah netflix #Icecoldn...,lebih detailnya lagi setelah netflix Icecoldne...,lebih detailnya lagi setelah netflix icecoldne...
6,Penjelasan Dokter Ahli Sianida yang tampil dal...,Penjelasan Dokter Ahli Sianida yang tampil dal...,penjelasan dokter ahli sianida yang tampil dal...
7,"And that's kid, is how your mother procrastina...",And thats kid is how your mother procrastinate...,and thats kid is how your mother procrastinate...
8,woi beli netflix dmn yg murce bokap gua mau no...,woi beli netflix dmn yg murce bokap gua mau no...,woi beli netflix dmn yg murce bokap gua mau no...
9,"pgn beli Netflix lg buat nntn ice cold, soalny...",pgn beli Netflix lg buat nntn ice cold soalnya...,pgn beli netflix lg buat nntn ice cold soalnya...


## 4. Tokenization

In [20]:
def tokenize(text):
  tokens = text.split()
  return tokens

df['tokenize'] = df['case_folding'].apply(tokenize)

df.head(30)

Unnamed: 0,full_text,cleaning,case_folding,tokenize
0,Haaii aku jual netflix 1 bulan sebulan 20k aja...,Haaii aku jual netflix bulan sebulan k aja la...,haaii aku jual netflix bulan sebulan k aja la...,"[haaii, aku, jual, netflix, bulan, sebulan, k,..."
1,Terlepas apakah yang dilakukan Netflix dengan ...,Terlepas apakah yang dilakukan Netflix dengan ...,terlepas apakah yang dilakukan netflix dengan ...,"[terlepas, apakah, yang, dilakukan, netflix, d..."
2,"Watched Ice Cold documentary on Netflix, the m...",Watched Ice Cold documentary on Netflix the mo...,watched ice cold documentary on netflix the mo...,"[watched, ice, cold, documentary, on, netflix,..."
3,Ice Cold on Netflix is so good,Ice Cold on Netflix is so good,ice cold on netflix is so good,"[ice, cold, on, netflix, is, so, good]"
4,https://t.co/dBHtffcYLr People who have alread...,httpstcodBHtffcYLr People who have already wat...,httpstcodbhtffcylr people who have already wat...,"[httpstcodbhtffcylr, people, who, have, alread..."
5,lebih detailnya lagi setelah netflix #Icecoldn...,lebih detailnya lagi setelah netflix Icecoldne...,lebih detailnya lagi setelah netflix icecoldne...,"[lebih, detailnya, lagi, setelah, netflix, ice..."
6,Penjelasan Dokter Ahli Sianida yang tampil dal...,Penjelasan Dokter Ahli Sianida yang tampil dal...,penjelasan dokter ahli sianida yang tampil dal...,"[penjelasan, dokter, ahli, sianida, yang, tamp..."
7,"And that's kid, is how your mother procrastina...",And thats kid is how your mother procrastinate...,and thats kid is how your mother procrastinate...,"[and, thats, kid, is, how, your, mother, procr..."
8,woi beli netflix dmn yg murce bokap gua mau no...,woi beli netflix dmn yg murce bokap gua mau no...,woi beli netflix dmn yg murce bokap gua mau no...,"[woi, beli, netflix, dmn, yg, murce, bokap, gu..."
9,"pgn beli Netflix lg buat nntn ice cold, soalny...",pgn beli Netflix lg buat nntn ice cold soalnya...,pgn beli netflix lg buat nntn ice cold soalnya...,"[pgn, beli, netflix, lg, buat, nntn, ice, cold..."


## 5. Filtering / Stopword Removal

In [21]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = stopwords.words('indonesian')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


In [22]:
def remove_stopwords(text):
  return [word for word in text if word not in stop_words]

df['filtering/stopword removal'] = df['tokenize'].apply(lambda x: remove_stopwords(x))

df.head(30)

Unnamed: 0,full_text,cleaning,case_folding,tokenize,filtering/stopword removal
0,Haaii aku jual netflix 1 bulan sebulan 20k aja...,Haaii aku jual netflix bulan sebulan k aja la...,haaii aku jual netflix bulan sebulan k aja la...,"[haaii, aku, jual, netflix, bulan, sebulan, k,...","[haaii, jual, netflix, sebulan, k, aja, promo,..."
1,Terlepas apakah yang dilakukan Netflix dengan ...,Terlepas apakah yang dilakukan Netflix dengan ...,terlepas apakah yang dilakukan netflix dengan ...,"[terlepas, apakah, yang, dilakukan, netflix, d...","[terlepas, netflix, menayangkan, film, dokumen..."
2,"Watched Ice Cold documentary on Netflix, the m...",Watched Ice Cold documentary on Netflix the mo...,watched ice cold documentary on netflix the mo...,"[watched, ice, cold, documentary, on, netflix,...","[watched, ice, cold, documentary, on, netflix,..."
3,Ice Cold on Netflix is so good,Ice Cold on Netflix is so good,ice cold on netflix is so good,"[ice, cold, on, netflix, is, so, good]","[ice, cold, on, netflix, is, so, good]"
4,https://t.co/dBHtffcYLr People who have alread...,httpstcodBHtffcYLr People who have already wat...,httpstcodbhtffcylr people who have already wat...,"[httpstcodbhtffcylr, people, who, have, alread...","[httpstcodbhtffcylr, people, who, have, alread..."
5,lebih detailnya lagi setelah netflix #Icecoldn...,lebih detailnya lagi setelah netflix Icecoldne...,lebih detailnya lagi setelah netflix icecoldne...,"[lebih, detailnya, lagi, setelah, netflix, ice...","[detailnya, netflix, icecoldnetflix, httpstcos..."
6,Penjelasan Dokter Ahli Sianida yang tampil dal...,Penjelasan Dokter Ahli Sianida yang tampil dal...,penjelasan dokter ahli sianida yang tampil dal...,"[penjelasan, dokter, ahli, sianida, yang, tamp...","[penjelasan, dokter, ahli, sianida, tampil, do..."
7,"And that's kid, is how your mother procrastina...",And thats kid is how your mother procrastinate...,and thats kid is how your mother procrastinate...,"[and, thats, kid, is, how, your, mother, procr...","[and, thats, kid, is, how, your, mother, procr..."
8,woi beli netflix dmn yg murce bokap gua mau no...,woi beli netflix dmn yg murce bokap gua mau no...,woi beli netflix dmn yg murce bokap gua mau no...,"[woi, beli, netflix, dmn, yg, murce, bokap, gu...","[woi, beli, netflix, dmn, yg, murce, bokap, gu..."
9,"pgn beli Netflix lg buat nntn ice cold, soalny...",pgn beli Netflix lg buat nntn ice cold soalnya...,pgn beli netflix lg buat nntn ice cold soalnya...,"[pgn, beli, netflix, lg, buat, nntn, ice, cold...","[pgn, beli, netflix, lg, nntn, ice, cold, fyp,..."


## 6. Steaming Data

In [23]:
!pip install Sastrawi

from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from nltk.stem import PorterStemmer
from nltk.stem.snowball import SnowballStemmer

Collecting Sastrawi
  Downloading Sastrawi-1.0.1-py2.py3-none-any.whl (209 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/209.7 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/209.7 kB[0m [31m2.6 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.7/209.7 kB[0m [31m3.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: Sastrawi
Successfully installed Sastrawi-1.0.1


In [24]:
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stem_text(text):
  return [stemmer.stem(word) for word in text]

df['stemming_data'] = df['filtering/stopword removal'].apply(lambda x: ' '.join(stem_text(x)))

df.head()

Unnamed: 0,full_text,cleaning,case_folding,tokenize,filtering/stopword removal,stemming_data
0,Haaii aku jual netflix 1 bulan sebulan 20k aja...,Haaii aku jual netflix bulan sebulan k aja la...,haaii aku jual netflix bulan sebulan k aja la...,"[haaii, aku, jual, netflix, bulan, sebulan, k,...","[haaii, jual, netflix, sebulan, k, aja, promo,...",haaii jual netflix bulan k aja promo yuukk pu ...
1,Terlepas apakah yang dilakukan Netflix dengan ...,Terlepas apakah yang dilakukan Netflix dengan ...,terlepas apakah yang dilakukan netflix dengan ...,"[terlepas, apakah, yang, dilakukan, netflix, d...","[terlepas, netflix, menayangkan, film, dokumen...",lepas netflix tayang film dokumenter ice cold ...
2,"Watched Ice Cold documentary on Netflix, the m...",Watched Ice Cold documentary on Netflix the mo...,watched ice cold documentary on netflix the mo...,"[watched, ice, cold, documentary, on, netflix,...","[watched, ice, cold, documentary, on, netflix,...",watched ice cold documentary on netflix the mo...
3,Ice Cold on Netflix is so good,Ice Cold on Netflix is so good,ice cold on netflix is so good,"[ice, cold, on, netflix, is, so, good]","[ice, cold, on, netflix, is, so, good]",ice cold on netflix is so good
4,https://t.co/dBHtffcYLr People who have alread...,httpstcodBHtffcYLr People who have already wat...,httpstcodbhtffcylr people who have already wat...,"[httpstcodbhtffcylr, people, who, have, alread...","[httpstcodbhtffcylr, people, who, have, alread...",httpstcodbhtffcylr people who have already wat...


In [25]:
df.to_csv('Hasil_Preprocessing_Data.csv', encoding='utf8', index=False)
