In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')

In [2]:
from nltk.tokenize import sent_tokenize, word_tokenize # (sent => sentence, word => word)
from nltk.corpus import stopwords
from string import punctuation
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
factory = StemmerFactory()
stemmer = factory.create_stemmer()

## Problems
***In times like today, the development of social media is very rapidly and had an impact on an existing business. When word of mouth is carried out by netizens on a business product, it will have a good impact and a bad impact. If the good impact is received, it will make profits for the business company, but if the bad impact is carried out by certain individuals such as for example hate speech on the product, badmouthing the product, insulting the product it will harmful the business company,  Therefore it is necessary to have a machine that can assist in filtering negative sentiments that entering our business products.***

## Goals
***Can provide a solution to the problem above by making a machine that can predict a sentence that has a negative or positive connotation, to prevent negative sentences from entering our product business.***

## Data Source
    Thanks to all the sources who have given permission to use the data by publishing it on the website.
    
    The Source all as follows:
    1. https://repo.telematika.org/project/louisowen6_nlp_bahasa_resources/
    2. https://www.kaggle.com/ilhamfp31/indonesian-abusive-and-hate-speech-twitter-text?select=data.csv
    3. https://rizalespe.github.io/Dataset-Sentimen-Analisis-Bahasa-Indonesia/
 

## Data Information
1. The datasets are combined all from the above source, and it is an Indonesian Text.
2. They are labeled into 1 (Positive Sentiment) and -1 (Negative Sentiment).
3. Assisted with another source such as StopWords Indonesian from nltk, Emoticon and Slang Words from repo.telematika.org.
4. This dataset including from Tweets, Instagram Comment, Single Word of Positive and Negative Sentiment Dictionary from repo.telematika.org.
5. Using Sastrawi Package for stemming.

# Data Cleaning & Data Preparation

In [3]:
# read file from txt type to csv type
read_file1 = pd.read_csv('negatif_ta.txt')
read_file1.to_csv('negatif_ta.csv', index=None)

read_file2 = pd.read_csv('negatif_ta2.txt', delimiter='/')
read_file2.to_csv('negatif_ta2.csv', index=None)

read_file3 = pd.read_csv('negative_add.txt')
read_file3.to_csv('negative_add.csv', index=None)

read_file4 = pd.read_csv('negative_keyword.txt', delimiter='/')
read_file4.to_csv('negative_keyword.csv', index=None)

read_file5 = pd.read_csv('positif_ta.txt')
read_file5.to_csv('positif_ta.csv', index=None)

read_file6 = pd.read_csv('positif_ta2.txt')
read_file6.to_csv('positif_ta2.csv', index=None)

read_file7 = pd.read_csv('positive_add.txt')
read_file7.to_csv('positive_add.csv', index=None)

read_file8 = pd.read_csv('positive_keyword.txt')
read_file8.to_csv('positive_keyword.csv', index=None)

In [4]:
# move csv type into a dataframe
df_1 = pd.read_csv('negatif_ta.csv')
df_2 = pd.read_csv('negatif_ta2.csv')
df_3 = pd.read_csv('negative_add.csv')
df_4 = pd.read_csv('negative_keyword.csv')

df_5 = pd.read_csv('positif_ta.csv')
df_6 = pd.read_csv('positif_ta2.csv')
df_7 = pd.read_csv('positive_add.csv')
df_8 = pd.read_csv('positive_keyword.csv')

df_9 = pd.read_csv('abusive.csv')
df_10 = pd.read_csv('data.csv', encoding='latin-1')
df_11 = pd.read_csv('dataset_tweet_sentiment_cellular_service_provider.csv', encoding='utf-8')
df_12 = pd.read_csv('master_emoji.csv', encoding='utf-8')
df_13 = pd.read_csv('dataset_komentar_instagram_cyberbullying.csv', encoding='utf-8')
df_14 = pd.read_csv('dataset_tweet_sentimen_tayangan_tv.csv', encoding='utf-8')
df_15 = pd.read_csv('dataset_tweet_sentiment_opini_film.csv', encoding='utf-8')
df_16 = pd.read_csv('dataset_tweet_sentiment_pilkada_DKI_2017.csv', encoding='utf-8')

In [5]:
# re-indexing dataframe
df_1 = df_1.T.reset_index().T.reset_index().drop(columns='index')
df_2 = df_2.T.reset_index().T.reset_index().drop(columns='index')
df_3 = df_3.T.reset_index().T.reset_index().drop(columns='index')
df_4 = df_4.T.reset_index().T.reset_index().drop(columns='index')

df_5 = df_5.T.reset_index().T.reset_index().drop(columns='index')
df_6 = df_6.T.reset_index().T.reset_index().drop(columns='index')
df_7 = df_7.T.reset_index().T.reset_index().drop(columns='index')
df_8 = df_8.T.reset_index().T.reset_index().drop(columns='index')

In [6]:
# insert the sentiment negative as -1 and positive as 1 into dataframe
df_1['sentiment'] = -1
df_2['sentiment'] = -1
df_3['sentiment'] = -1
df_4['sentiment'] = -1
df_9['sentiment'] = -1

df_5['sentiment'] = 1
df_6['sentiment'] = 1
df_7['sentiment'] = 1
df_8['sentiment'] = 1

In [7]:
# insert the sentiment if there are have 1 hate speech or abusive then it concluded to negative (-1)
# but if there are no hate speech or abusive then it concluded to positive (1)
df_10['sentiment'] = ""

In [8]:
for i in range(df_10.shape[0]):
    if df_10.loc[i]['HS'] == 0 and df_10.loc[i]['Abusive'] == 0 and df_10.loc[i]['HS_Individual'] == 0 and df_10.loc[i]['HS_Group'] == 0 and df_10.loc[i]['HS_Religion'] == 0 and df_10.loc[i]['HS_Race'] == 0 and df_10.loc[i]['HS_Physical'] == 0 and df_10.loc[i]['HS_Gender'] == 0 and df_10.loc[i]['HS_Other'] == 0 and df_10.loc[i]['HS_Weak'] == 0 and df_10.loc[i]['HS_Moderate'] == 0 and df_10.loc[i]['HS_Strong'] == 0:
        df_10['sentiment'][i] += "1"
    else:
        df_10['sentiment'][i] += "-1"

In [9]:
def makna_emoji(text):
    text_split = text.split(' ')
    text_join = '_'.join(text_split)
    return text_join 

df_12['Makna Emoji'] = df_12['Makna Emoji'].apply(lambda i: makna_emoji(i))

In [10]:
df_10_new = df_10[['Tweet','sentiment']]
df_11_new = df_11[['Text Tweet','Sentiment']]
df_12_new = df_12[['Makna Emoji','Sentiment']]
df_13_new = df_13[['Instagram Comment Text','Sentiment']]
df_14_new = df_14[['Text Tweet','Sentiment']]
df_15_new = df_15[['Text Tweet','Sentiment']]
df_16_new = df_16[['Text Tweet','Sentiment']]

In [11]:
# displaying dataframe contains negative word
print("Dataframe 1 (Negative Words)", df_1.shape)
display(df_1.sample())
print(" ")
print("Dataframe 2 (Negative Words)", df_2.shape)
display(df_2.sample())
print(" ")
print("Dataframe 3 (Negative Words)", df_3.shape)
display(df_3.sample())
print(" ")
print("Dataframe 4 (Negative Words)", df_4.shape)
display(df_4.sample())
print(" ")
print("Dataframe 9 (Negative Words)", df_9.shape)
display(df_9.sample())

Dataframe 1 (Negative Words) (306, 2)


Unnamed: 0,0,sentiment
137,basi,-1


 
Dataframe 2 (Negative Words) (3829, 2)


Unnamed: 0,0,sentiment
857,dikasari,-1


 
Dataframe 3 (Negative Words) (154, 2)


Unnamed: 0,0,sentiment
27,bubarkan,-1


 
Dataframe 4 (Negative Words) (3523, 2)


Unnamed: 0,0,sentiment
2061,mengoyak-oyak,-1


 
Dataframe 9 (Negative Words) (125, 2)


Unnamed: 0,ABUSIVE,sentiment
61,ngentot,-1


In [12]:
# displaying dataframe contains positive word
print("Dataframe 5 (Positive Words)", df_5.shape)
display(df_5.sample())
print(" ")
print("Dataframe 6 (Positive Words)", df_6.shape)
display(df_6.sample())
print(" ")
print("Dataframe 7 (Positive Words)", df_7.shape)
display(df_7.sample())
print(" ")
print("Dataframe 8 (Positive Words)", df_8.shape)
display(df_8.sample())

Dataframe 5 (Positive Words) (385, 2)


Unnamed: 0,0,sentiment
302,ridha,1


 
Dataframe 6 (Positive Words) (1678, 2)


Unnamed: 0,0,sentiment
1307,nyaman,1


 
Dataframe 7 (Positive Words) (40, 2)


Unnamed: 0,0,sentiment
4,bimbingan,1


 
Dataframe 8 (Positive Words) (1293, 2)


Unnamed: 0,0,sentiment
1079,secepatnya,1


In [13]:
# displaying dataframe from tweet and emoji
print("Dataframe 10 (Tweet from User)", df_10_new.shape)
display(df_10_new.sample())
print(" ")
print("Dataframe 11 (Tweet from User)", df_11_new.shape)
display(df_11_new.sample())
print(" ")
print("Dataframe 12 (Emoji)", df_12_new.shape)
display(df_12_new.sample())

Dataframe 10 (Tweet from User) (13169, 2)


Unnamed: 0,Tweet,sentiment
11821,RT USER USER USER USER Bener kamu ngga ada apa...,-1


 
Dataframe 11 (Tweet from User) (300, 2)


Unnamed: 0,Text Tweet,Sentiment
90,Sinyal kami paling kuat KATANYA. internetku di...,negative


 
Dataframe 12 (Emoji) (165, 2)


Unnamed: 0,Makna Emoji,Sentiment
26,Speak-No-Evil_Monkey,negative


In [14]:
# rename column 0 into Text
df_1 = df_1.rename(columns={0: 'Text'})
df_2 = df_2.rename(columns={0: 'Text'})
df_3 = df_3.rename(columns={0: 'Text'})
df_4 = df_4.rename(columns={0: 'Text'})
df_9 = df_9.rename(columns={'ABUSIVE': 'Text'})

df_5 = df_5.rename(columns={0: 'Text'})
df_6 = df_6.rename(columns={0: 'Text'})
df_7 = df_7.rename(columns={0: 'Text'})
df_8 = df_8.rename(columns={0: 'Text'})

df_9 = df_9.rename(columns={'ABUSIVE': 'Text'})
df_10_new = df_10_new.rename(columns={'Tweet': 'Text'})
df_11_new = df_11_new.rename(columns={'Text Tweet': 'Text', 'Sentiment' : 'sentiment'})
df_12_new = df_12_new.rename(columns={'Makna Emoji': 'Text', 'Sentiment' : 'sentiment'})
df_13_new = df_13_new.rename(columns={'Instagram Comment Text': 'Text', 'Sentiment' : 'sentiment'})
df_14_new = df_14_new.rename(columns={'Text Tweet': 'Text', 'Sentiment' : 'sentiment'})
df_15_new = df_15_new.rename(columns={'Text Tweet': 'Text', 'Sentiment' : 'sentiment'})
df_16_new = df_16_new.rename(columns={'Text Tweet': 'Text', 'Sentiment' : 'sentiment'})

df_11_new = df_11_new.replace({'positive': '1', 'negative' : '-1'})
df_12_new = df_12_new.replace({'positive': '1', 'negative' : '-1'})
df_13_new = df_13_new.replace({'positive': '1', 'negative' : '-1'})
df_14_new = df_14_new.replace({'positive': '1', 'negative' : '-1'})
df_15_new = df_15_new.replace({'positive': '1', 'negative' : '-1'})
df_16_new = df_16_new.replace({'positive': '1', 'negative' : '-1'})

In [15]:
# Now the dataframes is ready to concat
df = pd.concat([df_1, df_2, df_3, df_4, df_5, df_6, df_7, df_8, df_9, df_10_new, df_11_new, df_12_new, df_13_new, df_14_new, df_15_new, df_16_new], ignore_index=True)
df

Unnamed: 0,Text,sentiment
0,inkonsisten,-1
1,porno,-1
2,teroris,-1
3,sesat,-1
4,tuntut,-1
...,...,...
26862,"Kali saja bpk @aniesbaswedan @sandiuno lihat, ...",1
26863,Kita harus dapat merangkul semua orang tanpa b...,1
26864,Ini jagoanku dibidang digital <Smiling Face Wi...,1
26865,#PesanBijak #OkeOce #GubernurGu3 ...,1


In [16]:
df['sentiment'] = df['sentiment'].astype(int)

In [17]:
pd.crosstab(index=df['sentiment'], columns='count', normalize=True)

col_0,count
sentiment,Unnamed: 1_level_1
-1,0.612052
1,0.387948


In [18]:
#df.to_csv('Sentiment_Analysis.csv')

# Handling Data

In [19]:
# handling missing value, there is no missing value on this dataset
df.isna().sum()

Text         0
sentiment    0
dtype: int64

In [20]:
# handling duplicates data
print('before drop duplicates data :', len(df))
print('unique data :', df['Text'].nunique())

before drop duplicates data : 26867
unique data : 20298


In [21]:
df.drop_duplicates(['Text'], inplace=True)
print('after drop duplicates data :', len(df))
print('unique data :', df['Text'].nunique())

after drop duplicates data : 20298
unique data : 20298


In [22]:
df = df.reset_index().drop(columns='index')

In [23]:
# the dataframe is ready to use
df

Unnamed: 0,Text,sentiment
0,inkonsisten,-1
1,porno,-1
2,teroris,-1
3,sesat,-1
4,tuntut,-1
...,...,...
20293,"Kali saja bpk @aniesbaswedan @sandiuno lihat, ...",1
20294,Kita harus dapat merangkul semua orang tanpa b...,1
20295,Ini jagoanku dibidang digital <Smiling Face Wi...,1
20296,#PesanBijak #OkeOce #GubernurGu3 ...,1


In [24]:
# changing emoji dataframe to dictionary
df_12_emoji = df_12[['Emoji', 'Makna Emoji']]
df_12_emoji = df_12_emoji.set_index('Emoji')
df_12_emoji_new = df_12_emoji.to_dict()
df_12_emoji_new = df_12_emoji_new['Makna Emoji']

In [25]:
# changing slang words dataframe to dictionary
df_13 = pd.read_csv('new_kamusalay.csv', encoding='latin-1')
df_13 = df_13.T.reset_index().T.reset_index().drop(columns='index')
df_13 = df_13.set_index(0)
df_13_new = df_13.to_dict()
df_13_new = df_13_new[1]

In [26]:
df['Text'][12001]

"RT USER: Yang beginian Kecoak dan Belatung pasti gak paham. Yang mereka tahu hanya bagaimana mengambil kebijakan seenak jidatnya sendiri\\xe2\\x80\\xa6'"

In [27]:
df['Text'][13067]

'Dongkrak bonus dr dealer ga enak amat ya kek ngerakit"...\\n.\\n.\\n.\\n.\\nPengen beli dongkrak (buaya)\''

In [28]:
df['Text'][13197]

'Ratusan Massa Ormas Islam Geruduk LBH Jakarta: Ganyang PKI! URL URL'

since I found text like this above, which contains the text 'USER', 'RT or retweet', 'http', 'url', '\n', and unicode UTF-8 character in literal and also I can't convert this utf-8 character emoji, therefore I handle it and decide to drop the words containing like xa, xf, x, etc without numbers and '\' refer to source https://www.utf8-chartable.de/unicode-utf8-table.pl?start=128512&number=1024&names=2&utf8=string-literal

In [29]:
exception = ['user', 'rt', 'http', 'url', 'n', 'x', 'xa', 'xb', 'xc', 'xd', 'xe', 'xf', 'xaa', 'xab', 'xac', 'xad', 'xae', 'xaf', 'xba', 'xbb', 'xbc', 'xbd', 'xbe', 'xbf']

# Text Processing

There are a few steps for text processing :
1. Lowering all words from Text
2. Tokenize the sentence from Text
3. Removing all numbers from Text
4. Removing punctuation from Text
5. Removing like user, retweet, http, etc from Text
6. Removing Indonesian stopwords from Text
7. Converting Indonesian slang words from Text
8. Converting emoji into Meaning of the Emoji
9. Using Sastrawi for stemming process
10. Tokenize the stemmed text
11. Removing Indonesian stopwords  after stemmed

## ==================================================

### 1. Lowering all words from Text

In [30]:
# lowering all words from text
df['text_lower'] = [i.lower() for i in df['Text']] 

In [31]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower
15149,USER USER USER Admin USER ini emang kebangetan...,-1,user user user admin user ini emang kebangetan...


### 2. Tokenize the sentence from Text

In [32]:
# tokenize the sentence
df['text_token'] = [word_tokenize(i) for i in df['text_lower']] 

In [33]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token
19698,Bukan kah #AHY sdh menunjukkan Jiwa besar dala...,1,bukan kah #ahy sdh menunjukkan jiwa besar dala...,"[bukan, kah, #, ahy, sdh, menunjukkan, jiwa, b..."


### 3. Removing all numbers from Text

In [34]:
# removing numeric
def removedigit(text):
    nondigit = ' '.join(i for i in text if not i.isdigit())
    return word_tokenize(nondigit)

In [35]:
df['text_without_number'] = df['text_token'].apply(lambda i: removedigit(i))

In [36]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number
18813,Terima kasih pak.... Sudah mau membantu kami u...,1,terima kasih pak.... sudah mau membantu kami u...,"[terima, kasih, pak, ...., sudah, mau, membant...","[terima, kasih, pak, ...., sudah, mau, membant..."


### 4. Removing punctuation from Text

In [37]:
# removing punctuation
def remove_punctuation(text):
    return word_tokenize(' '.join(i for i in text if i not in punctuation))

In [38]:
df['text_no_punctuation'] = df['text_without_number'].apply(lambda i: remove_punctuation(i))

In [39]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation
19331,"Papa mama aku juga ikut nonton , mereka termas...",1,"papa mama aku juga ikut nonton , mereka termas...","[papa, mama, aku, juga, ikut, nonton, ,, merek...","[papa, mama, aku, juga, ikut, nonton, ,, merek...","[papa, mama, aku, juga, ikut, nonton, mereka, ..."


### 5. Removing like user, retweet, http, etc from Text

In [40]:
# removing like user, retweet, http, url, etc...
def remove_exception(text):
    return word_tokenize(' '.join(i for i in text if (i not in exception) and (i.isalnum())))

In [41]:
df['text_exception'] = df['text_no_punctuation'].apply(lambda i: remove_exception(i))

In [42]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception
250,pupus,-1,pupus,[pupus],[pupus],[pupus],[pupus]


### 6. Removing Indonesian stopwords from Text

In [43]:
# removing stopwords
def remove_stopwords(text):
    return word_tokenize(' '.join(i for i in text if i not in stopwords.words('indonesian')))

In [44]:
df['text_no_stopwords'] = df['text_exception'].apply(lambda i: remove_stopwords(i))

In [45]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords
6635,"Pada momen kegiatan budaya ini, telah direncan...",1,"pada momen kegiatan budaya ini, telah direncan...","[pada, momen, kegiatan, budaya, ini, ,, telah,...","[pada, momen, kegiatan, budaya, ini, ,, telah,...","[pada, momen, kegiatan, budaya, ini, telah, di...","[pada, momen, kegiatan, budaya, ini, telah, di...","[momen, kegiatan, budaya, direncanakan, ribuan..."


### 7. Converting Indonesian slang words from Text

In [46]:
# converting Indonesian slang words
def converter_slangwords(text):
    for i in text:
        if i in list(df_13_new.keys()):
            text[text.index(i)] = df_13_new[i]
        else:
            pass
    text_converted = word_tokenize(" ".join(text))
    return text_converted

In [47]:
df['convert_slangwords'] = df['text_no_stopwords'].apply(lambda i: converter_slangwords(i))

In [48]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords
7001,USER Jan kenceng2 ketawanya kaya kunti aja',1,user jan kenceng2 ketawanya kaya kunti aja',"[user, jan, kenceng2, ketawanya, kaya, kunti, ...","[user, jan, kenceng2, ketawanya, kaya, kunti, ...","[user, jan, kenceng2, ketawanya, kaya, kunti, ...","[jan, kenceng2, ketawanya, kaya, kunti, aja]","[jangan, cepat cepat, ketawanya, kayak, kunti,...","[jangan, cepat, cepat, ketawanya, kayak, kunti..."


### 8. Converting emoji into Meaning of the Emoji

In [49]:
# converting Emoji into Meaning of the Emoji
def converter_emojimeaning(text):
    for i in text:
        if i in list(df_12_emoji_new.keys()):
            text[text.index(i)] = df_12_emoji_new[i]
        else:
            pass
    text_converted_emoji = word_tokenize(" ".join(text))
    return text_converted_emoji

In [50]:
df['convert_emojimeaning'] = df['convert_slangwords'].apply(lambda i: converter_emojimeaning(i))

In [51]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords,convert_emojimeaning
13060,USER Telusuri cepat pertemuan Hatta Ali dengan...,-1,user telusuri cepat pertemuan hatta ali dengan...,"[user, telusuri, cepat, pertemuan, hatta, ali,...","[user, telusuri, cepat, pertemuan, hatta, ali,...","[user, telusuri, cepat, pertemuan, hatta, ali,...","[telusuri, cepat, pertemuan, hatta, ali, denga...","[telusuri, cepat, pertemuan, hatta, ali, setya...","[telusuri, cepat, pertemuan, hatta, ali, setya...","[telusuri, cepat, pertemuan, hatta, ali, setya..."


### Unpacking from word tokenize into a sentence

In [52]:
def unpacking(text):
    return (' '.join(i for i in text))

In [53]:
df['unpacking_sentence'] = df['convert_emojimeaning'].apply(lambda i: unpacking(i))

In [54]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords,convert_emojimeaning,unpacking_sentence
6044,Aku heran deh kenapa yaa kamu gak capek lari-l...,1,aku heran deh kenapa yaa kamu gak capek lari-l...,"[aku, heran, deh, kenapa, yaa, kamu, gak, cape...","[aku, heran, deh, kenapa, yaa, kamu, gak, cape...","[aku, heran, deh, kenapa, yaa, kamu, gak, cape...","[aku, heran, deh, kenapa, yaa, kamu, gak, cape...","[heran, deh, ya, tidak, cape, pikiranku]","[heran, deh, ya, tidak, cape, pikiranku]","[heran, deh, ya, tidak, cape, pikiranku]",heran deh ya tidak cape pikiranku


In [55]:
df.iloc[10195]['unpacking_sentence']

'begitu ajakan belum diam ya divisi kerja bikin set'

### 9. Using Sastrawi for stemming process

In [56]:
def stemming(text):
    stem_text = stemmer.stem(text)
    return stem_text

In [57]:
df['stemmed'] = df['unpacking_sentence'].apply(lambda i: stemming(i))

In [58]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords,convert_emojimeaning,unpacking_sentence,stemmed
1193,gelepar,-1,gelepar,[gelepar],[gelepar],[gelepar],[gelepar],[gelepar],[gelepar],[gelepar],gelepar,gelepar


### 10. Tokenize the stemmed text

In [59]:
# tokenize the stemmed text
df['text_token_stemmed'] = [word_tokenize(i) for i in df['stemmed']] 

In [60]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords,convert_emojimeaning,unpacking_sentence,stemmed,text_token_stemmed
11641,USER Padahal shua cuma saudara jauhnya kunti l...,1,user padahal shua cuma saudara jauhnya kunti l...,"[user, padahal, shua, cuma, saudara, jauhnya, ...","[user, padahal, shua, cuma, saudara, jauhnya, ...","[user, padahal, shua, cuma, saudara, jauhnya, ...","[padahal, shua, cuma, saudara, jauhnya, kunti,...","[shua, saudara, jauhnya, kunti, lah, oppa]","[shua, saudara, jauhnya, kunti, lah, oppa]","[shua, saudara, jauhnya, kunti, lah, oppa]",shua saudara jauhnya kunti lah oppa,shua saudara jauh kunti lah oppa,"[shua, saudara, jauh, kunti, lah, oppa]"


### 11. Removing Indonesian stopwords from Text

In [61]:
# removing stopwords
def remove_stopwords(text):
    return word_tokenize(' '.join(i for i in text if i not in stopwords.words('indonesian')))

In [62]:
df['text_no_stopwords_stemmed'] = df['text_token_stemmed'].apply(lambda i: remove_stopwords(i))

In [63]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords,convert_emojimeaning,unpacking_sentence,stemmed,text_token_stemmed,text_no_stopwords_stemmed
18232,Upload file yang besar juga sangat mudah denga...,1,upload file yang besar juga sangat mudah denga...,"[upload, file, yang, besar, juga, sangat, muda...","[upload, file, yang, besar, juga, sangat, muda...","[upload, file, yang, besar, juga, sangat, muda...","[upload, file, yang, besar, juga, sangat, muda...","[unggah, file, mudah, jaringan, internetnya, c...","[unggah, file, mudah, jaringan, internetnya, c...","[unggah, file, mudah, jaringan, internetnya, c...",unggah file mudah jaringan internetnya cepat a...,unggah file mudah jaring internetnya cepat alh...,"[unggah, file, mudah, jaring, internetnya, cep...","[unggah, file, mudah, jaring, internetnya, cep..."


### Unpacking from word tokenize into a sentence

In [65]:
def unpacking(text):
    return (' '.join(i for i in text))

In [66]:
df['final'] = df['text_no_stopwords_stemmed'].apply(lambda i: unpacking(i))

In [69]:
df.sample()

Unnamed: 0,Text,sentiment,text_lower,text_token,text_without_number,text_no_punctuation,text_exception,text_no_stopwords,convert_slangwords,convert_emojimeaning,unpacking_sentence,stemmed,text_token_stemmed,text_no_stopwords_stemmed,final
15784,USER Bajingan. Inter itu tim top dan hebat. La...,-1,user bajingan. inter itu tim top dan hebat. la...,"[user, bajingan, ., inter, itu, tim, top, dan,...","[user, bajingan, ., inter, itu, tim, top, dan,...","[user, bajingan, inter, itu, tim, top, dan, he...","[bajingan, inter, itu, tim, top, dan, hebat, l...","[bajingan, inter, tim, top, hebat, lazio, yang...","[bajingan, inter, tim, top, hebat, lazio, yang...","[bajingan, inter, tim, top, hebat, lazio, yang...",bajingan inter tim top hebat lazio yang sampah,bajing inter tim top hebat lazio yang sampah,"[bajing, inter, tim, top, hebat, lazio, yang, ...","[bajing, inter, tim, top, hebat, lazio, sampah]",bajing inter tim top hebat lazio sampah


## ==================================================

### All steps have been done, we move to next Notebook

In [70]:
df.to_csv('sentiment_analysis_clean.csv')