# Proyek Data Science Lanjut: Sentiment Analysis
- **Nama:** Daffa Suada
- **Email:** suadaadaffa@gmail.com
- **ID Dicoding:** daffa_suada_i9ug

# Import Dataset

In [30]:
import pandas as pd
import numpy as np

In [31]:
# Import Dataset
sentiment_df = pd.read_csv('ulasan_coc_english.csv')

sentiment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 25000 entries, 0 to 24999
Data columns (total 11 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   reviewId              25000 non-null  object 
 1   userName              25000 non-null  object 
 2   userImage             25000 non-null  object 
 3   content               25000 non-null  object 
 4   score                 25000 non-null  int64  
 5   thumbsUpCount         25000 non-null  int64  
 6   reviewCreatedVersion  21478 non-null  object 
 7   at                    25000 non-null  object 
 8   replyContent          0 non-null      float64
 9   repliedAt             0 non-null      float64
 10  appVersion            21478 non-null  object 
dtypes: float64(2), int64(2), object(7)
memory usage: 2.1+ MB


In [32]:
# Hitung duplikat
sentiment_df.duplicated().sum()

0

In [33]:
# Hitung kosong
sentiment_df.isna().sum()

reviewId                    0
userName                    0
userImage                   0
content                     0
score                       0
thumbsUpCount               0
reviewCreatedVersion     3522
at                          0
replyContent            25000
repliedAt               25000
appVersion               3522
dtype: int64

In [34]:
# Handle Missing Value
sentiment_df['reviewCreatedVersion'] = sentiment_df['reviewCreatedVersion'].fillna('0')
sentiment_df['replyContent'] = sentiment_df['replyContent'].fillna('0')
sentiment_df['repliedAt'] = sentiment_df['repliedAt'].fillna('0')
sentiment_df['appVersion'] = sentiment_df['appVersion'].fillna('0')

In [35]:
sentiment_df.isna().sum()

reviewId                0
userName                0
userImage               0
content                 0
score                   0
thumbsUpCount           0
reviewCreatedVersion    0
at                      0
replyContent            0
repliedAt               0
appVersion              0
dtype: int64

In [36]:
drop_col = ['userName', 'userImage', 'reviewCreatedVersion', 'thumbsUpCount', 'reviewCreatedVersion','at', 'replyContent', 'repliedAt', 'appVersion']

sentiment_df.drop(columns=drop_col, inplace=True)

sentiment_df.head(3)

Unnamed: 0,reviewId,content,score
0,59725dd0-533b-47a5-874d-110733215800,This last year has brought some amazing change...,5
1,c6c50aae-4e27-4ce3-a8b6-672f812a359a,I've been playing this game for well over 10 y...,3
2,0f198656-b900-4b4e-a3ec-383eb3abfa9e,For november 25th version. There are several b...,1


# Preprocessing Data

In [37]:
import re
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

In [38]:
# Membersihkan Text

def cleaningText(text):
    text = re.sub(r'@[A-Za-z0-9]+', '', text) # menghapus mention
    text = re.sub(r'#[A-Za-z0-9]+', '', text) # menghapus hashtag
    text = re.sub(r'RT[\s]', '', text) # menghapus RT
    text = re.sub(r"http\S+", '', text) # menghapus link
    text = re.sub(r'[0-9]+', '', text) # menghapus angka
    text = re.sub(r'[^\w\s]', '', text) # menghapus karakter selain huruf dan angka

    text = text.replace('\n', ' ') # mengganti baris baru dengan spasi
    text = text.translate(str.maketrans('', '', string.punctuation)) # menghapus semua tanda baca
    text = text.strip(' ') # menghapus karakter spasi dari kiri dan kanan teks
    return text

def casefoldingText(text): # Mengubah semua karakter dalam teks menjadi huruf kecil
    text = text.lower()
    return text

def tokenizingText(text): # Memecah atau membagi string, teks menjadi daftar token
    text = word_tokenize(text)
    return text

def filteringText(text):
    listStopwords = set(stopwords.words('english'))
    listStopwords.update(["i", "iam","me", "my", "myself", "we", "our", "ours", "ourselves",
    "you", "your", "yours", "yourself", "yourselves", "he", "him",
    "his", "himself", "she", "her", "hers", "herself", "it", "its",
    "itself", "they", "them", "their", "theirs", "themselves", "what",
    "which", "who", "whom", "this", "that", "these", "those", "am",
    "is", "are", "was", "were", "be", "been", "being", "have", "has",
    "had", "having", "do", "does", "did", "doing", "a", "an", "the",
    "and", "but", "if", "or", "because", "as", "until", "while",
    "of", "at", "by", "for", "with", "about", "against", "between"])
    # custom_stopwords = [
    # "game", "play", "playing", "supercell", "clash", "games", "clan", "troops", "attack", "builder", "hall", "town", "level", "base", "th", "years", "long", "time", 'get', 'one', 'new', 'please', 'would', 'also', 'even', 'im', 'still', 'make', 'since', 'thing', 'take', 'way', 'want', 'things', 'times', 'every', 'people', 'give', 'days',
    # 'update', 'updates', 'upgrade', 'upgrading', 'upgrades', 'account', 
    #                 'building', 'buildings', 'village', 'townhall', 'bases', 'walls', 
    #                 'trophies', 'stars', 'star', 'gems', 'gold', 'resources', 'cost', 
    #                 'chat', 'screen', 'system', 'option', 'super', 'level', 'levels', 
    #                 'phone', 'app', 'mobile', 'thing', 'things', 'something', 'everything', 
    #                 'nothing', 'stuff', 'see', 'know', 'think', 'say', 'id', 'us', 'theres', 
    #                 'though', 'doesnt', 'thats', 'used', 'another', 'making', 'different', 
    #                 'sometimes', 'first', 'last', 'higher', 'almost', 'going', 'try', 
    #                 'getting', 'started', 'wait', 'experience', 'gameplay', 'progress', 
    #                 'hard', 'find', 'attacks', 'attacking', 'keep', 'lose', 'win', 'work', 
    #                 'support', 'thank', 'thanks', 'hope', 'free', 'buy', 'money', 'spend', "like", "really", "dont", "much", "back", "many", "lot", "always", "ever", "could", "got", "use", "u", "well", 
    # "nice", "able", "without", "start", "made", "old", "overall", "max", "bit", "come", "hours", "easy", "keeps", 
    # "less", "anything", "guys", "two", "high", "feel", "put", "reason", "however", "waiting", "pretty", "done", 
    # "trying", "needs", "next", "due", "enough", "someone", "months", "request", "especially", "real", "already", 
    # "soon", "using", "far", "wish", "amount", "working", "came", "right", "maybe", "look", "ago", "point", "longer", 
    # "anymore", "reduce", "end", "review", "says"]
    # listStopwords.update(custom_stopwords)
    filtered = []
    for txt in text:
        if txt not in listStopwords:
            filtered.append(txt)
    text = filtered
    return text

def stemmingText(text): # Mengurangi kata ke bentuk dasarnya yang menghilangkan imbuhan awalan dan akhiran atau ke akar kata
    # Membuat objek stemmer
    factory = StemmerFactory()
    stemmer = factory.create_stemmer()
 
    # Memecah teks menjadi daftar kata
    words = text
 
    # Menerapkan stemming pada setiap kata dalam daftar
    stemmed_words = [stemmer.stem(word) for word in words]
 
    # Menggabungkan kata-kata yang telah distem
    stemmed_text = ' '.join(stemmed_words)
 
    return stemmed_text

def toSentence(list_words): # Mengubah daftar kata menjadi kalimat
    sentence = ' '.join(word for word in list_words)
    return sentence

slangwords = {"coc": "clash of clans",
    "u": "you",
    "ur": "your",
    "dont": "do not",
    "cant": "cannot",
    "wont": "will not",
    "ive": "i have",
    "im": "i am",
    "id": "i would",
    "thats": "that is",
    "doesnt": "does not",
    "isnt": "is not",
    "didnt": "did not",
    "wanna": "want to",
    "gonna": "going to",
    "aint": "is not",
    "gimme": "give me",
    "lemme": "let me",
    "ya": "you",
    "nah": "no",
    "cuz": "because",
    "tho": "though",
    "btw": "by the way",
    "gg": "good game",
    "ez": "easy",
    "op": "overpowered",
    "nerf": "reduce power",
    "buff": "increase power",
    "gr8": "great",
    "ty": "thank you"}

def fix_slangword(text):
    words = text.split()
    fixed_words = []

    for word in words:
        if word.lower() in slangwords :
            fixed_words.append(slangwords[word.lower()])
        else :
            fixed_words.append(word)

    fixed_text = ' '.join(fixed_words)
    return fixed_text


In [39]:
# Membersihkan text dan menyimpannya ke `text_clean`
sentiment_df['text_clean'] = sentiment_df['content'].apply(cleaningText)

# Mengubah huruf dalam text menjadi huruf kecil
sentiment_df['text_casefoldingText'] = sentiment_df['text_clean'].apply(casefoldingText)

# Mengganti kata kata slang dengan kata kata standar
sentiment_df['text_slangwords'] = sentiment_df['text_casefoldingText'].apply(fix_slangword)

# Memecah teks menjadi token (kata-kata)
sentiment_df['text_tokenizingText'] = sentiment_df['text_slangwords'].apply(tokenizingText)

# Menghapus kata-kata stop atau kata kata umum
sentiment_df['text_stopword'] = sentiment_df['text_tokenizingText'].apply(filteringText)

# Menggabungkan token-token menjadi kalimat
sentiment_df['text_akhir'] = sentiment_df['text_stopword'].apply(toSentence)


sentiment_df.head(3)

Unnamed: 0,reviewId,content,score,text_clean,text_casefoldingText,text_slangwords,text_tokenizingText,text_stopword,text_akhir
0,59725dd0-533b-47a5-874d-110733215800,This last year has brought some amazing change...,5,This last year has brought some amazing change...,this last year has brought some amazing change...,this last year has brought some amazing change...,"[this, last, year, has, brought, some, amazing...","[last, year, brought, amazing, changes, classi...",last year brought amazing changes classic game...
1,c6c50aae-4e27-4ce3-a8b6-672f812a359a,I've been playing this game for well over 10 y...,3,Ive been playing this game for well over year...,ive been playing this game for well over year...,i have been playing this game for well over ye...,"[i, have, been, playing, this, game, for, well...","[playing, game, well, years, great, latest, up...",playing game well years great latest update ho...
2,0f198656-b900-4b4e-a3ec-383eb3abfa9e,For november 25th version. There are several b...,1,For november th version There are several bugs...,for november th version there are several bugs...,for november th version there are several bugs...,"[for, november, th, version, there, are, sever...","[november, th, version, several, bugs, regardi...",november th version several bugs regarding cla...


In [40]:
# Cari kata kata yang sering muncul. 

from collections import Counter
import nltk
nltk.download('punkt')

text_sentiment = sentiment_df['text_akhir']
text = " ".join(text_sentiment).lower()
words = nltk.word_tokenize(text.lower())
word_freq = Counter(words)
print(word_freq.most_common(150))

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/daffasuada/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


[('game', 39812), ('time', 10795), ('like', 7705), ('play', 6855), ('playing', 6563), ('good', 6417), ('get', 6338), ('upgrade', 5766), ('update', 5337), ('one', 5220), ('new', 5189), ('base', 5187), ('would', 5125), ('years', 4887), ('fun', 4464), ('clash', 4398), ('really', 4365), ('great', 4170), ('attack', 4154), ('clans', 4144), ('supercell', 4098), ('please', 4080), ('troops', 3975), ('also', 3813), ('much', 3788), ('love', 3786), ('even', 3780), ('clan', 3713), ('still', 3601), ('make', 3511), ('th', 3425), ('fix', 3385), ('problem', 3376), ('back', 3062), ('long', 3031), ('games', 2998), ('level', 2899), ('best', 2868), ('builder', 2858), ('since', 2803), ('hall', 2691), ('account', 2624), ('many', 2603), ('town', 2597), ('takes', 2577), ('players', 2547), ('thing', 2513), ('want', 2418), ('lot', 2391), ('way', 2344), ('played', 2308), ('take', 2265), ('need', 2243), ('money', 2209), ('things', 2169), ('times', 2073), ('always', 2042), ('every', 2037), ('people', 2035), ('war',

# Labeling Data

In [41]:
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from textblob import TextBlob

## Laveling Vader

In [42]:
# Labeling Data Using Vader
sentiment_vader_df = sentiment_df.copy()

labeling = SentimentIntensityAnalyzer()

# Hitung skor sentimen dengan VADER
sentiment_vader_df['scores'] = sentiment_vader_df['content'].apply(
    lambda text : labeling.polarity_scores(text)
)

# Ambil nilai compound score
sentiment_vader_df['compound'] = sentiment_vader_df['scores'].apply(lambda score: score['compound'])

# Kategorisasi sentimen sesuai aturan VADER
sentiment_vader_df['sentiment_type_Vanbes'] = sentiment_vader_df['compound'].apply(
    lambda x: 'POSITIVE' if x > 0 else 'NEGATIVE' if x < -0 else 'NEUTRAL'
)

# Hitung jumlah masing-masing sentimen
print(sentiment_vader_df['sentiment_type_Vanbes'].value_counts())
sentiment_vader_df[['sentiment_type_Vanbes','compound', 'text_akhir']].head(2)

sentiment_type_Vanbes
POSITIVE    17378
NEGATIVE     7349
NEUTRAL       273
Name: count, dtype: int64


Unnamed: 0,sentiment_type_Vanbes,compound,text_akhir
0,POSITIVE,0.9678,last year brought amazing changes classic game...
1,POSITIVE,0.785,playing game well years great latest update ho...


## Labeling TextBlob

In [43]:
# Labeling Data Using TextBlob
sentiment_textblob_df = sentiment_df.copy()
sentiment_textblob_df['polarity'] = sentiment_textblob_df['text_akhir'].apply(
    lambda text : TextBlob(text).sentiment.polarity
)

sentiment_textblob_df['label'] = sentiment_textblob_df['polarity'].apply(
    lambda polarity : 'Positife' if polarity > 0 else 'Negative' if polarity < -0 else 'Neutral'
)

print(sentiment_textblob_df[['text_akhir', 'polarity','label']].head())
sentiment_textblob_df['label'].value_counts()

                                          text_akhir  polarity     label
0  last year brought amazing changes classic game...  0.156239  Positife
1  playing game well years great latest update ho...  0.128571  Positife
2  november th version several bugs regarding cla...  0.075000  Positife
3  still getting disconnected every time open app... -0.268519  Negative
4  play years catch someone downloaded game month... -0.255556  Negative


label
Positife    13586
Negative    10915
Neutral       499
Name: count, dtype: int64

# Skema Pelatihan

In [44]:
from sklearn.model_selection import train_test_split

from sklearn.feature_extraction.text import TfidfVectorizer
from gensim.models import Word2Vec

from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import accuracy_score, classification_report

### Ekstraksi Fitur: TF-IDF,    Pembagian Data: 80/20, Labeling: TextBlob

In [45]:
# Pisahkan data menjadi fitur (tweet) dan label (sentimen)
X = sentiment_textblob_df['text_akhir']
y = sentiment_textblob_df['label']

# Ekstraksi Fitur : TF-IDF
vectorizer = TfidfVectorizer(max_features=3500, min_df=8, max_df=0.90)
X_tfidf = vectorizer.fit_transform(X)

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.2, random_state=42, stratify=y)

#### Support Vector Machine(SVM)

In [46]:
svm_model = SVC(kernel='rbf',random_state=42, class_weight= 'balanced')
svm_model.fit(X_train, y_train)

# Prediksi
y_pred_train = svm_model.predict(X_train)
y_pred_test = svm_model.predict(X_test)

# Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Support Vector Machine - accuracy_train:', accuracy_train)
print('Support Vector Machine - accuracy_test:', accuracy_test)

Support Vector Machine - accuracy_train: 0.98675
Support Vector Machine - accuracy_test: 0.8772


#### Logistic Regression

In [47]:
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_train, y_train)

# Prediksi
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

#Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Logistic Regression - accuracy_train:', accuracy_train)
print('Logistic Regression - accuracy_test:', accuracy_test)

Logistic Regression - accuracy_train: 0.941
Logistic Regression - accuracy_test: 0.9006


#### Random Forest

In [48]:
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Prediksi
y_pred_train = random_forest.predict(X_train)
y_pred_test = random_forest.predict(X_test)

# Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Random Forest - accuracy_train:', accuracy_train)
print('Random Forest - accuracy_test:', accuracy_test)

Random Forest - accuracy_train: 0.7256
Random Forest - accuracy_test: 0.6904


### Ekstraksi Fitur: Word2Vec,    Pembagian Data: 80/20, Labeling: Vader

In [49]:
# Tokenisasi teks
sentiment_vader_df["tokens"] = sentiment_vader_df["text_akhir"].apply(word_tokenize)
sentences = sentiment_vader_df["tokens"]

# Train Word2Vec
model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, workers=4)
def sentence_vector(sentence, model):
    vectors = [model.wv[word] for word in sentence if word in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(100)

# Konversi semua teks menjadi vektor
X = np.array([sentence_vector(sentence, model) for sentence in sentences])
y = sentiment_vader_df["sentiment_type_Vanbes"]

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

#### Support Vector Machine(SVM)

In [50]:
svm_model = SVC(kernel='rbf',random_state=42, class_weight= 'balanced')
svm_model.fit(X_train, y_train)

# Prediksi
y_pred_train = svm_model.predict(X_train)
y_pred_test = svm_model.predict(X_test)

# Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Support Vector Machine - accuracy_train:', accuracy_train)
print('Support Vector Machine - accuracy_test:', accuracy_test)

Support Vector Machine - accuracy_train: 0.7842
Support Vector Machine - accuracy_test: 0.7536


#### Logistic Regression

In [51]:
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_train, y_train)

# Prediksi
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

#Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Logistic Regression - accuracy_train:', accuracy_train)
print('Logistic Regression - accuracy_test:', accuracy_test)

Logistic Regression - accuracy_train: 0.8138
Logistic Regression - accuracy_test: 0.7974


#### Random Forest

In [52]:
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Prediksi
y_pred_train = random_forest.predict(X_train)
y_pred_test = random_forest.predict(X_test)

# Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Random Forest - accuracy_train:', accuracy_train)
print('Random Forest - accuracy_test:', accuracy_test)

Random Forest - accuracy_train: 0.88555
Random Forest - accuracy_test: 0.7844


### Ekstraksi Fitur: TF-IDF,    Pembagian Data: 70/30    , Labeling: TextBlob

In [53]:
# Pisahkan data menjadi fitur (tweet) dan label (sentimen)
X = sentiment_textblob_df['text_akhir']
y = sentiment_textblob_df['label']

# Ekstraksi Fitur : TF-IDF
vectorizer = TfidfVectorizer(max_features=3500, min_df=8, max_df=0.90)
X_tfidf = vectorizer.fit_transform(X)

# Bagi data menjadi data latih dan data uji
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y, test_size=0.3, random_state=42, stratify=y)

#### Support Vector Machine(SVM)

In [54]:
svm_model = SVC(kernel='rbf',random_state=42, class_weight= 'balanced')
svm_model.fit(X_train, y_train)

# Prediksi
y_pred_train = svm_model.predict(X_train)
y_pred_test = svm_model.predict(X_test)

# Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Support Vector Machine - accuracy_train:', accuracy_train)
print('Support Vector Machine - accuracy_test:', accuracy_test)

Support Vector Machine - accuracy_train: 0.9866285714285714
Support Vector Machine - accuracy_test: 0.8705333333333334


#### Logistic Regression

In [55]:
lr_model = LogisticRegression(max_iter=500, random_state=42)
lr_model.fit(X_train, y_train)

# Prediksi
y_pred_train = lr_model.predict(X_train)
y_pred_test = lr_model.predict(X_test)

#Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Logistic Regression - accuracy_train:', accuracy_train)
print('Logistic Regression - accuracy_test:', accuracy_test)

Logistic Regression - accuracy_train: 0.9397142857142857
Logistic Regression - accuracy_test: 0.8945333333333333


#### Random Forest

In [56]:
random_forest = RandomForestClassifier(n_estimators=100, max_depth=10, random_state=42)
random_forest.fit(X_train, y_train)

# Prediksi
y_pred_train = random_forest.predict(X_train)
y_pred_test = random_forest.predict(X_test)

# Evaluasi model
accuracy_train = accuracy_score(y_pred_train, y_train)
accuracy_test = accuracy_score(y_pred_test, y_test)

# Menampilkan akurasi
print('Random Forest - accuracy_train:', accuracy_train)
print('Random Forest - accuracy_test:', accuracy_test)

Random Forest - accuracy_train: 0.7335428571428572
Random Forest - accuracy_test: 0.6878666666666666
