<a href="https://colab.research.google.com/github/AngelitaPrettyciaHarefa/AngelitaPrettyciaHarefa.github.io/blob/main/SISTEM.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# SCRAPING DATA

In [None]:
!pip install -qq google-play-scraper

In [None]:
import json
import pandas as pd

from tqdm import tqdm
from pygments import highlight
from pygments.lexers import JsonLexer
from pygments.formatters import TerminalFormatter
from google_play_scraper import Sort, reviews, app

In [None]:
from google.colab import drive

# Mount Google Drive and derive the base data directory from the same mount
# point, so the path stays valid even if the working directory is not /content.
MOUNT_POINT = '/content/gDrive/'
drive.mount(MOUNT_POINT, force_remount=True)
# Base directory (with trailing slash) that every CSV path below is appended to.
path = MOUNT_POINT + "MyDrive/"

In [None]:
# Google Play package IDs whose reviews are scraped by the loop below.
app_packages = ['com.simasmobile.co.id']

In [None]:
def print_json(json_object):
  """Print ``json_object`` as indented, key-sorted JSON with terminal colouring.

  Values that ``json`` cannot serialise natively are converted with ``str``
  (via ``default=str``). Returns ``None``; the text is written to stdout.
  """
  serialised = json.dumps(json_object, indent=2, sort_keys=True, default=str)
  coloured = highlight(serialised, JsonLexer(), TerminalFormatter())
  print(coloured)

In [None]:
# Accumulates the raw review records of every package in app_packages.
app_reviews = []

for ap in tqdm(app_packages):
  # First page: up to 1000 Indonesian reviews, most relevant first, all scores.
  rvs, continuation_token = reviews(
      ap,
      lang='id',
      country='id',
      sort=Sort.MOST_RELEVANT,
      count=1000,
      filter_score_with=None
  )
  app_reviews.extend(rvs)

  # Second page: continue from where the first call stopped.
  rvs, _ = reviews(
      ap,
      continuation_token=continuation_token
  )
  # Keep the second page too, so the fetched reviews are not thrown away.
  app_reviews.extend(rvs)

# Preview one record; guard against an empty result so the cell does not
# fail with an IndexError when nothing was returned.
if app_reviews:
  print_json(app_reviews[0])
else:
  print("No reviews were returned.")

In [None]:
# Build a DataFrame from the collected review records and preview the first rows.
df_app_reviews = pd.DataFrame(app_reviews)
df_app_reviews.head()

In [None]:
# Re-order the reviews by the "at" column in descending order
# (presumably the review timestamp, i.e. newest first) and preview the result.
df_app_reviews = df_app_reviews.sort_values(by="at", ascending=False)
df_app_reviews.head()

In [None]:
# Number of review records collected by the scraping loop.
len(app_reviews)

In [None]:
# Summarise the columns, dtypes and non-null counts of the scraped reviews.
df_app_reviews.info()

In [None]:
# Persist the scraped reviews as Reviews.csv under the Drive base path,
# with a header row and without the index column.
df_app_reviews.to_csv(path+'Reviews.csv', index=None, header=True)

# PRE-PROCESSING DATA

In [None]:
import pandas as pd
import numpy as np
import string
import re
import nltk

from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

# Download the NLTK resources for this section ("punkt" is presumably the
# tokenizer model behind word_tokenize).
# NOTE(review): "wordnet" is not referenced anywhere else in the visible notebook — confirm it is needed.
nltk.download("punkt")
nltk.download("wordnet")

In [None]:
# Reload the scraped reviews from Drive so this section can run on its own.
df_app_reviews = pd.read_csv(path+"Reviews.csv")
# Keep only the review text and star rating. .copy() makes df_review an
# independent frame, so the column assignments in the following cells do not
# trigger SettingWithCopyWarning on a slice of df_app_reviews.
df_review = df_app_reviews[['content', 'score']].copy()
# Empty text fields are read back from CSV as NaN (a float); normalise them to
# empty strings so the string-based cleaning functions below do not crash.
df_review['content'] = df_review['content'].fillna('').astype(str)

df_review.head()

In [None]:
# =============== CLEANING ===============

def remove_content_special(text):
    """Strip escape artefacts, non-ASCII characters, mentions, hashtags and links.

    Args:
        text: Raw review text. ``None`` and NaN are treated as an empty
            string; any other non-string value is converted with ``str``.

    Returns:
        The cleaned text with whitespace runs collapsed to single spaces.
        Non-ASCII characters are replaced by ``?``.
    """
    # Missing reviews show up as None or float NaN (NaN != NaN) after a CSV round-trip.
    if text is None or (isinstance(text, float) and text != text):
        text = ""
    elif not isinstance(text, str):
        text = str(text)
    # Drop literal (two-character) escape sequences \t, \n, \u and any remaining backslash.
    text = text.replace('\\t', " ").replace('\\n', " ").replace('\\u', " ").replace('\\', "")
    # Replace non-ASCII characters (emoji, CJK, etc.) with '?'.
    text = text.encode('ascii', 'replace').decode('ascii')
    # Remove @mentions, #hashtags and scheme://links. A raw string keeps the
    # regex escapes from being read as (invalid) Python string escapes.
    text = ' '.join(re.sub(r"([@#][A-Za-z0-9]+)|(\w+:\/\/\S+)", " ", text).split())
    # Remove URL schemes left behind by incomplete links.
    return text.replace("http://", " ").replace("https://", " ")
# Run the special-content cleaner over every raw review; the cleaning steps
# below overwrite this same 'content_cleaning' column one after another.
df_review['content_cleaning'] = df_review['content'].apply(remove_content_special)

#remove number
def remove_number(text):
    """Return ``text`` with every run of digits removed."""
    digit_run = re.compile(r"\d+")
    return digit_run.sub("", text)
# Strip digits from the text cleaned so far.
df_review['content_cleaning'] = df_review['content_cleaning'].apply(remove_number)

#remove punctuation
def remove_punctuation(text):
    """Return ``text`` without any character listed in ``string.punctuation``."""
    return "".join(char for char in text if char not in string.punctuation)
# Drop ASCII punctuation from the text cleaned so far.
df_review['content_cleaning'] = df_review['content_cleaning'].apply(remove_punctuation)

#remove whitespace leading & trailing
def remove_whitespace_LT(text):
    """Return ``text`` with leading and trailing whitespace trimmed."""
    without_leading = text.lstrip()
    return without_leading.rstrip()
# Trim whitespace at both ends of the text cleaned so far.
df_review['content_cleaning'] = df_review['content_cleaning'].apply(remove_whitespace_LT)

#remove multiple whitespace into single whitespace
def remove_whitespace_multiple(text):
    """Collapse every run of whitespace in ``text`` into a single space.

    Leading and trailing runs are collapsed to one space as well, not removed.
    """
    # Raw string: '\s' in a normal string literal is an invalid Python escape
    # sequence and triggers a warning on recent Python versions.
    return re.sub(r'\s+', ' ', text)
# Collapse repeated whitespace inside the text cleaned so far.
df_review['content_cleaning'] = df_review['content_cleaning'].apply(remove_whitespace_multiple)

#remove single char
def remove_singl_char(text):
    """Delete every standalone single ASCII letter from ``text``.

    Only the letter is removed; the whitespace around it is left in place.
    """
    lone_letter = re.compile(r"\b[a-zA-Z]\b")
    return lone_letter.sub("", text)
# Remove standalone single letters, which finishes the cleaning chain.
df_review['content_cleaning'] = df_review['content_cleaning'].apply(remove_singl_char)

# Preview the raw text next to the cleaned text.
df_review.head()

In [None]:
# =============== CASE FOLDING ===============

def case_folding(text):
  """Return a lower-cased copy of ``text``."""
  return text.lower()
# Store the lower-cased version of the cleaned text in its own column.
df_review['content_case_folding'] = df_review['content_cleaning'].apply(case_folding)

# Preview the frame with the new column.
df_review.head()

In [None]:
# =============== TOKENIZING ===============

def word_tokenize_wrapper(text):
    """Tokenize ``text`` with NLTK's ``word_tokenize`` and return the result."""
    tokens = word_tokenize(text)
    return tokens
# Tokenize the lower-cased text; each row of 'content_tokens' holds the tokenizer output.
df_review['content_tokens'] = df_review['content_case_folding'].apply(word_tokenize_wrapper)

# Preview the frame with the new column.
df_review.head()

In [None]:
# ================ STOPWORDS REMOVAL ===============
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords

# Load NLTK's Indonesian stopword list. This rebinds the name `stopwords`
# from the imported corpus reader to a plain list.
stopwords = stopwords.words('indonesian')
# Extra tokens to drop (presumably the brand name plus informal abbreviations — confirm with the author).
list_stopwords = ['sinarmas', 'yg', 'gak', 'gk', 'dmn', 'nya', 'tp', 'bgs', 'dgn', 'bkn']
stopwords.extend(list_stopwords)

# Show the combined stopword list. The same list is built again in the next cell.
print(stopwords)

In [None]:
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords

# Indonesian stopwords from NLTK plus the project-specific extras.
# `stopwords` is rebound from the corpus reader to a plain list here.
stopwords = stopwords.words('indonesian')
list_stopwords = ['sinarmas', 'yg', 'gak', 'gk', 'dmn', 'nya', 'tp', 'bgs', 'dgn', 'bkn']
stopwords.extend(list_stopwords)
# Set copy of the list: membership tests are constant-time instead of a linear
# scan over the whole list for every token of every review.
stopwords_set = set(stopwords)

def stopwords_removal(words):
    """Return the tokens of ``words`` that are not in the stopword set.

    Args:
        words: Iterable of tokens for one review.

    Returns:
        A new list with the stopwords filtered out, in the original order.
    """
    return [word for word in words if word not in stopwords_set]
df_review['content_stopwords_removal'] = df_review['content_tokens'].apply(stopwords_removal)

df_review.head()

In [None]:
# =============== STEMMING ===============

!pip install Sastrawi
!pip install swifter

from sklearn.pipeline import Pipeline
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
import swifter

In [None]:
# Build the Sastrawi stemmer once; every row reuses this single instance.
factory = StemmerFactory()
stemmer = factory.create_stemmer()

def stemming(text):
  """Stem each token of ``text`` with the module-level ``stemmer``.

  ``text`` is an iterable of tokens; the result is a list of stemmed tokens
  in the same order.
  """
  return list(map(stemmer.stem, text))

# Apply the stemmer row by row through swifter's apply accessor.
df_review['content_stemmed'] = df_review['content_stopwords_removal'].swifter.apply(stemming)

df_review.head()

In [None]:
# Join each row's stemmed tokens back into one space-separated string.
df_review['content_preprocessing'] = df_review['content_stemmed'].apply(' '.join)
df_review.head()

In [None]:
# Cast the joined text column to str explicitly.
df_review['content_preprocessing'] = df_review['content_preprocessing'].astype('str')

# Summarise the columns, dtypes and non-null counts after preprocessing.
df_review.info()

In [None]:
# Persist every preprocessing column as Text_Preprocessing.csv under the Drive
# base path, with a header row and without the index column.
df_review.to_csv(path+"Text_Preprocessing.csv", index=None, header=True)

# PELABELAN (SENTIMEN)

In [None]:
import pandas as pd
# Reload the preprocessed reviews written by the previous section.
df_preprocessing = pd.read_csv(path+"Text_Preprocessing.csv")

# Preview the first rows.
df_preprocessing.head()

In [None]:
# Number of reviews per star rating.
df_preprocessing['score'].value_counts()

In [None]:
# =============== PELABELAN ===============
# Map each star rating to a sentiment class:
#   above 3 -> 1 (positive), below 3 -> -1 (negative), anything else -> 0 (neutral).
label = [
  1 if score > 3 else (-1 if score < 3 else 0)
  for score in df_preprocessing['score']
]

# Attach the sentiment class as a new column.
df_preprocessing['label'] = label

df_preprocessing.head()

In [None]:
# Share of each sentiment label, as fractions of all rows.
df_preprocessing['label'].value_counts(normalize=True)

In [None]:
# Summarise the columns, dtypes and non-null counts after labelling.
df_preprocessing.info()

In [None]:
# =============== WORDCLOUD ===============

from wordcloud import WordCloud
import matplotlib.pyplot as plt

In [None]:
# Polarity == 1 -> positive reviews
train_positif = df_preprocessing[df_preprocessing['label'] == 1]
# The joined, stemmed text is stored in 'content_preprocessing'; the visible
# notebook never creates a 'content_join' column. dropna() skips reviews whose
# cleaned text is missing, which would otherwise show up as the word "nan".
all_text_positif = ' '.join(train_positif['content_preprocessing'].dropna().astype(str))
wordcloud = WordCloud(colormap='Blues', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_positif)
plt.figure(figsize=(10,10))
plt.title('Visualisasi Sentimen Positif')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

In [None]:
# Polarity == 0 -> neutral reviews
train_netral = df_preprocessing[df_preprocessing["label"] == 0]
# The joined, stemmed text is stored in 'content_preprocessing'; the visible
# notebook never creates a 'content_join' column. dropna() skips reviews whose
# cleaned text is missing, which would otherwise show up as the word "nan".
all_text_netral = ' '.join(train_netral["content_preprocessing"].dropna().astype(str))
wordcloud = WordCloud(colormap='Greens', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_netral)
plt.figure(figsize=(10,10))
plt.title('Visualisasi Sentimen Netral')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

In [None]:
# Polarity == -1 -> negative reviews
train_negatif = df_preprocessing[df_preprocessing["label"] == -1]
# The joined, stemmed text is stored in 'content_preprocessing'; the visible
# notebook never creates a 'content_join' column. dropna() skips reviews whose
# cleaned text is missing, which would otherwise show up as the word "nan".
all_text_negatif = ' '.join(train_negatif["content_preprocessing"].dropna().astype(str))
wordcloud = WordCloud(colormap='Reds', width=1000, height=1000, mode='RGBA', background_color='white').generate(all_text_negatif)
plt.figure(figsize=(10,10))
plt.title('Visualisasi Sentimen Negatif')
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.margins(x=0, y=0)
plt.show()

In [None]:
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")

#Shape of the dataset, and breakdown of the classes
print(f"rating 1 = {len(df_preprocessing[df_preprocessing['score']==1])} rows")
print(f"rating 2 = {len(df_preprocessing[df_preprocessing['score']==2])} rows")
print(f"rating 3 = {len(df_preprocessing[df_preprocessing['score']==3])} rows")
print(f"rating 4 = {len(df_preprocessing[df_preprocessing['score']==4])} rows")
print(f"rating 5 = {len(df_preprocessing[df_preprocessing['score']==5])} rows")

# Missing values in the dataset
print(f"Number of null in label: { df_preprocessing['score'].isnull().sum() }")
print(f"Number of null in text: { df_preprocessing['score'].isnull().sum()}")

In [None]:
# Bar chart of review counts per star rating; the trailing ';' hides the text repr.
sns.countplot(x='score', data=df_preprocessing);

In [None]:
import seaborn as sns
%matplotlib inline
sns.set_style("whitegrid")

#Shape of the dataset, and breakdown of the classes
print(f"1 = {len(df_preprocessing[df_preprocessing['label']==1])} rows")
print(f"0 = {len(df_preprocessing[df_preprocessing['label']==0])} rows")
print(f"-1 = {len(df_preprocessing[df_preprocessing['label']==-1])} rows")

# Missing values in the dataset
print(f"Number of null in label: { df_preprocessing['label'].isnull().sum() }")
print(f"Number of null in text: { df_preprocessing['label'].isnull().sum()}")

In [None]:
# This chart counts reviews per sentiment label, not per star rating, so it
# carries the sentiment title (the same wording as the sentiment pie chart).
plt.title('Sentimen Score Ulasan')
sns.countplot(x='label', data=df_preprocessing);

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Number of reviews per star rating.
data = df_preprocessing['score'].value_counts()

# Draw the counts as a bar chart on an explicit 6x6-inch figure and axes.
fig, ax = plt.subplots(figsize=(6, 6))
data.plot(kind='bar', ax=ax)
ax.set_xlabel('Rating')
ax.set_ylabel('Jumlah Ulasan')
ax.set_title('Rating Score Ulasan')

# Show the plot.
plt.show()

In [None]:
import pandas as pd
import matplotlib.pyplot as plt

# Create the bar chart on a 6x6-inch figure.
plt.figure(figsize=(6, 6))

# Draws one bar per review row: x position is the sentiment label, bar height is the star rating.
# NOTE(review): rows sharing a label land on the same x position, so bars overlap and this is
# not an aggregated count — confirm which chart was intended.
plt.axes().bar(df_preprocessing['label'], df_preprocessing['score'])

plt.xlabel('Rating')
plt.ylabel('Score')
plt.title('Rating Score Ulasan')

# Show the plot.
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Number of reviews per star rating.
data = df_preprocessing['score'].value_counts()
# One zero offset per slice. The length follows the data, so the chart still
# works when fewer than five distinct ratings are present.
explode = [0] * len(data)

# Create the pie chart.
plt.figure(figsize=(7, 7))
data.plot(kind='pie', autopct='%1.1f%%', startangle=90, explode=explode, label='')

# Show the plot.
plt.title('Rating Score Ulasan')
plt.show()

In [None]:
import matplotlib.pyplot as plt

# Number of reviews per sentiment label.
data = df_preprocessing['label'].value_counts()
# One zero offset per slice. The length follows the data, so the chart still
# works when one of the three sentiment classes is absent.
explode = [0] * len(data)

# Create the pie chart.
plt.figure(figsize=(7, 7))
data.plot(kind='pie', autopct='%1.1f%%', startangle=90, explode=explode, label='')

# Show the plot.
plt.title('Sentimen Score Ulasan')
plt.show()

In [None]:
# Persist the labelled reviews as Pelabelan_Text.csv under the Drive base path,
# with a header row and without the index column.
df_preprocessing.to_csv(path+"Pelabelan_Text.csv", index=None, header=True)

# TF-IDF

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer

vectorizer = TfidfVectorizer()
# Fit on the joined, stemmed text in 'content_preprocessing'; the visible
# notebook never creates a 'content_join' column. fillna('') keeps reviews with
# missing text as empty documents instead of turning NaN into the token "nan".
text_tf = vectorizer.fit_transform(df_preprocessing['content_preprocessing'].fillna('').astype('U'))
print(text_tf)

In [None]:
# Vocabulary learned by the fitted TF-IDF vectorizer.
vectorizer.get_feature_names_out()

In [None]:
# Dense view of the TF-IDF matrix.
# NOTE(review): todense() materialises the full documents x features array — confirm it fits in memory for larger corpora.
text_tf.todense()

In [None]:
# Shape of the TF-IDF matrix (documents, features), read straight from the
# sparse matrix so no dense copy is allocated just to report the dimensions.
print(text_tf.shape)

In [None]:
import pandas as pd

# Term-document table: one row per vocabulary term, one column per review (D1..Dn).
# The column count comes from the matrix itself, so this cell does not depend on
# the 'content_join' column, which the visible notebook never creates.
df = pd.DataFrame(text_tf.todense().T,
                  index=vectorizer.get_feature_names_out(),
                  columns=[f'D{i+1}' for i in range(text_tf.shape[0])])
df

# PEMBAGIAN DATA TRAINING DAN DATA TESTING
Rasio pembagian data training : data testing = 80:20

In [None]:
# SPLITTING DATA (author's note: try k-fold validation, presumably cross validation)
from sklearn.model_selection import train_test_split

# Features are the TF-IDF matrix; targets are the sentiment labels.
x, y = text_tf, df_preprocessing['label']

# Hold out 20% of the rows for testing, with a fixed seed for reproducibility.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

# Report the shape of every split in the order train/test features, train/test targets.
for split in (x_train, x_test, y_train, y_test):
  print(split.shape)

In [None]:
# OVERSAMPLING
# Class counts in the training split before resampling. The vectorised
# .sum() replaces the builtin sum(), which iterates element by element.
print("Before Positif : {}".format((y_train == 1).sum()))
print("Before Negatif : {}".format((y_train == -1).sum()))
print("Before Netral  : {} \n".format((y_train == 0).sum()))

from imblearn.over_sampling import SMOTE
sm = SMOTE(random_state = 42)
# Resample the training split only. to_numpy() replaces Series.ravel(),
# which is deprecated in recent pandas releases.
x_train_res, y_train_res = sm.fit_resample(x_train, y_train.to_numpy())

print("After x_train : {}".format(x_train_res.shape))
print("After y_train : {} \n".format(y_train_res.shape))

# Class counts after resampling.
print("After Positif : {}".format((y_train_res == 1).sum()))
print("After Negatif : {}".format((y_train_res == -1).sum()))
print("After Netral  : {}".format((y_train_res == 0).sum()))

# IMPLEMENTASI METODE SVM

In [None]:
from sklearn import svm
# The metric helpers are imported here because no earlier cell brings them into
# scope; without this the cell stops with a NameError on a fresh kernel.
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    confusion_matrix,
    f1_score,
    precision_score,
    recall_score,
)

# Train a linear-kernel SVM on the oversampled training data and predict the held-out split.
elf = svm.SVC(kernel='linear').fit(x_train_res, y_train_res)
predicted = elf.predict(x_test)
print("SVM Accuracy ", accuracy_score(y_test, predicted))
# Macro averaging weights the three classes equally. pos_label only applies to
# average="binary", so it is not passed here.
print("SVM Precision ", precision_score(y_test, predicted, average="macro"))
print("SVM Recall ", recall_score(y_test, predicted, average="macro"))
print("SVM f1_Score ", f1_score(y_test, predicted, average="macro"))

print(f'confusion Matrix : \n {confusion_matrix(y_test, predicted)}')

print(classification_report(y_test, predicted, zero_division=0))

In [None]:
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix

# Actual labels of the held-out split (from the train/test split cell).
y_actual = y_test

# Labels predicted by the SVM (from the SVM cell), so the heatmap shows the
# model's real results rather than hand-written example values.
y_pred = predicted

# Display names for the sentiment classes, and the numeric codes they stand for
# in the 'label' column (1 positive, -1 negative, 0 neutral), in the same order.
labels = ["positif", "negatif", "netral"]
label_values = [1, -1, 0]

# Compute the confusion matrix with rows and columns in label_values order.
cm = confusion_matrix(y_actual, y_pred, labels=label_values)

# Visualise the confusion matrix; rows are actual classes, columns are predictions.
plt.figure(figsize=(8, 6))
sns.heatmap(cm, annot=True, fmt="d", cmap="Blues", xticklabels=labels, yticklabels=labels)
plt.xlabel("Prediksi")
plt.ylabel("Aktual")
plt.title("Confusion Matrix")
plt.show()

# TEST
Irwansight

In [None]:
#y = score

In [None]:
# Handling imbalanced
#from imblearn.over_sampling import SMOTE
#smote = SMOTE(k_neighbors = 1)
#x_sm, y_sm = smote.fit_resample(X_tfidf,y)

In [None]:
#from sklearn import model_selection
#Train_X, Test_X, Train_Y, Test_Y =model_selection.train_test_split(x_sm, y_sm, test_size = 0.1, random_state = 0)

In [None]:
#from sklearn.metrics import f1_score
#from sklearn.svm import SVC
# clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto'))
#svm = SVC()
#svm.fit(Train_X, Train_Y)
#Pred_Y = svm.predict(Test_X)

In [None]:
#f1_score(Test_Y, Pred_Y, average='macro')

In [None]:
#f1_score(Test_Y, Pred_Y, average='micro')

In [None]:
#f1_score(Test_Y, Pred_Y, average='weighted')

In [None]:
#f1_score(Test_Y, Pred_Y, average=None)