# Packages

In [1]:
!pip install PySastrawi

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/210.6 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━[0m [32m143.4/210.6 kB[0m [31m4.2 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m4.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Corpus

In [3]:
# Dataset key -> CSV path. Only the combined file is loaded; the per-city
# entries below are disabled and kept for reference.
file_paths = {
    "all_locations": "/content/all_locations.csv",
    # "bandung": "/content/bandung.csv",
    # "banjarbaru": "/content/banjarbaru.csv",
    # "bengkulu": "/content/bengkulu.csv",
    # "denpasar": "/content/denpasar.csv",
    # "jakarta": "/content/jakarta.csv",
    # "jayapura": "/content/jayapura.csv",
    # "maluku": "/content/maluku.csv",
    # "semarang": "/content/semarang.csv",
    # "surabaya": "/content/surabaya.csv",
    # "yogyakarta": "/content/yogyakarta.csv"
}

datasets = {key: pd.read_csv(path) for key, path in file_paths.items()}

# Data preprocessing

## Stemming

In [4]:
# Shared accumulator: `stemming` appends to this list and other cells read it
# directly. It is NOT reset between calls, so calling `stemming` twice leaves
# two batches of documents in it.
stemmed_documents = []

def stemming(city, column):
  """Stem and stop-word-filter every document in ``datasets[city][column]``.

  Each document is split on whitespace, every token is passed through the
  PySastrawi stemmer, the stems are re-joined with single spaces, and the
  PySastrawi stop-word remover is applied to the joined text.

  Args:
    city: Key into the module-level ``datasets`` dict.
    column: Name of the text column to process.

  Returns:
    The module-level ``stemmed_documents`` list, with one cleaned text per row
    of the column appended to it (missing cells become empty strings so the
    output stays aligned with the input rows).
  """
  corpus = datasets[city][column]

  # Stop-word remover and stemmer from PySastrawi, built once per call.
  remover = StopWordRemoverFactory().create_stop_word_remover()
  stemmer = StemmerFactory().create_stemmer()

  for doc in corpus:
    # Empty CSV cells arrive as NaN (a float) and have no .split(); treat them
    # as empty text, and coerce any other non-string value to its string form.
    if not isinstance(doc, str):
      doc = "" if pd.isna(doc) else str(doc)

    stemmed_doc = [stemmer.stem(word) for word in doc.split()]
    cleaned_text = remover.remove(' '.join(stemmed_doc))
    stemmed_documents.append(cleaned_text)

  return stemmed_documents

## Tokenization

In [5]:
# Settings passed to the Keras Tokenizer below
num_words = None
oov_tok = "<OOV>"
lower = True
char_level = False
filters = '!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

def tokenizer(city, column, save, stemmed_documents):
  """Fit a Keras ``Tokenizer`` on the documents and print its vocabulary tables.

  Two frames are printed: the word -> index mapping, and the word counts
  sorted from most to least frequent. Nothing is returned or written to disk.

  Args:
    city: Not used by this function.
    column: Not used by this function.
    save: Not used by this function.
    stemmed_documents: Iterable of texts to fit the tokenizer on.
  """
  text_tokenizer = Tokenizer(
      num_words=num_words,
      filters=filters,
      oov_token=oov_tok,
      lower=lower,
      char_level=char_level,
  )
  text_tokenizer.fit_on_texts(stemmed_documents)

  # Vocabulary: one row per word with its integer index
  index_rows = list(text_tokenizer.word_index.items())
  word_index_df = pd.DataFrame(index_rows, columns=['word', 'index'])

  # Frequencies: one row per word, most frequent first
  count_rows = list(text_tokenizer.word_counts.items())
  word_counts_df = pd.DataFrame(count_rows, columns=['word', 'count'])
  word_counts_df = word_counts_df.sort_values(by='count', ascending=False)

  # Show both tables
  print(word_index_df)
  print(word_counts_df)

## Preprocessing

In [6]:
def preprocessing(city, column, save):
  """Run stemming on ``datasets[city][column]`` and then tokenize the result.

  Args:
    city: Key into the module-level ``datasets`` dict.
    column: Name of the text column to process.
    save: Forwarded unchanged to ``tokenizer``.
  """
  # `stemming` returns the shared `stemmed_documents` list it appended to.
  documents = stemming(city, column)
  tokenizer(city, column, save, documents)

In [7]:
# Stem and tokenize the "metadata" column of the combined dataset
preprocessing(city="all_locations", column="metadata", save=True)

              word  index
0            <OOV>      1
1           pantai      2
2           wisata      3
3             alam      4
4             kota      5
...            ...    ...
5833     relaksasi   5834
5834      humboldt   5835
5835        guinea   5836
5836  menetralisir   5837
5837        rabual   5838

[5838 rows x 2 columns]
        word  count
238   pantai   1000
104   wisata    781
194     alam    559
50      kota    500
93     taman    479
...      ...    ...
3320    type      1
3321   iklim      1
3322   curah      1
3323      mm      1
5836  rabual      1

[5837 rows x 2 columns]


# Word Embedding

## TF-IDF

### TF-IDF in One Corpus

In [8]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

def tfidf_corpus(stemmed_documents, save):
    """Rank the vocabulary by TF-IDF weight summed over all documents.

    A ``TfidfVectorizer`` with default settings is fitted on the documents and
    each term's weights are added up across every document.

    Args:
        stemmed_documents: Iterable of preprocessed texts.
        save: When truthy, also write the table to
            ``tfidf_corpus_keywords.xlsx`` in the working directory.

    Returns:
        A DataFrame with columns ``word`` and ``tfidf``, sorted by ``tfidf``
        in descending order. The frame is returned whether or not it was
        saved, so the computation is never silently discarded.
    """
    vectorizer = TfidfVectorizer()

    # Document-term matrix: one row per document, one column per term
    response = vectorizer.fit_transform(stemmed_documents)
    feature_names = vectorizer.get_feature_names_out()

    # Column sums give one aggregate weight per term; .A1 flattens to 1-D
    tfidf_values = response.sum(axis=0).A1

    df = pd.DataFrame({'word': feature_names, 'tfidf': tfidf_values})
    df = df.sort_values(by='tfidf', ascending=False)

    if save:
        df.to_excel('tfidf_corpus_keywords.xlsx', index=False)

    return df

In [9]:
# Corpus-level keyword ranking, exported to Excel
tfidf_corpus(stemmed_documents, save=True)

Unnamed: 0,word,tfidf
3873,pantai,52.542565
5763,wisata,28.693025
5158,taman,26.394633
456,alam,22.466254
431,air,21.207028
...,...,...
1839,gouverments,0.018955
2858,lalulintas,0.018955
3994,pengantin,0.018955
6,023,0.018955


### TF-IDF in All Documents

In [10]:
def tfidf_docs(stemmed_documents, save):
    """Build the per-document TF-IDF table (terms as rows, documents as columns).

    A ``TfidfVectorizer`` with default settings is fitted on the documents and
    the resulting matrix is densified and transposed.

    Args:
        stemmed_documents: Iterable of preprocessed texts.
        save: When truthy, also write the table (including its term index) to
            ``tfidf_docs_keywords.xlsx`` in the working directory.

    Returns:
        A DataFrame indexed by term, with one column per document in input
        order. The frame is returned whether or not it was saved, so the
        computation is never silently discarded.
    """
    vectorizer = TfidfVectorizer()

    # Document-term matrix: one row per document, one column per term
    response = vectorizer.fit_transform(stemmed_documents)
    feature_names = vectorizer.get_feature_names_out()

    # Densify, label the columns by term, then flip so terms become the rows
    df = pd.DataFrame(response.toarray(), columns=feature_names).transpose()

    if save:
        df.to_excel('tfidf_docs_keywords.xlsx', index=True)

    return df

In [11]:
# Per-document TF-IDF matrix, exported to Excel
tfidf_docs(stemmed_documents, save=True)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,732,733,734,735,736,737,738,739,740,741
00,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
000,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
01,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
016,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
02,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
zheng,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
ziarah,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zona,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.107393,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
zoo,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.000000,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
