# Installation

In [2]:
!pip install gensim
!pip install PySastrawi
!pip install requests

Collecting PySastrawi
  Downloading PySastrawi-1.2.0-py2.py3-none-any.whl (210 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m210.6/210.6 kB[0m [31m3.9 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: PySastrawi
Successfully installed PySastrawi-1.2.0


# Packages

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
import gensim
import nltk
from collections import OrderedDict
from tensorflow.keras.preprocessing.text import Tokenizer
from sklearn.feature_extraction.text import TfidfVectorizer
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from sklearn.metrics.pairwise import cosine_distances
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

# Dataset

In [4]:
# City key -> location of that city's CSV file.
file_paths = dict(
    bandung="/content/bandung.csv",
)

# Read every listed CSV into a DataFrame, stored under the same city key.
datasets = {city_key: pd.read_csv(file_paths[city_key]) for city_key in file_paths}

# Data Preprocessing

In [5]:
class TextPreprocessing():
  """Stem, stop-word-filter and normalise the text column of one city's table.

  Args:
    data: Mapping from city key to a table with a 'metadata' text column
      (indexed as ``data[city]['metadata']``).
    city: Key into ``data`` selecting the table to process.
  """

  def __init__(self, data, city):
    self.data = data
    self.city = city
    self.column = 'metadata'  # name of the text column that is processed

  def stemming(self):
    """Stem each document word by word, then remove stop words.

    Returns:
      list[str]: one cleaned string per entry of the 'metadata' column,
      in the column's order.
    """
    stemmed_documents = []
    corpus = self.data[self.city][self.column]

    # Stop-word remover from PySastrawi.
    stopword_factory = StopWordRemoverFactory()
    remover = stopword_factory.create_stop_word_remover()

    # Stemmer from PySastrawi.
    stemming_factory = StemmerFactory()
    stemmer = stemming_factory.create_stemmer()

    for doc in corpus:
      # Words are stemmed individually and re-joined before stop words are removed.
      stemmed_doc = [stemmer.stem(word) for word in doc.split()]
      cleaned_text = remover.remove(' '.join(stemmed_doc))
      stemmed_documents.append(cleaned_text)

    return stemmed_documents

  def tokenizer(self, stemmed_documents):
    """Normalise documents through a Keras ``Tokenizer`` round trip.

    The texts are encoded to integer sequences and decoded back to strings
    using a tokenizer configured with ``lower=True`` and the punctuation
    filter below. The resulting strings are stored in a new
    'metadata_tokenized' column of this city's table.

    Args:
      stemmed_documents: list of strings, e.g. the output of ``stemming``.

    Returns:
      tuple: ``(data, tokenized_strings)`` where ``data`` is the mapping given
      to the constructor (now holding the new column) and
      ``tokenized_strings`` is the list of normalised strings.
    """
    num_words = None
    oov_tok = "<OOV>"
    lower=True
    char_level = False
    filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n'

    # Define the tokenizer
    tokenizer = Tokenizer(num_words=num_words,
                          filters=filters,
                          oov_token=oov_tok,
                          lower=lower,
                          char_level=char_level)

    # Fit tokenizer on texts
    tokenizer.fit_on_texts(stemmed_documents)

    tokenized_texts = tokenizer.texts_to_sequences(stemmed_documents)

    tokenized_strings = tokenizer.sequences_to_texts(tokenized_texts)

    self.data[self.city]['metadata_tokenized'] = tokenized_strings

    # Return the mapping this instance was built with, not a module-level global.
    return self.data, tokenized_strings

# User Input

In [7]:
class User():
  """Holds one user's request (city, budget, duration, keywords)."""

  def __init__(self):
    self.city = None
    self.budget = None
    self.duration = None
    self.user_preferences = None
    self.combined_list = None

  def input(self, city, budget, duration, user_preferences_1, user_preferences_2=None, data=None):
    """Store the request and fetch the tokenized documents of the chosen city.

    Args:
      city: Key of the city table; stored as ``str(city)``.
      budget: Value convertible to ``float``.
      duration: Value convertible to ``int``.
      user_preferences_1: First preference keyword.
      user_preferences_2: Optional second preference keyword.
      data: Optional mapping from city key to a table with a
        'metadata_tokenized' column. When omitted, the module-level
        ``datasets`` mapping is used, as before.

    Returns:
      tuple: ``(combined_list, user_preferences, duration, budget)`` where
      ``combined_list`` is the 'metadata_tokenized' column as a list and
      ``user_preferences`` is a list of one or two keywords.
    """
    self.city = str(city)
    self.budget = float(budget)
    self.duration = int(duration)
    self.user_preferences = [user_preferences_1]

    if user_preferences_2 is not None:
      self.user_preferences.append(user_preferences_2)

    # Prefer an explicitly supplied mapping so callers need not rely on a global.
    source = datasets if data is None else data
    self.combined_list = source[city]['metadata_tokenized'].tolist()
    return self.combined_list, self.user_preferences, self.duration, self.budget

# Models

## Word2vec

In [9]:
class SumPreferences():
  """Derive an extra keyword from two user preferences with Word2Vec.

  The attributes ``vector_size``, ``window``, ``min_count`` and ``workers``
  are passed unchanged to ``gensim.models.Word2Vec``.
  """

  def __init__(self):
    self.user_preferences = None
    self.dataset = None
    self.vector_size = 100
    self.window = 500
    self.min_count = 2
    self.workers = 100

  def fit(self, dataset, user_preferences):
    """Train Word2Vec on ``dataset`` and append the word closest to the first two preferences.

    Args:
      dataset: Iterable of document strings used as the training corpus.
      user_preferences: List of preference keywords. When a keyword is
        derived it is appended to this list in place.

    Returns:
      The derived keyword, or ``None`` when fewer than two preferences were
      given or the model returned no similar word (the list is then left
      unchanged).
    """
    self.user_preferences = user_preferences
    self.dataset = dataset

    # With fewer than two keywords there is nothing to combine; previously this
    # path ended in an IndexError on user_preferences[2].
    if len(self.user_preferences) < 2:
      return None

    tokenized_data = [gensim.utils.simple_preprocess(sentence) for sentence in dataset]
    model = gensim.models.Word2Vec(tokenized_data, vector_size=self.vector_size, window=self.window, min_count=self.min_count, workers=self.workers)
    similar_words = model.wv.most_similar(positive=[self.user_preferences[0], self.user_preferences[1]])
    if not similar_words:
      return None

    # most_similar yields (word, score) pairs; keep the top-ranked word.
    derived_preference = similar_words[0][0]
    user_preferences.append(derived_preference)
    return derived_preference

## Recommender System

In [10]:
class RecommenderSystem:
  """Rank a city's documents against user keywords using TF-IDF and cosine distance."""

  def __init__(self, data, city, metadata_tokenized):
    """Keep the city table and the tokenized documents; nothing is fitted yet."""
    self.data = data
    # Despite its name, `self.city` holds the table stored under `city`, not the key.
    self.city = data[city]
    self.metadata_tokenized = metadata_tokenized
    self.vectorizer = self.documents = self.recommendation_result = None

  def fit(self):
    """Fit a TF-IDF vectorizer on the tokenized documents and keep the encoded matrix."""
    tfidf = TfidfVectorizer()
    self.vectorizer = tfidf
    self.documents = tfidf.fit_transform(self.metadata_tokenized)

  def recommend(self, user_preferences, top_recommend=10):
    """Collect, per keyword, the positions of the `top_recommend` closest documents.

    Returns the list stored in `self.list_appended`: one array of document
    positions per keyword, ordered by ascending cosine distance.
    """
    self.list_appended = []

    for keyword in user_preferences:
      query_vector = self.vectorizer.transform([keyword])
      ranking = cosine_distances(query_vector, self.documents).argsort()
      self.list_appended.append(ranking[0, :top_recommend])

    return self.list_appended

  def postprocessing(self):
    """Merge the per-keyword positions, drop repeats (first occurrence wins), and select those rows."""
    flat_positions = np.concatenate(self.list_appended)
    unique_positions = list(OrderedDict.fromkeys(flat_positions))
    self.recommendation_result = self.city.iloc[unique_positions]
    return self.recommendation_result

## Filtering

In [17]:
class Filtering:
    """Budget filter and nearest-neighbour route builder for a table of places.

    Args:
        data: DataFrame of candidate rows. ``budgeting`` reads its 'price'
            column; ``distance`` reads the 'lat' and 'long' columns.
    """

    def __init__(self, data):
        self.data = data
        self.recommendation_budget = None
        self.recommendation_distance = None

    def budgeting(self, budget):
        """Greedily keep rows, in table order, whose price still fits the remaining budget.

        A row that does not fit is skipped; later, cheaper rows may still be kept.

        Args:
            budget: Maximum allowed sum of the 'price' column.

        Returns:
            DataFrame of the kept rows (also stored in
            ``self.recommendation_budget``). When no row fits, an empty frame
            with the same columns as the input is returned.
        """
        self.total_price = 0
        self.selected_rows = []

        for index, row in self.data.iterrows():
            if self.total_price + row['price'] <= budget:
                self.total_price += row['price']
                self.selected_rows.append(row)

        if self.selected_rows:
            self.recommendation_budget = pd.DataFrame(self.selected_rows)
        else:
            # Keep the column layout so later steps can still select 'lat'/'long'.
            self.recommendation_budget = self.data.iloc[0:0]
        return self.recommendation_budget

    def haversine_distance(self, lat1, lon1, lat2, lon2):
        """Great-circle distance between two points given in degrees.

        Returns:
            The distance in the unit of ``R`` (6371, i.e. kilometres for the
            usual Earth-radius value).
        """
        R = 6371
        dlat = np.radians(lat2 - lat1)
        dlon = np.radians(lon2 - lon1)
        a = np.sin(dlat/2) * np.sin(dlat/2) + np.cos(np.radians(lat1)) * np.cos(np.radians(lat2)) * np.sin(dlon/2) * np.sin(dlon/2)
        c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1-a))
        distance = R * c
        return distance

    def nearest_neighbor(self, points, start_lat, start_lon):
        """Order ``points`` by repeatedly visiting the closest unvisited one.

        Args:
            points: Array-like of ``[lat, lon]`` rows.
            start_lat: Latitude of the starting location.
            start_lon: Longitude of the starting location.

        Returns:
            tuple: ``(route, total_dist)`` where ``route`` lists positions into
            ``points`` in visiting order and ``total_dist`` is the summed
            haversine distance. For empty ``points`` this is ``([], 0.0)``.
        """
        self.route = []
        self.total_distance = None

        # Nothing to visit: avoid indexing into an empty route below.
        if len(points) == 0:
            return [], 0.0

        # The start location is appended as the last point.
        start_point = np.array([[start_lat, start_lon]])
        points = np.concatenate((points, start_point), axis=0)

        n = len(points)
        visited = np.zeros(n, dtype=bool)
        visited[-1] = True
        route = [n - 1]
        total_dist = 0

        for i in range(n - 1):
            last_point = route[-1]
            min_dist = float('inf')
            nearest_point = None
            # `last_point` is always marked visited, so it is never a candidate.
            for j in range(n):
                if not visited[j]:
                    dist = self.haversine_distance(points[last_point][0], points[last_point][1], points[j][0], points[j][1])
                    if dist < min_dist:
                        min_dist = dist
                        nearest_point = j
            visited[nearest_point] = True
            route.append(nearest_point)
            total_dist += min_dist

        # Drop the start location (always the first entry) from the reported route.
        route.pop(0)
        # NOTE(review): the closing leg goes from the last place back to the first
        # place, not back to the start location; confirm this is intended.
        total_dist += self.haversine_distance(points[route[-1]][0], points[route[-1]][1], points[route[0]][0], points[route[0]][1])

        return route, total_dist

    def distance(self, start_lat, start_long):
        """Route the budget-filtered rows from a start location.

        Must be called after ``budgeting``.

        Returns:
            tuple: ``(route, total_distance, recommendation_distance)`` where
            ``recommendation_distance`` is the budget-filtered table reordered
            by ``route``.
        """
        self.start_lat = start_lat
        self.start_long = start_long
        self.coordinates = self.recommendation_budget[['lat', 'long']].values

        route, total_distance = self.nearest_neighbor(self.coordinates, self.start_lat, self.start_long)
        self.route = route
        self.total_distance = total_distance
        self.recommendation_distance = self.recommendation_budget.iloc[route]
        return self.route, self.total_distance, self.recommendation_distance

# Carbon Footprint

In [40]:
class CarbonFootprint():
    """Estimate CO2 emissions of a trip for a few vehicle types.

    Each vehicle method returns a fixed emission factor that `calculate`
    multiplies by the travelled distance (presumably kg CO2 per km, given the
    "kg CO2" label used in `calculate_all` -- TODO confirm the unit).
    """

    def __init__(self):
        self.co2_emissions = None

    def car(self):
        """Emission factor used for cars."""
        return 0.1639

    def bus(self):
        """Emission factor used for buses."""
        return 0.1022

    def motorbike(self):
        """Emission factor used for motorbikes."""
        return 0.1133

    def vehicle_type(self, vehicle_type):
        """Look up the emission factor for "car", "bus" or "motorbike".

        Returns None for any other value.
        """
        lookup = (
            ("car", self.car),
            ("bus", self.bus),
            ("motorbike", self.motorbike),
        )
        for label, factor_of in lookup:
            if vehicle_type == label:
                return factor_of()
        return None

    def calculate(self, vehicle_type, distance):
        """Return factor * distance rounded to one decimal; the unrounded value is kept in `co2_emissions`."""
        factor = self.vehicle_type(vehicle_type)
        self.co2_emissions = factor * distance
        return round(self.co2_emissions, 1)

    def calculate_all(self, distance):
        """Return the formatted emissions for car, bus and motorbike, in that order."""
        self.distance = distance
        return [
            f"{self.calculate(kind, self.distance)} kg CO2"
            for kind in ('car', 'bus', 'motorbike')
        ]

# Pipeline

In [41]:
def make_pipeline(datasets, city, budget, duration, user_preferences_1, user_preferences_2=None,
                  start_latitude=-3.801742, start_longitude=102.226509):
  """Run preprocessing, recommendation, budget/route filtering and the CO2 estimate.

  Args:
    datasets: Mapping from city key to that city's table.
    city: Key of the city to process.
    budget: Maximum total price of the selected places.
    duration: Trip duration; validated by ``User.input`` but not otherwise used here.
    user_preferences_1: First preference keyword.
    user_preferences_2: Optional second preference keyword.
    start_latitude: Latitude of the route's starting location.
    start_longitude: Longitude of the route's starting location.
      NOTE(review): the defaults are the coordinates that used to be
      hard-coded here; confirm they are the intended origin for every city.

  Returns:
    list[str]: the formatted emission estimates from ``CarbonFootprint.calculate_all``.
  """
  # Preprocessing: adds the 'metadata_tokenized' column to datasets[city] in place.
  # The mapping returned by tokenizer() is ignored so the caller's mapping stays in use.
  preprocess = TextPreprocessing(datasets, city)
  stemmed_documents = preprocess.stemming()
  _, tokenized_strings = preprocess.tokenizer(stemmed_documents)

  # User input
  user = User()
  combined, user_preferences, _, budget = user.input(city, budget, duration, user_preferences_1, user_preferences_2)

  # Preferences: fit() appends a derived keyword to `user_preferences` in place.
  # It needs two keywords, so it is skipped when only one was supplied.
  if len(user_preferences) > 1:
    SumPreferences().fit(combined, user_preferences)

  # Recommender system
  recommender = RecommenderSystem(datasets, city, tokenized_strings)
  recommender.fit()
  recommender.recommend(user_preferences)
  recommendation_result = recommender.postprocessing()

  # Filtering (named to avoid shadowing the built-in `filter`)
  place_filter = Filtering(recommendation_result)
  place_filter.budgeting(budget)
  _, total_distance, _ = place_filter.distance(start_latitude, start_longitude)

  # Carbon footprint
  footprint = CarbonFootprint()
  return footprint.calculate_all(total_distance)

# Run the whole pipeline for the "bandung" dataset and display the result.
results = make_pipeline(
    datasets,
    city="bandung",
    budget=100000,
    duration=1,
    user_preferences_1="wisata",
    user_preferences_2="bandung",
)
results

['132.6 kg CO2', '82.7 kg CO2', '91.6 kg CO2']