In [9]:
!pip install -q contractions scikit-learn Sastrawi googletrans==4.0.0-rc1 langdetect gdown

import joblib
from google.colab import files
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from bs4 import BeautifulSoup
import nltk
import re
import gdown
import unicodedata
from googletrans import Translator
import contractions
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory

# Fetch the NLTK resources requested by this notebook.
for _nltk_resource in ('stopwords', 'punkt'):
    nltk.download(_nltk_resource)

# Google Drive file ids of the serialized TF-IDF vectorizer and the
# random-forest classifier, and the direct-download URLs built from them.
file_id_tfidf = '134JrTPXdmm6lXZH84ZGk-xsQsEyvvnrL'
file_id_rf = '1zhdKOAbGP_wsQRRrhbbxRjQ6Wep_3LKu'
url_tfidf = f'https://drive.google.com/uc?id={file_id_tfidf}'
url_rf = f'https://drive.google.com/uc?id={file_id_rf}'

# Local filenames the downloads are written to; joblib.load reads these
# same names later in the notebook.
output_tfidf = 'tfidf_vectorizer.joblib'
output_rf = 'random_forest_model.joblib'

# Download both artifacts. The second call stays the cell's final expression
# so its return value is still echoed as the cell output.
gdown.download(url_tfidf, output_tfidf, quiet=False)
gdown.download(url_rf, output_rf, quiet=False)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
Downloading...
From: https://drive.google.com/uc?id=134JrTPXdmm6lXZH84ZGk-xsQsEyvvnrL
To: /content/tfidf_vectorizer.joblib
100%|██████████| 181k/181k [00:00<00:00, 13.0MB/s]
Downloading...
From: https://drive.google.com/uc?id=1zhdKOAbGP_wsQRRrhbbxRjQ6Wep_3LKu
To: /content/random_forest_model.joblib
100%|██████████| 7.88M/7.88M [00:00<00:00, 95.3MB/s]


'random_forest_model.joblib'

**Input Data**

In [22]:
# Input text fed to the preprocessing / prediction cell below.
# NOTE(review): the string starts with "ormer" -- presumably a truncated
# "Former"; confirm against the original source of the text.
new_text = "ormer FTX executive (yes, FTX, Mas SBF), launched a new crypto currency exchange called Backpack Exchange.This exchange aims to avoid mistakes that cause the fall of FTX by using an independent custody wallet that gives users full control of their funds.Backpack Exchange is looking for an investment of $ 100 million with 10% shares"

In [23]:
# Text preprocessing helpers
def strip_html_tags(text):
    """Return ``text`` with HTML markup removed.

    ``<iframe>`` and ``<script>`` elements are detached from the parsed tree
    before the text is extracted, so their contents do not appear in the
    result. Every run of carriage returns and/or line feeds in the extracted
    text is collapsed into a single newline.

    Args:
        text: String that may contain HTML markup.

    Returns:
        The text content of ``text`` with line breaks normalized.
    """
    soup = BeautifulSoup(text, "html.parser")
    # Plain loop instead of a throwaway list comprehension: extract() is
    # called only for its side effect of detaching the element.
    for element in soup(['iframe', 'script']):
        element.extract()
    stripped_text = soup.get_text()
    # The earlier pattern r'[\r|\n|\r\n]+' was a character class, so the '|'
    # inside it was a literal pipe and every '|' in the text was rewritten to
    # a newline. Only CR and LF belong in the class.
    return re.sub(r'[\r\n]+', '\n', stripped_text)

def remove_accented_chars(text):
    """Fold ``text`` down to ASCII.

    The string is NFKD-normalized, encoded to ASCII with unencodable
    characters ignored, and decoded back to ``str``.

    Args:
        text: Input string.

    Returns:
        ``text`` containing only the characters that survived ASCII encoding.
    """
    decomposed = unicodedata.normalize('NFKD', text)
    ascii_bytes = decomposed.encode('ascii', 'ignore')
    return ascii_bytes.decode('utf-8', 'ignore')

def pre_process_text(text, language):
    """Normalize raw text before it is vectorized.

    Steps, in order: lowercase; strip HTML via ``strip_html_tags``; replace
    newlines, tabs and carriage returns with spaces; ASCII-fold via
    ``remove_accented_chars``; run ``contractions.fix``; delete every
    character that is not an ASCII letter, digit or whitespace; collapse runs
    of spaces into one. When ``language`` is ``'indonesian'`` the result is
    additionally passed through ``preprocess_text_sastrawi``.

    Args:
        text: Raw input string.
        language: Language tag; only the value ``'indonesian'`` triggers the
            extra Sastrawi step, any other value skips it.

    Returns:
        The normalized string.
    """
    text = text.lower()
    text = strip_html_tags(text)
    # maketrans is a static method; call it on the type rather than the instance.
    text = text.translate(str.maketrans("\n\t\r", "   "))
    text = remove_accented_chars(text)
    text = contractions.fix(text)
    # The 4th positional parameter of re.sub is `count`, not `flags`. Passing
    # re.I | re.A positionally capped the substitution at 258 replacements and
    # applied no flags at all, so long inputs kept their punctuation after the
    # 258th match. Flags must be given by keyword.
    text = re.sub(r'[^a-zA-Z0-9\s]', '', text, flags=re.I | re.A)
    text = re.sub(' +', ' ', text)
    if language == 'indonesian':
        text = preprocess_text_sastrawi(text)
    return text

# Text preprocessing specific to Indonesian
def preprocess_text_sastrawi(text):
    """Apply Sastrawi stop-word removal and stemming to ``text``.

    The text is tokenized with ``nltk.word_tokenize``. Each token is passed
    through the Sastrawi stop-word remover; tokens that come back as an empty
    string are discarded and the remaining ones are stemmed. A new remover
    and stemmer are constructed on every call.

    Args:
        text: String to process.

    Returns:
        The stemmed tokens joined by single spaces.
    """
    stopword_remover = StopWordRemoverFactory().create_stop_word_remover()
    stemmer = StemmerFactory().create_stemmer()

    stemmed_words = []
    for word in nltk.word_tokenize(text):
        kept = stopword_remover.remove(word)
        if kept == '':
            # The remover returned nothing for this token; skip it.
            continue
        stemmed_words.append(stemmer.stem(kept))
    return " ".join(stemmed_words)

# Load the models from the files downloaded earlier in the notebook.
# NOTE(review): joblib.load unpickles its input, which can execute arbitrary
# code -- only load artifacts from a trusted source.
tfidf_vectorizer = joblib.load(output_tfidf)
rf_classifier = joblib.load(output_rf)

# Preprocess the new text (with the extra 'indonesian' Sastrawi step enabled)
preprocessed_text = pre_process_text(new_text, 'indonesian')

# Convert the preprocessed text to TF-IDF features using the loaded tfidf_vectorizer
new_text_tfidf = tfidf_vectorizer.transform([preprocessed_text])

# Predict the label for the new text using the loaded rf_classifier
# NOTE(review): predicted_label is not read again in this cell; the reported
# sentiment below comes from the translated text instead.
predicted_label = rf_classifier.predict(new_text_tfidf)

# Translate the original (un-preprocessed) text to English.
translator = Translator()
translated_text = translator.translate(new_text, dest='en').text

# Convert the translated text to TF-IDF features using the loaded tfidf_vectorizer
# NOTE(review): translated_text is vectorized as-is, without going through
# pre_process_text -- confirm this matches how the vectorizer was fitted.
translated_text_tfidf = tfidf_vectorizer.transform([translated_text])

# Compute the predicted label and class probability for the translated text.
# predicted_sentiment is not read again in this cell.
predicted_sentiment = rf_classifier.predict(translated_text_tfidf)
# Row 0 is the single input document; column 1 is the second class.
# NOTE(review): presumably the second class is the positive label -- verify
# against rf_classifier.classes_.
sentiment_probability = rf_classifier.predict_proba(translated_text_tfidf)[0, 1]

threshold = 0.5  # Threshold can be tuned as needed
sentiment = "Positive" if sentiment_probability > threshold else "Negative"

# Print the prediction results
print("\nText:", translated_text)
print("Sentiment Probability:", sentiment_probability)
print("Sentiment:", sentiment)


Text: ormer FTX executive (yes, FTX, Mas SBF), launched a new crypto currency exchange called Backpack Exchange.This exchange aims to avoid mistakes that cause the fall of FTX by using an independent custody wallet that gives users full control of their funds.Backpack Exchange is looking for an investment of $ 100 million with 10% shares
Sentiment Probability: 0.7
Sentiment: Positive
