# **Melakukan scraping data pada Trip.com**

In [None]:
# Install library yang dibutuhkan
!pip install google-play-scraper
!pip install pandas numpy scikit-learn

In [None]:
# Import library
from google_play_scraper import Sort, reviews
import pandas as pd
import numpy as np

In [None]:
# Fetch review data for the Trip.com app from Google Play.
# The call returns the fetched reviews together with a continuation token.
review, continuation_token = reviews(
    app_id='ctrip.english',      # Trip.com application ID
    lang='id',                   # review language (Indonesian)
    country='id',                # country
    count=300,                   # number of reviews to fetch
    sort=Sort.MOST_RELEVANT,     # sort by relevance
)

In [None]:
# Put the scraped reviews into a DataFrame and display it
comments = pd.DataFrame(review)
comments

In [None]:
# Save the raw reviews to CSV (without the index column)
comments.to_csv('tripcom-review.csv', index=False)

# ***Exploratory Data Analysis* (EDA)**

In [None]:
# Import library
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [None]:
# Build the working frame from `comments` here so the EDA cells run on a fresh
# kernel (Restart & Run All). The "Cleaning Data" section further down
# re-derives `comments_cleaned` from `comments`, so columns added during EDA
# do not leak into the modelling data.
comments_cleaned = comments[['content', 'score']].copy()

# General information about the dataset
comments_cleaned.info()

# Count missing values per column (computed before the rows are dropped)
missing_counts = comments_cleaned.isnull().sum()

# The EDA cells below call len() / re.sub() on 'content', so drop incomplete rows
comments_cleaned = comments_cleaned.dropna(subset=['content', 'score'])

missing_counts

In [None]:
# Bar chart: number of reviews for each score value
fig, ax = plt.subplots()
sns.countplot(x='score', data=comments_cleaned, palette='viridis', ax=ax)
ax.set_title('Distribusi Skor Ulasan')
ax.set_xlabel('Skor Ulasan')
ax.set_ylabel('Jumlah Ulasan')
plt.show()

In [None]:
# Store the character count of every review in a new column
comments_cleaned['content_length'] = comments_cleaned['content'].map(len)

# Histogram (with KDE curve) of the review lengths
fig, ax = plt.subplots()
sns.histplot(comments_cleaned['content_length'], bins=30, kde=True, color='blue', ax=ax)
ax.set_title('Distribusi Panjang Ulasan')
ax.set_xlabel('Panjang (Jumlah Karakter)')
ax.set_ylabel('Frekuensi')
plt.show()


In [None]:
# Import library tambahan
import re
from wordcloud import WordCloud
from nltk.corpus import stopwords

# Text cleaning: keep only ASCII letters and whitespace, then lowercase
def clean_text(text):
    """Return `text` lowercased, with every non-letter, non-whitespace character removed."""
    letters_only = re.sub(r'[^A-Za-z\s]', '', text)
    return letters_only.lower()

# `stopwords.words('indonesian')` needs the NLTK stopword corpus on disk.
# Fetch it here so this cell also works on a fresh kernel, before the later
# cell that downloads the NLTK resources has run.
import nltk
nltk.download('stopwords', quiet=True)

# Apply the cleaning function to the 'content' column
comments_cleaned['cleaned_content'] = comments_cleaned['content'].apply(clean_text)

# Join all reviews into a single string
all_words = ' '.join(comments_cleaned['cleaned_content'])

# Build the word cloud, ignoring Indonesian stopwords
wordcloud = WordCloud(stopwords=set(stopwords.words('indonesian')),
                      background_color='white',
                      max_words=100,
                      width=800,
                      height=400).generate(all_words)

# Show the word cloud
plt.figure(figsize=(10, 5))
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
plt.title('Kata yang sering muncul')
plt.show()


In [None]:
# Label each review 'Puas' (score >= 4) or 'Tidak Puas' (otherwise)
is_satisfied = comments_cleaned['score'] >= 4
comments_cleaned['sentimen'] = np.where(is_satisfied, 'Puas', 'Tidak Puas')

# Bar chart of the two sentiment groups
fig, ax = plt.subplots()
sns.countplot(x='sentimen', data=comments_cleaned, palette='pastel', ax=ax)
ax.set_title('Distribusi Sentimen Ulasan (Berdasarkan Skor)')
ax.set_xlabel('Sentimen')
ax.set_ylabel('Jumlah Ulasan')
plt.show()

# **Cleaning Data**

In [None]:
import re

# Keep only the 'content' and 'score' columns. `.copy()` makes this an
# independent frame, so the column assignments below do not operate on a
# slice of `comments` (which triggers pandas' SettingWithCopyWarning).
comments_cleaned = comments[['content', 'score']].copy()

# Drop rows with a missing review text or score
comments_cleaned = comments_cleaned.dropna(subset=['content', 'score'])

# Remove URLs and e-mail addresses, then every character that is not a letter or whitespace
comments_cleaned['content'] = (
    comments_cleaned['content']
    .str.replace(r'http\S+|www\S+|\S+@\S+', '', regex=True)
    .str.replace(r'[^a-zA-Z\s]', '', regex=True)
)

# Cleaning leaves '' (not NaN) for reviews that contained no letters, so
# dropna() cannot remove them; filter on the stripped text instead.
has_text = comments_cleaned['content'].str.strip() != ''
comments_cleaned = comments_cleaned[has_text].copy()

# Show a few rows after cleaning
comments_cleaned.head()

# **Case Folding**

In [None]:
# Case folding: convert all review text to lowercase
comments_cleaned['content'] = comments_cleaned['content'].str.lower()

# Show a few rows after case folding
comments_cleaned.head()

# **Tokenisasi**

In [None]:
# Tokenization (split each review into a list of words).
# `word_tokenize` and the Punkt resources it relies on are only imported and
# downloaded in a later cell, so bring them in here to keep this cell runnable
# on a fresh kernel.
import nltk
from nltk.tokenize import word_tokenize

nltk.download('punkt', quiet=True)
nltk.download('punkt_tab', quiet=True)

comments_cleaned['tokens'] = comments_cleaned['content'].apply(word_tokenize)
comments_cleaned.head()

# **Penghapusan STOPWORD**

In [None]:
import nltk
from nltk.tokenize import word_tokenize  # Impor word_tokenize
from nltk.corpus import stopwords  # Impor stopwords
# Clear NLTK's resource cache, then download the resources used by the
# tokenization and stopword steps: 'punkt', 'punkt_tab' and 'stopwords'.
nltk.data.clear_cache()
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

In [None]:
# Stopword removal (drop common words).
# Load the Indonesian stopword list; fall back to an empty set on OSError.
try:
    stop_words = set(stopwords.words('indonesian'))
except OSError:
    print("Stopwords bahasa Indonesia tidak tersedia di NLTK. Gunakan pustaka Sastrawi atau masukkan stopwords secara manual.")
    stop_words = set()


def _without_stopwords(tokens):
    """Return the tokens whose lowercase form is not in `stop_words`."""
    return [token for token in tokens if token.lower() not in stop_words]


comments_cleaned['tokens_no_stopwords'] = comments_cleaned['tokens'].apply(_without_stopwords)

comments_cleaned.head()

# **STEMMING**

In [None]:
#Stemming data
!pip install Sastrawi
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory

# Inisialisasi Stemmer
factory = StemmerFactory()
stemmer = factory.create_stemmer()

comments_cleaned['stemmed_tokens'] = comments_cleaned['tokens_no_stopwords'].apply(lambda x: [stemmer.stem(word) for word in x])

comments_cleaned.head()

# **LABELING**

In [None]:
# Labelling
def label_based_on_score(score):
    """Map a review score to a binary label: 1 (satisfied) for score >= 4, 0 (not satisfied) for score <= 3.

    Any other value (one that satisfies neither comparison) yields None.
    """
    if score >= 4:
        return 1  # satisfied
    if score <= 3:
        return 0  # not satisfied
    return None

# Derive the binary sentiment label from the review score
comments_cleaned['sentiment_label'] = comments_cleaned['score'].apply(label_based_on_score)

# Disabled alternative (`label_sentiment` is not defined in the visible notebook):
# comments_cleaned['sentiment'] = comments_cleaned['content'].apply(label_sentiment)
comments_cleaned.head()

In [None]:
# Save the preprocessed reviews to CSV (without the index column)
comments_cleaned.to_csv('cleaned_comments_tripcom.csv', index=False)

# **VECTORISASI**

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer


# TF-IDF vectorizer (turns text into a numeric feature matrix).
# NOTE(review): the vectorizer is fitted on 'content'; the 'tokens_no_stopwords'
# and 'stemmed_tokens' columns built above are not used here - confirm this is intended.
tfidf_vectorizer = TfidfVectorizer()
X = tfidf_vectorizer.fit_transform(comments_cleaned['content'])  # Features (X)
y = comments_cleaned['sentiment_label']  # Labels (y)

# Features (unique terms) learned by TF-IDF
tfidf_vectorizer.get_feature_names_out()


In [None]:
# Convert the TF-IDF matrix to a dense array and wrap it in a DataFrame
# with one column per term, then preview the first 10 rows.
tfidf_matrix = X.toarray()
tfidf_df = pd.DataFrame(tfidf_matrix, columns=tfidf_vectorizer.get_feature_names_out())
tfidf_df.head(10)
# Optional export, currently disabled:
# tfidf_df.to_csv('tfidf_matrix.csv', index=False)

# **SPLITTING DATA**

In [None]:
# Scenario 1: training data (80%) and test data (20%)

from sklearn.model_selection import train_test_split

# Separate features and target.
# Note: X is reassigned here from the TF-IDF matrix to the text column.
X = comments_cleaned['content']  # Review text to analyse
y = comments_cleaned['sentiment_label']  # Sentiment label assigned earlier

# Split into training data (80%) and test data (20%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Show the shape of the training and test data
print("Data Latih:", X_train.shape)
print("Data Uji:", X_test.shape)

In [None]:
# Scenario 2: training data (60%) and test data (40%)
# NOTE(review): this cell reassigns X_train / X_test / y_train / y_test, so when
# the notebook is run top to bottom the model cells below use this 60/40 split,
# not the scenario 1 split.

from sklearn.model_selection import train_test_split

# Separate features and target
X = comments_cleaned['content']  # Review text to analyse
y = comments_cleaned['sentiment_label']  # Sentiment label assigned earlier

# Split into training data (60%) and test data (40%)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.4, random_state=42)

# Show the shape of the training and test data
print("Data Latih:", X_train.shape)
print("Data Uji:", X_test.shape)

# **UJI MODEL**

**Decision Tree**

In [None]:
#DecisionTree

import joblib
from sklearn.tree import DecisionTreeClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF vectorizer: fitted on the training split only; the test split is only transformed
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Decision Tree model
model_dt = DecisionTreeClassifier(random_state=42)
model_dt.fit(X_train_tfidf, y_train)

# Predict on the test split and evaluate
y_pred_dt = model_dt.predict(X_test_tfidf)
print("Decision Tree - Accuracy:", accuracy_score(y_test, y_pred_dt))
print("Decision Tree - Classification Report:\n", classification_report(y_test, y_pred_dt))

# Save the model and the TF-IDF vectorizer to files
joblib.dump(model_dt, 'decision_tree_model.pkl')  # Save the Decision Tree model
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')  # Save the vectorizer (same file name in every model cell)
print("Model Decision Tree dan TF-IDF berhasil disimpan!")


**Logistic**

In [None]:
#Logistic

import joblib
from sklearn.linear_model import LogisticRegression  # Import Logistic Regression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF vectorizer: fitted on the training split only; the test split is only transformed
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Logistic Regression model
model_lr = LogisticRegression()  # Uses Logistic Regression with default settings
model_lr.fit(X_train_tfidf, y_train)

# Predict on the test split and evaluate
y_pred_lr = model_lr.predict(X_test_tfidf)
print("Logistic Regression - Accuracy:", accuracy_score(y_test, y_pred_lr))
print("Logistic Regression - Classification Report:\n", classification_report(y_test, y_pred_lr))

# Save the model and the TF-IDF vectorizer to files
# NOTE(review): app.py saves its logistic model as 'logistic_regression_model.pkl';
# this cell uses 'logistic_model.pkl' - confirm which name is wanted.
joblib.dump(model_lr, 'logistic_model.pkl')  # Save the Logistic Regression model
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')  # Save the vectorizer (same file name in every model cell)
print("Model Logistic Regression dan TF-IDF berhasil disimpan!")

**RandomForest**

In [None]:
import joblib
from sklearn.ensemble import RandomForestClassifier  # Import RandomForestClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF vectorizer: fitted on the training split only; the test split is only transformed
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Random Forest model
model_rf = RandomForestClassifier(n_estimators=100, random_state=42)  # Uses Random Forest with 100 trees
model_rf.fit(X_train_tfidf, y_train)

# Predict on the test split and evaluate
y_pred_rf = model_rf.predict(X_test_tfidf)
print("Random Forest - Accuracy:", accuracy_score(y_test, y_pred_rf))
print("Random Forest - Classification Report:\n", classification_report(y_test, y_pred_rf))

# Save the model and the TF-IDF vectorizer to files
joblib.dump(model_rf, 'random_forest_model.pkl')  # Save the Random Forest model
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')  # Save the vectorizer (same file name in every model cell)
print("Model Random Forest dan TF-IDF berhasil disimpan!")

**Naive Bayes**

In [None]:
import joblib
from sklearn.naive_bayes import MultinomialNB  # Import Multinomial Naive Bayes
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, classification_report

# TF-IDF vectorizer: fitted on the training split only; the test split is only transformed
vectorizer = TfidfVectorizer(max_features=5000)
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Naive Bayes model
model_nb = MultinomialNB()  # Uses Multinomial Naive Bayes
model_nb.fit(X_train_tfidf, y_train)

# Predict on the test split and evaluate
y_pred_nb = model_nb.predict(X_test_tfidf)
print("Naive Bayes - Accuracy:", accuracy_score(y_test, y_pred_nb))
print("Naive Bayes - Classification Report:\n", classification_report(y_test, y_pred_nb))

# Save the model and the TF-IDF vectorizer to files
# ('naive_bayes_model.pkl' and 'tfidf_vectorizer.pkl' are the files app.py loads)
joblib.dump(model_nb, 'naive_bayes_model.pkl')  # Save the Naive Bayes model
joblib.dump(vectorizer, 'tfidf_vectorizer.pkl')  # Save the vectorizer (same file name in every model cell)
print("Model Naive Bayes dan TF-IDF berhasil disimpan!")


# **DEPLOYMENT TO STREAMLIT**

In [None]:
# Install Streamlit; the `streamlit` commands in the run cell at the end need it
!pip install streamlit

In [None]:
%%writefile app.py
import streamlit as st
import joblib
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import re
from sklearn.feature_extraction.text import TfidfVectorizer  # Gunakan TfidfVectorizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from google_play_scraper import reviews, Sort
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, classification_report
import nltk
# NLTK resources used at runtime: Punkt data for word_tokenize and the stopword
# corpus. 'punkt_tab' is fetched as well because word_tokenize in newer NLTK
# releases looks up that resource instead of 'punkt'.
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords')

# Load the trained model and its fitted TF-IDF vectorizer.
# If the files do not exist yet, keep the app running so the other tabs
# (including the Training tab, which writes these files) remain usable.
# NOTE: joblib.load unpickles the file contents; only load files you created.
try:
    model = joblib.load('naive_bayes_model.pkl')
    vectorizer = joblib.load('tfidf_vectorizer.pkl')
except FileNotFoundError:
    model = None
    vectorizer = None
    st.warning("Model belum tersedia. Latih model di tab Training terlebih dahulu.")

# Sentiment prediction
def predict_sentiment(text):
    """Predict the sentiment label of one review text.

    Args:
        text: Raw review text entered by the user.

    Returns:
        The label predicted by the loaded model for the text.

    Raises:
        RuntimeError: If the model or vectorizer file could not be loaded.
    """
    if model is None or vectorizer is None:
        raise RuntimeError("Model atau vektorisator belum dimuat.")
    # Apply the same cleaning that was applied to the training text, so the
    # input is tokenized consistently with the fitted vocabulary.
    text_tfidf = vectorizer.transform([clean_text(text)])
    return model.predict(text_tfidf)[0]

# Text cleaning function
# NOTE: `clean_text` is defined again further down in this file, and that later
# definition is the one in effect when the app runs. This body is kept
# equivalent to it (URL/e-mail removal included) so both copies agree.
def clean_text(text):
    """Return `text` without URLs/e-mail addresses and non-letter characters, lowercased."""
    text = re.sub(r'http\S+|www\S+|\S+@\S+', '', text)  # Remove URLs and e-mail addresses
    text = re.sub(r'[^A-Za-z\s]', '', text)  # Remove non-letter characters
    text = text.lower()  # Convert to lowercase
    return text

# **EDA function**
def perform_eda(data):
    """Render an exploratory analysis of a review table on the Streamlit page.

    Args:
        data: DataFrame with a 'content' column (review text) and a 'score'
            column. The frame is copied first, so the caller's object is
            never modified.

    Displays, in order: a data preview with row/column counts and the
    DataFrame.info() summary, the score distribution, the review-length
    distribution, a word cloud, and the 'Puas' / 'Tidak Puas' split
    (score >= 4 counts as 'Puas').
    """
    import io  # local import: only used to capture DataFrame.info() output

    # Work on a copy: callers pass a column slice, and adding columns to a
    # slice triggers pandas' SettingWithCopyWarning.
    data = data.copy()
    # Missing reviews would break len() and re.sub(); treat them as empty text.
    texts = data['content'].fillna('').astype(str)

    st.subheader('1. Informasi Data')
    st.write(data.head())
    st.write("Jumlah Data:", data.shape[0])
    st.write("Jumlah Kolom:", data.shape[1])
    # DataFrame.info() writes its report to a buffer and returns None,
    # so capture the text and display that instead of the return value.
    info_buffer = io.StringIO()
    data.info(buf=info_buffer)
    st.text(info_buffer.getvalue())

    st.subheader('2. Distribusi Skor Ulasan')
    fig, ax = plt.subplots()
    sns.countplot(x='score', data=data, palette='viridis', ax=ax)
    ax.set_title('Distribusi Skor Ulasan')
    ax.set_xlabel('Skor Ulasan')
    ax.set_ylabel('Jumlah Ulasan')
    st.pyplot(fig)
    plt.close(fig)  # release the figure; the script is re-run on every interaction

    st.subheader('3. Distribusi Panjang Ulasan')
    data['content_length'] = texts.str.len()
    fig, ax = plt.subplots()
    sns.histplot(data['content_length'], bins=30, kde=True, color='blue', ax=ax)
    ax.set_title('Distribusi Panjang Ulasan')
    ax.set_xlabel('Panjang Ulasan (Jumlah Karakter)')
    ax.set_ylabel('Frekuensi')
    st.pyplot(fig)
    plt.close(fig)

    st.subheader('4. WordCloud Kata Dominan')
    all_words = ' '.join(clean_text(text) for text in texts)
    if all_words.strip():
        wordcloud = WordCloud(stopwords=set(stopwords.words('indonesian')),
                              background_color='white',
                              max_words=100,
                              width=800,
                              height=400).generate(all_words)
        # Pass an explicit figure to st.pyplot instead of relying on the
        # global pyplot figure.
        fig, ax = plt.subplots(figsize=(10, 5))
        ax.imshow(wordcloud, interpolation='bilinear')
        ax.axis('off')
        st.pyplot(fig)
        plt.close(fig)
    else:
        # No text is left after cleaning, so there is nothing to draw.
        st.info("Tidak ada teks untuk membuat WordCloud.")

    st.subheader('5. Sentimen Berdasarkan Skor')
    data['sentimen'] = np.where(data['score'] >= 4, 'Puas', 'Tidak Puas')
    fig, ax = plt.subplots()
    sns.countplot(x='sentimen', data=data, palette='pastel', ax=ax)
    ax.set_title('Distribusi Sentimen (Berdasarkan Skor)')
    ax.set_xlabel('Sentimen')
    ax.set_ylabel('Jumlah Ulasan')
    st.pyplot(fig)
    plt.close(fig)

# DATA PREPROCESSING FUNCTIONS

# Initialise the Sastrawi stemmer (used by stem_tokens below)
factory = StemmerFactory()
stemmer = factory.create_stemmer()

# **Data cleaning function**
def clean_text(text):
    """Return `text` without URLs/e-mail addresses and non-letter characters, lowercased."""
    # First drop URLs and e-mail addresses, then everything that is not a letter or whitespace
    without_links = re.sub(r'http\S+|www\S+|\S+@\S+', '', text)
    letters_only = re.sub(r'[^A-Za-z\s]', '', without_links)
    # Case folding
    return letters_only.lower()

# **Tokenization function**
def tokenize_text(text):
    """Split `text` into a list of tokens using NLTK's `word_tokenize`."""
    return word_tokenize(text)

# **Stopword removal function**
# Cache of stopword sets keyed by language. This function is applied once per
# review, so the list is read from the NLTK corpus only on first use.
_STOPWORD_SETS = {}

def remove_stopwords(tokens):
    """Return the tokens whose lowercase form is not an Indonesian stopword.

    Args:
        tokens: Iterable of token strings.

    Returns:
        A new list with the stopwords removed; order is preserved.
    """
    if 'indonesian' not in _STOPWORD_SETS:
        _STOPWORD_SETS['indonesian'] = set(stopwords.words('indonesian'))
    stop_words = _STOPWORD_SETS['indonesian']
    return [word for word in tokens if word.lower() not in stop_words]

# **Stemming function**
def stem_tokens(tokens):
    """Return a list containing the stem of each token, in order."""
    return list(map(stemmer.stem, tokens))

# **Sentiment labelling function**
def label_based_on_score(score):
    """Map a review score to a binary label: 1 (satisfied) for score >= 4, 0 (not satisfied) for score <= 3.

    Any other value (one that satisfies neither comparison) yields None.
    """
    if score >= 4:
        return 1  # satisfied
    if score <= 3:
        return 0  # not satisfied
    return None

# **Preprocessing function**
def preprocess_data(data):
    """Run the full preprocessing pipeline on a review table.

    Steps: keep 'content' and 'score', drop rows missing either, clean the
    text, tokenize, remove stopwords, stem, label by score, and compute a
    TF-IDF matrix from the cleaned text.

    Args:
        data: DataFrame that must contain 'content' and 'score' columns.

    Returns:
        A pair (comments_cleaned, tfidf_df): the preprocessed DataFrame and a
        DataFrame holding the dense TF-IDF matrix (one column per term).
        When a required column is missing, an error is shown through
        st.error and (None, None) is returned.
    """
    if 'content' not in data.columns or 'score' not in data.columns:
        st.error("File CSV harus memiliki kolom 'content' dan 'score'.")
        # Return a pair so callers that unpack two values do not crash.
        return None, None

    # Keep only the needed columns and drop incomplete rows
    comments_cleaned = data[['content', 'score']].dropna(subset=['content', 'score']).copy()

    # Text cleaning (URLs, special characters); astype(str) guards against
    # non-string cells, which re.sub() inside clean_text cannot handle.
    comments_cleaned['content'] = comments_cleaned['content'].astype(str).apply(clean_text)

    # Tokenization
    comments_cleaned['tokens'] = comments_cleaned['content'].apply(tokenize_text)

    # Stopword removal
    comments_cleaned['tokens_no_stopwords'] = comments_cleaned['tokens'].apply(remove_stopwords)

    # Stemming
    comments_cleaned['stemmed_tokens'] = comments_cleaned['tokens_no_stopwords'].apply(stem_tokens)

    # Label from the score
    comments_cleaned['sentiment_label'] = comments_cleaned['score'].apply(label_based_on_score)

    # TF-IDF vectorization of the cleaned text, returned as a dense DataFrame
    tfidf_vectorizer = TfidfVectorizer()
    tfidf_features = tfidf_vectorizer.fit_transform(comments_cleaned['content'])
    tfidf_df = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out())

    return comments_cleaned, tfidf_df

# **Split the data, train the models and evaluate them**
def train_and_evaluate_model(data):
    """Train and evaluate three classifiers on preprocessed review data.

    The data is split 80/20 (random_state=42), a TF-IDF vectorizer
    (max_features=5000) is fitted on the training text, and Naive Bayes,
    Logistic Regression and Random Forest models are trained on it. Each
    model and the vectorizer are saved with joblib.

    Args:
        data: DataFrame with a 'content' column (cleaned text) and a
            'sentiment_label' column.

    Returns:
        Dict mapping the model name to {'accuracy': float, 'report': str}.
    """
    # Reviews that became '' during cleaning are read back from CSV as NaN,
    # which TfidfVectorizer rejects; drop rows without text or label.
    data = data.dropna(subset=['content', 'sentiment_label'])
    X = data['content'].astype(str)
    y = data['sentiment_label']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    # Fit the vocabulary on the training split only; the test split is only transformed
    tfidf_vectorizer = TfidfVectorizer(max_features=5000)
    X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)
    X_test_tfidf = tfidf_vectorizer.transform(X_test)

    # (display name, estimator, output file) for every model to train
    candidates = [
        ('Naive Bayes', MultinomialNB(), 'naive_bayes_model.pkl'),
        ('Logistic Regression', LogisticRegression(max_iter=1000), 'logistic_regression_model.pkl'),
        ('Random Forest', RandomForestClassifier(n_estimators=100, random_state=42), 'random_forest_model.pkl'),
    ]

    results = {}
    for name, estimator, filename in candidates:
        estimator.fit(X_train_tfidf, y_train)
        predictions = estimator.predict(X_test_tfidf)
        results[name] = {
            'accuracy': accuracy_score(y_test, predictions),
            'report': classification_report(y_test, predictions),
        }
        joblib.dump(estimator, filename)

    # Save the vectorizer that matches the models saved above
    joblib.dump(tfidf_vectorizer, 'tfidf_vectorizer.pkl')

    return results
#__________________________________________________________________________________________________________________________________________________________________________________________________

# **Streamlit App Layout**
# Page title and a short usage hint shown above the tabs
st.title('Sentiment Analysis & EDA')
st.write("Masukkan ulasan untuk menganalisis sentimen atau unggah file CSV untuk melakukan EDA.")

# **Tab navigation**
# Five tabs; each `with tabN:` block below fills one of them
tab1, tab2, tab3, tab4, tab5 = st.tabs(["📊 EDA", "📈 Prediksi Sentimen", "Scapping", "Preprocessing Data", "Training"])

# **TAB 1: EDA**
with tab1:
    st.header('📊 Eksplorasi Data Ulasan (EDA)')
    uploaded_file = st.file_uploader("Unggah file CSV untuk EDA", type=["csv"])

    if uploaded_file is None:
        st.write("Unggah file CSV untuk memulai EDA.")
    else:
        # Read the uploaded CSV
        data = pd.read_csv(uploaded_file)

        # Run the EDA on the 'content' and 'score' columns only, if both exist
        has_required_columns = {'content', 'score'}.issubset(data.columns)
        if has_required_columns:
            perform_eda(data[['content', 'score']])
        else:
            st.error("File CSV harus memiliki kolom 'content' dan 'score'.")

# **TAB 2: Sentiment prediction**
with tab2:
    st.header('📈 Prediksi Sentimen')

    # Free-text input from the user
    user_input = st.text_area("Masukkan teks ulasan")

    # On button click: warn about empty input, otherwise show the prediction
    if st.button('Prediksi Sentimen'):
        if not user_input:
            st.warning("Tolong masukkan teks untuk analisis.")
        elif predict_sentiment(user_input) == 1:
            st.success("Sentimen: **Puas** 😄")
        else:
            st.error("Sentimen: **Tidak Puas** 😞")

#**TAB 3: Scraping**
with tab3:
    st.header("Scraping Ulasan Google Play Store")
    app_id = st.text_input("Masukkan ID Aplikasi Google Play", 'ctrip.english')
    jumlah_ulasan = st.number_input("Jumlah ulasan yang ingin diambil", min_value=10, max_value=1000, value=300, step=10)

    if st.button('Scrape Data Ulasan'):
        if app_id:
            with st.spinner('Mengambil ulasan dari Google Play Store...'):
                try:
                    review, _ = reviews(
                        app_id=app_id,
                        lang='id',
                        country='id',
                        count=int(jumlah_ulasan),  # make sure the count is an integer
                        sort=Sort.MOST_RELEVANT  # use the Sort enum, not a string
                    )
                except Exception as e:
                    st.error(f"Terjadi kesalahan saat scraping: {e}")
                else:
                    if isinstance(review, str):  # a string result is treated as an error message
                        st.error(f"Terjadi kesalahan saat scraping: {review}")
                    elif not review:
                        st.warning("Tidak ada ulasan yang ditemukan untuk aplikasi ini.")
                    else:
                        # Keep the result in session state so it survives reruns
                        st.session_state['scraped_data'] = pd.DataFrame(review)
                        st.success("Berhasil mengambil ulasan dari Google Play Store!")
        else:
            st.error("Masukkan ID aplikasi Google Play.")

    # Render from session state rather than inside the button branch: the
    # button is True only for the run right after the click, so the table and
    # the download button would otherwise disappear on the next interaction.
    if 'scraped_data' in st.session_state:
        data = st.session_state['scraped_data']

        # Show the first 10 rows
        st.write("📋 **Tampilan Data**")
        st.dataframe(data.head(10))

        # Offer the data as a CSV download
        st.download_button(
            label="📁 Unduh Data Ulasan Sebagai CSV",
            data=data.to_csv(index=False),
            file_name='ulasan_google_play.csv',
            mime='text/csv',
        )

#**TAB 4: Preprocessing Data**
with tab4:
    st.header("Preprocessing Data")
    uploaded_file = st.file_uploader("Unggah file CSV untuk Preprocessing", type=["csv"])

    if uploaded_file is not None:
        # Read the uploaded CSV
        data = pd.read_csv(uploaded_file)

        # Preview of the raw data
        st.write("📋 **Preview Data Sebelum Preprocessing**")
        st.dataframe(data.head(10))

        if st.button('Mulai Preprocessing'):
            with st.spinner('Proses preprocessing sedang berjalan...'):
                result = preprocess_data(data)
            # preprocess_data may return None instead of a pair when the
            # required columns are missing; do not unpack blindly.
            preprocessed_data, tfidf_df = result if result is not None else (None, None)
            if preprocessed_data is not None:
                # Store the results so the section below can render them on
                # every rerun, not only in the run right after the click.
                st.session_state['preprocessed_data'] = preprocessed_data
                st.session_state['tfidf_df'] = tfidf_df
                st.success('Preprocessing selesai!')
    else:
        st.info("Silakan unggah file CSV dengan kolom 'content' dan 'score'.")

    st.header('📈 Hasil Preprocessing')

    if 'preprocessed_data' in st.session_state:
        st.write("📋 **Data Setelah Preprocessing**")
        preprocessed_data = st.session_state['preprocessed_data']
        st.dataframe(preprocessed_data.head(10))

        # Download of the preprocessed data
        st.download_button(
            label="📁 Unduh Data Preprocessed Sebagai CSV",
            data=preprocessed_data.to_csv(index=False),
            file_name='preprocessed_comments.csv',
            mime='text/csv',
        )

        tfidf_df = st.session_state.get('tfidf_df')
        if tfidf_df is not None:
            st.write("📋 **Matriks TF-IDF (10 Baris Pertama)**")
            st.dataframe(tfidf_df.head(10))

            # Download of the TF-IDF matrix
            st.download_button(
                label="📁 Unduh Matriks TF-IDF Sebagai CSV",
                data=tfidf_df.to_csv(index=False),
                file_name='tfidf_matrix.csv',
                mime='text/csv',
            )
    else:
        st.info("Silakan jalankan preprocessing di tab sebelumnya.")

#**TAB 5: Training Model**
with tab5:
    st.header("Training Model")
    uploaded_file = st.file_uploader("Unggah file CSV hasil preprocessing", type=["csv"])
    if uploaded_file is not None:
        data = pd.read_csv(uploaded_file)
        st.dataframe(data.head())
        train_clicked = st.button('Latih dan Uji Model')
        if train_clicked:
            # Train all models, then show accuracy and report for each one
            results = train_and_evaluate_model(data)
            for model_name in results:
                result = results[model_name]
                st.subheader(model_name)
                st.write(f'**Akurasi:** {result["accuracy"]:.2f}')
                st.text(result['report'])

Overwriting app.py


# **RUN STREAMLIT**

In [1]:
# Clear Streamlit's cache, then start app.py in the background and run
# localtunnel against port 8501.
# The `streamlit` command must be installed first (see the install cell above);
# the recorded output below shows "command not found" when it is not.
!streamlit cache clear
!streamlit run app.py & npx localtunnel --port 8501

/bin/bash: line 1: streamlit: command not found
/bin/bash: line 1: streamlit: command not found
[1G[0K⠙[1G[0K⠹[1G[0K⠸[1G[0K⠼[1G[0K⠴[1G[0K⠦[1G[0K⠧[1G[0K⠇[1G[0K⠏[1G[0K[1G[0JNeed to install the following packages:
localtunnel@2.0.2
Ok to proceed? (y) [20G^C
