In [1]:
import pandas as pd

# Membaca file CSV
df_imdb = pd.read_csv('IMDB Dataset.csv')

# Membaca file Excel
df_indo = pd.read_excel('Indonlu_Sentiment.xlsx')

# Menampilkan kedua dataset
print("Dataset dari CSV (IMdb Review):")
print(df_imdb)

print("\nDataset dari Excel (Ulasan Produk):")
print(df_indo)


Dataset dari CSV (IMdb Review):
                                                  review sentiment
0      One of the other reviewers has mentioned that ...  positive
1      A wonderful little production. <br /><br />The...  positive
2      I thought this was a wonderful way to spend ti...  positive
3      Basically there's a family where a little boy ...  negative
4      Petter Mattei's "Love in the Time of Money" is...  positive
...                                                  ...       ...
49995  I thought this movie did a down right good job...  positive
49996  Bad plot, bad dialogue, bad acting, idiotic di...  negative
49997  I am a Catholic taught in parochial elementary...  negative
49998  I'm going to have to disagree with the previou...  negative
49999  No one expects the Star Trek movies to be high...  negative

[50000 rows x 2 columns]

Dataset dari Excel (Ulasan Produk):
                                                   Tweet     Label
0      warung ini dimiliki oleh pe

In [4]:
# Import untuk preprocessing
import re
import string
from nltk.corpus import stopwords
import nltk

nltk.download('stopwords')

# Fungsi membersihkan teks
def clean_text(text):
    text = text.lower()  # Konversi teks menjadi lowercase
    text = re.sub(r'\[.*?\]', '', text)  # Menghapus teks dalam tanda []
    text = re.sub(r'\w*\d\w*', '', text)  # Menghapus angka
    text = re.sub(r'[%s]' % re.escape(string.punctuation), '', text)  # Menghapus tanda baca
    text = re.sub(r'\n', '', text)  # Menghapus newline
    text = re.sub(r'\s+', ' ', text)  # Menghapus spasi ganda
    return text

# Fungsi menghapus stopwords
def remove_stopwords(text, language='english'):
    stop_words = set(stopwords.words(language))
    return ' '.join([word for word in text.split() if word not in stop_words])

# Load dataset
df_imdb = pd.read_csv('IMDB Dataset.csv')
df_indo = pd.read_excel('Indonlu_Sentiment.xlsx')

# Preprocessing dataset IMDB
df_imdb['cleaned_text'] = df_imdb['review'].apply(clean_text)
df_imdb['cleaned_text'] = df_imdb['cleaned_text'].apply(remove_stopwords, language='english')

# Preprocessing dataset bahasa Indonesia
df_indo['cleaned_text'] = df_indo['Tweet'].apply(clean_text)
df_indo['cleaned_text'] = df_indo['cleaned_text'].apply(remove_stopwords, language='indonesian')

# Menampilkan hasil pembersihan
print("Dataset IMDB setelah pembersihan:")
print(df_imdb[['review', 'cleaned_text']].head())

print("\nDataset Bahasa Indonesia setelah pembersihan:")
print(df_indo[['Tweet', 'cleaned_text']].head())


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Risma\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Dataset IMDB setelah pembersihan:
                                              review  \
0  One of the other reviewers has mentioned that ...   
1  A wonderful little production. <br /><br />The...   
2  I thought this was a wonderful way to spend ti...   
3  Basically there's a family where a little boy ...   
4  Petter Mattei's "Love in the Time of Money" is...   

                                        cleaned_text  
0  one reviewers mentioned watching oz episode yo...  
1  wonderful little production br br filming tech...  
2  thought wonderful way spend time hot summer we...  
3  basically theres family little boy jake thinks...  
4  petter matteis love time money visually stunni...  

Dataset Bahasa Indonesia setelah pembersihan:
                                               Tweet  \
0  warung ini dimiliki oleh pengusaha pabrik tahu...   
1  mohon ulama lurus dan k212 mmbri hujjah partai...   
2  lokasi strategis di jalan sumatera bandung . t...   
3  betapa bahagia nya diri i

In [28]:
# Import untuk vektorisasi
from sklearn.feature_extraction.text import TfidfVectorizer

# Vektorisasi dataset IMDB
vectorizer_imdb = TfidfVectorizer(max_features=1000, max_df=0.95, min_df=5)  # Membuat instance baru untuk dataset IMDB
X_imdb = vectorizer_imdb.fit_transform(df_imdb['cleaned_text'])  # Vektorisasi
terms_imdb = vectorizer_imdb.get_feature_names_out()  # Mendapatkan nama fitur

# Menampilkan beberapa hasil vektorisasi IMDB
print("Vektorisasi IMDB (beberapa baris):")
print(X_imdb[0:5, :].toarray())  # Menampilkan 5 baris pertama
print("Terms IMDB:")
print(terms_imdb[:10])  # Menampilkan 10 istilah pertama

# Vektorisasi dataset Bahasa Indonesia
vectorizer_indo = TfidfVectorizer(max_features=1000, max_df=0.95, min_df=5)  # Membuat instance baru untuk dataset Bahasa Indonesia
X_indo = vectorizer_indo.fit_transform(df_indo['cleaned_text'])  # Vektorisasi
terms_indo = vectorizer_indo.get_feature_names_out()  # Mendapatkan nama fitur

# Menampilkan beberapa hasil vektorisasi Bahasa Indonesia
print("\nVektorisasi Bahasa Indonesia (beberapa baris):")
print(X_indo[0:5, :].toarray())  # Menampilkan 5 baris pertama
print("Terms Bahasa Indonesia:")
print(terms_indo[:10])  # Menampilkan 10 istilah pertama


MemoryError: 

In [22]:
from sklearn.model_selection import train_test_split

# Membagi dataset menjadi training dan testing (80% training, 20% testing)
X_train_imdb, X_test_imdb, y_train_imdb, y_test_imdb = train_test_split(X_imdb, y_imdb, test_size=0.2, random_state=42)
X_train_indo, X_test_indo, y_train_indo, y_test_indo = train_test_split(X_indo, y_indo, test_size=0.2, random_state=42)

# Menampilkan bentuk data latih dan data uji untuk dataset IMDB
print(f'Train Shape IMDB: {X_train_imdb.shape}, Test Shape IMDB: {X_test_imdb.shape}')

# Menampilkan bentuk data latih dan data uji untuk dataset Bahasa Indonesia
print(f'Train Shape Indonesian: {X_train_indo.shape}, Test Shape Indonesian: {X_test_indo.shape}')


Train Shape IMDB: (40000, 4000), Test Shape IMDB: (10000, 4000)
Train Shape Indonesian: (10208, 4000), Test Shape Indonesian: (2552, 4000)


In [25]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report

# Inisialisasi model Logistic Regression
lr_model = LogisticRegression(max_iter=200)

# Melatih model dengan data latih IMDB
lr_model.fit(X_train_imdb, y_train_imdb)

# Memprediksi data uji IMDB
y_pred_lr_imdb = lr_model.predict(X_test_imdb)

# Menghitung akurasi IMDB
print("Akurasi Logistic Regression (IMDB):", accuracy_score(y_test_imdb, y_pred_lr_imdb))
print(classification_report(y_test_imdb, y_pred_lr_imdb))

# Melatih model dengan data latih Indo
lr_model.fit(X_train_indo, y_train_indo)

# Memprediksi data uji Indo
y_pred_lr_indo = lr_model.predict(X_test_indo)

# Menghitung akurasi Indo
print("Akurasi Logistic Regression (Indo):", accuracy_score(y_test_indo, y_pred_lr_indo))
print(classification_report(y_test_indo, y_pred_lr_indo))


MemoryError: Unable to allocate 1.18 GiB for an array with shape (39743, 4001) and data type float64