In [None]:
!pip install pandas scikit-learn tensorflow numpy matplotlib seaborn pydrive Sastrawi simpletransformers

In [None]:
import nltk
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from simpletransformers.language_representation import RepresentationModel
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, GlobalAveragePooling1D

In [None]:
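# Optional (Google Colab only): uncomment the two lines below to mount Google Drive,
# e.g. if the CSV files read by this notebook are stored there.
#from google.colab import drive
#drive.mount('/content/gdrive')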

In [None]:
# File name of the bullying dataset (CSV, resolved relative to the working directory)
nama_file = 'dataset_pembullyan.csv'

# Load the CSV into a DataFrame
dataset = pd.read_csv(filepath_or_buffer=nama_file)

In [None]:
# Dataset dimensions as (number of rows, number of columns)
(len(dataset.index), len(dataset.columns))

In [None]:
# Preview the first ten rows
dataset.iloc[:10]

In [None]:
# Split the dataset into one sub-DataFrame per distinct value of the 'class' column
grouped_datasets = {label: frame for label, frame in dataset.groupby('class')}

# Print every sub-dataset under a heading naming its class
for class_value, subdataset in grouped_datasets.items():
    print(f"\nClass: {class_value}", subdataset, sep="\n")

In [None]:
# Bar chart of the number of rows per value of the 'class' column
plt.figure(figsize=(10, 6))
# The 'pastel' palette gives a softer look; labels are set on the Axes that countplot returns
sns.countplot(x='class', data=dataset, palette='pastel').set(
    title='Banyak Data untuk Setiap Type Kekerasan',
    xlabel='Tingkat Kekerasan',
    ylabel='Jumlah',
)
plt.show()

In [None]:
# --- Load data ---------------------------------------------------------------
# Labelled sentences; the columns used below are 'text' (input) and 'class' (label).
kata_kata_file_path = 'dataset_pembullyan.csv'
kata_kata_dataset = pd.read_csv(kata_kata_file_path)

# User sentences that a later cell classifies with the trained model.
user_file_path = 'dataset_user.csv'
user_dataset = pd.read_csv(user_file_path)

# --- Train / test split --------------------------------------------------------
X_train, X_test, y_train, y_test = train_test_split(
    kata_kata_dataset['text'], kata_kata_dataset['class'], test_size=0.2, random_state=42
)

# --- Features: TF-IDF ----------------------------------------------------------
# The vocabulary is learned from the training split only, then reused for the test split.
vectorizer = TfidfVectorizer()
X_train_tfidf = vectorizer.fit_transform(X_train)
X_test_tfidf = vectorizer.transform(X_test)

# Keras needs dense arrays here; convert once and reuse the result.
X_train_dense = X_train_tfidf.toarray()
X_test_dense = X_test_tfidf.toarray()

# --- Labels --------------------------------------------------------------------
# Fit the encoder on the full label column so that a class which happens to occur
# only in the test split does not make `transform` raise on an unseen label.
label_encoder = LabelEncoder()
label_encoder.fit(kata_kata_dataset['class'])
y_train_encoded = label_encoder.transform(y_train)
y_test_encoded = label_encoder.transform(y_test)
num_classes = len(label_encoder.classes_)

# --- Model ---------------------------------------------------------------------
# The input is a vector of TF-IDF weights (floats), not a sequence of integer token
# ids, so it must not go through an Embedding layer: Embedding casts its input to
# integers, which turns almost every TF-IDF weight into index 0 and discards the
# features. A Dense layer applied to the TF-IDF vector is the appropriate first
# layer (it computes a TF-IDF-weighted combination of one learned 16-d vector per term).
# The input width is inferred by Keras on the first call to `fit`.
model = Sequential()
model.add(Dense(16, activation='relu'))
model.add(Dense(16, activation='relu'))
model.add(Dense(num_classes, activation='softmax'))  # one output unit per encoded class

# Labels are integer-encoded, hence the sparse variant of categorical cross-entropy.
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# --- Training ------------------------------------------------------------------
# NOTE(review): 1000 epochs with no early stopping is kept from the original; the test
# split is used only to monitor validation metrics here. Consider a separate validation
# split plus early stopping if training time or overfitting becomes a problem.
EPOCHS = 1000
model.fit(X_train_dense, y_train_encoded, epochs=EPOCHS, validation_data=(X_test_dense, y_test_encoded))

# --- Evaluation ----------------------------------------------------------------
# Predicted class = index of the highest softmax probability.
y_pred_probs = model.predict(X_test_dense)
y_pred = y_pred_probs.argmax(axis=1)

accuracy = accuracy_score(y_test_encoded, y_pred)
# Passing `labels` keeps `target_names` aligned even if some class is absent from
# both the test labels and the predictions.
classification_report_result = classification_report(
    y_test_encoded,
    y_pred,
    labels=list(range(num_classes)),
    target_names=label_encoder.classes_,
)

print(f'Accuracy: {accuracy}')
print('Classification Report:')
print(classification_report_result)


In [None]:
# Classify the user sentences with the trained model.
# The sentences are vectorised with the already-fitted TF-IDF vectorizer.
user_tfidf = vectorizer.transform(user_dataset['kalimat'])

# Per-class probabilities, then the index of the most probable class for each row
user_predictions_probs = model.predict(user_tfidf.toarray())
user_predictions = user_predictions_probs.argmax(axis=-1)

# Map the class indices back to the original labels and store them as a new column
user_dataset['Predicted_Class'] = label_encoder.inverse_transform(user_predictions)

print(
    '\nHasil untuk Kalimat User:',
    user_dataset.loc[:, ['kalimat', 'Predicted_Class']],
    sep='\n',
)