In [None]:
#attention LSTM+word2vec
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_self_attention import SeqSelfAttention
from gensim.models import Word2Vec


_STOP_WORD_CACHE = {}


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, reading the corpus only on first use."""
    if 'turkish' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['turkish'] = set(stopwords.words('turkish'))
    return _STOP_WORD_CACHE['turkish']


def process_msg(text):
    """Clean one raw message and return its kept tokens joined by single spaces.

    Steps: remove @mentions, http(s) URLs and '#' signs; replace punctuation
    with spaces; lowercase and strip; tokenize with NLTK; keep only tokens that
    are purely alphabetic and are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message, or an empty string when no token survives.
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    # '.' is part of the filter set so that "a.b" becomes two words instead of
    # one non-alphabetic token that the isalpha() check would discard.
    filters = '@!"\'#$%&()*+,-./:;<=>[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans({c: " " for c in filters}))
    # str.lower() turns 'İ' (U+0130) into 'i' followed by the combining dot
    # U+0307; that mark is not alphabetic, so every word containing 'İ' would
    # be dropped by isalpha(). Map it to a plain 'i' before lowercasing.
    text = text.replace('İ', 'i').lower().strip()
    stop_words = _turkish_stop_words()
    return ' '.join(
        word for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    )


# Read the labelled messages, drop rows with any missing value and clean the text column.
xt = pd.read_excel('/content/drive/My Drive/TCK_hakaret_tespiti/half29072024_teknofest.xlsx').dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

X, y = xt['mesaj'], xt['label']

# Integer-encode the class labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train Word2Vec (sg=1) on the whitespace-tokenised cleaned messages.
X_processed = list(map(str.split, X))
word2vec_model = Word2Vec(
    sentences=X_processed,
    vector_size=128,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

max_len = 100

# Row 0 stays all-zero; the vector of vocabulary entry k is stored in row k + 1.
embedding_matrix = np.zeros((len(word2vec_model.wv.key_to_index) + 1, 128))
for row, word in enumerate(word2vec_model.wv.index_to_key, start=1):
    embedding_matrix[row] = word2vec_model.wv[word]


def get_word2vec_sequences(texts):
    """Turn cleaned messages into padded sequences of vocabulary indices.

    Every whitespace-separated word found in the Word2Vec vocabulary is
    replaced by its vocabulary index plus one; words outside the vocabulary
    are skipped. The index lists are then handed to pad_sequences with
    maxlen=max_len.

    Args:
        texts: Iterable of cleaned message strings.

    Returns:
        Whatever pad_sequences returns for the index lists.
    """
    vocab = word2vec_model.wv.key_to_index
    sequences = [
        [vocab[token] + 1 for token in text.split() if token in vocab]
        for text in texts
    ]
    return pad_sequences(sequences, maxlen=max_len)

# Encode every cleaned message as a padded index sequence; this is the array
# that is split into folds and fed to the model below.
X_pad = get_word2vec_sequences(X)


def create_model():
    """Build and compile the stacked-LSTM classifier with self-attention.

    The embedding layer is initialised from embedding_matrix and frozen
    (trainable=False). It is followed by LSTM(128), SeqSelfAttention, LSTM(64)
    and LSTM(64), with Dropout(0.5) between stages, and a softmax layer with
    one unit per distinct value in y_encoded.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        'adam' optimizer, accuracy metric).
    """
    vocab_size = len(word2vec_model.wv.key_to_index) + 1
    n_classes = len(np.unique(y_encoded))
    layers = [
        Embedding(input_dim=vocab_size, output_dim=128, weights=[embedding_matrix], input_length=max_len, trainable=False),
        LSTM(128, return_sequences=True),
        SeqSelfAttention(attention_activation='sigmoid'),
        Dropout(0.5),
        LSTM(64, return_sequences=True),
        Dropout(0.5),
        LSTM(64),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# 5-fold cross-validation: a fresh model is trained per fold and the held-out
# predictions of all folds are pooled for the confusion matrix and the report.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # scores is [loss, accuracy]

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")


conf_matrix = confusion_matrix(all_y_test, all_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# classification_report measures the length of every target name, so class
# labels that are not strings (e.g. integers) must be converted first.
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)


plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('LSTM+word2vec Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Read a message from the user and classify it
def classify_message(message, trained_model=None):
    """Predict the class label of a single raw message.

    Args:
        message: Raw message text.
        trained_model: Fitted Keras model used for prediction. Defaults to the
            notebook-level `model`, i.e. the model left over from the last
            cross-validation fold (trained on that fold's training split only).

    Returns:
        The decoded (original) label of the most probable class.
    """
    if trained_model is None:
        # Building a new network here, as before, gave an untrained model
        # whose predictions were meaningless.
        trained_model = model

    cleaned = process_msg(message)
    # Encode with the same Word2Vec vocabulary indices used for training; the
    # `tokenizer` used previously is not defined anywhere in the visible notebook.
    message_pad = get_word2vec_sequences([cleaned])

    prediction = trained_model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(prediction_class)[0]


message = input("Sınıflandırmak istediğiniz mesajı girin: ")
prediction = classify_message(message)
print(f"Tahmin: {prediction}")

In [None]:
#attention GRU+word2vec
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_self_attention import SeqSelfAttention
from gensim.models import Word2Vec

_STOP_WORD_CACHE = {}


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, reading the corpus only on first use."""
    if 'turkish' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['turkish'] = set(stopwords.words('turkish'))
    return _STOP_WORD_CACHE['turkish']


def process_msg(text):
    """Clean one raw message and return its kept tokens joined by single spaces.

    Steps: remove @mentions, http(s) URLs and '#' signs; replace punctuation
    with spaces; lowercase and strip; tokenize with NLTK; keep only tokens that
    are purely alphabetic and are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message, or an empty string when no token survives.
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    filters = '@!"\'#$%&()*+,-./:;<=>[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans({c: " " for c in filters}))
    # str.lower() turns 'İ' (U+0130) into 'i' followed by the combining dot
    # U+0307; that mark is not alphabetic, so every word containing 'İ' would
    # be dropped by isalpha(). Map it to a plain 'i' before lowercasing.
    text = text.replace('İ', 'i').lower().strip()
    stop_words = _turkish_stop_words()
    return ' '.join(
        word for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    )


# Read the labelled messages, drop rows with any missing value and clean the text column.
xt = pd.read_excel('/content/drive/My Drive/TCK_hakaret_tespiti/half29072024_teknofest.xlsx').dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

X, y = xt['mesaj'], xt['label']

# Integer-encode the class labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train Word2Vec (sg=1) on the whitespace-tokenised cleaned messages.
X_processed = list(map(str.split, X))
word2vec_model = Word2Vec(
    sentences=X_processed,
    vector_size=128,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
max_len = 100

# Row 0 stays all-zero; the vector of vocabulary entry k is stored in row k + 1.
embedding_matrix = np.zeros((len(word2vec_model.wv.key_to_index) + 1, 128))
for row, word in enumerate(word2vec_model.wv.index_to_key, start=1):
    embedding_matrix[row] = word2vec_model.wv[word]


def get_word2vec_sequences(texts):
    """Turn cleaned messages into padded sequences of vocabulary indices.

    Every whitespace-separated word found in the Word2Vec vocabulary is
    replaced by its vocabulary index plus one; words outside the vocabulary
    are skipped. The index lists are then handed to pad_sequences with
    maxlen=max_len.

    Args:
        texts: Iterable of cleaned message strings.

    Returns:
        Whatever pad_sequences returns for the index lists.
    """
    vocab = word2vec_model.wv.key_to_index
    sequences = [
        [vocab[token] + 1 for token in text.split() if token in vocab]
        for text in texts
    ]
    return pad_sequences(sequences, maxlen=max_len)

# Encode every cleaned message as a padded index sequence.
X_pad = get_word2vec_sequences(X)
# Vocabulary size plus one: get_word2vec_sequences shifts every index by +1,
# so the largest index equals the vocabulary size.
max_words = len(word2vec_model.wv.key_to_index) + 1
# Number of distinct encoded labels.
num_classes = len(np.unique(y_encoded))


def create_model(vocab_size=None, seq_len=None, n_classes=None):
    """Build and compile the bidirectional-GRU classifier with self-attention.

    All arguments are optional so that both call styles used in this notebook
    work: create_model() and create_model(max_words, max_len, num_classes).

    Args:
        vocab_size: Embedding input_dim. Defaults to the global max_words.
        seq_len: Input sequence length. Defaults to the global max_len.
        n_classes: Number of softmax units. Defaults to the number of
            distinct values in y_encoded.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        'adam' optimizer, accuracy metric).
    """
    # This cell's import block does not bring in GRU or Bidirectional, so
    # import them here; otherwise the first call fails with NameError.
    from tensorflow.keras.layers import GRU, Bidirectional

    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len
    if n_classes is None:
        n_classes = len(np.unique(y_encoded))

    model = Sequential()
    # NOTE(review): embedding_matrix (the Word2Vec vectors) is built earlier in
    # this cell but is not passed to this layer, so the embeddings are learned
    # from scratch - confirm whether weights=[embedding_matrix] was intended.
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len))
    model.add(Bidirectional(GRU(128, return_sequences=True)))
    model.add(SeqSelfAttention(attention_activation='sigmoid'))  # attention layer
    model.add(Dropout(0.5))
    model.add(GRU(64))
    model.add(Dropout(0.5))
    model.add(Dense(n_classes, activation='softmax'))  # output layer
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# 5-fold cross-validation: a fresh model is trained per fold and the held-out
# predictions of all folds are pooled for the confusion matrix and the report.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    # create_model reads the vocabulary size, sequence length and class count
    # from the notebook globals, so it is called without arguments.
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # scores is [loss, accuracy]

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])


conf_matrix = confusion_matrix(all_y_test, all_y_pred)
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")
print("\nClassification Report:")
print(class_report)

plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
# This cell trains the GRU model; the title previously said CNN.
plt.title('GRU+Word2Vec Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Read a message from the user and classify it
def classify_message(message, trained_model=None):
    """Predict the class label of a single raw message.

    Args:
        message: Raw message text.
        trained_model: Fitted Keras model used for prediction. Defaults to the
            notebook-level `model`, i.e. the model left over from the last
            cross-validation fold (trained on that fold's training split only).

    Returns:
        The decoded (original) label of the most probable class.
    """
    if trained_model is None:
        # Building a new network here, as before, gave an untrained model
        # whose predictions were meaningless.
        trained_model = model

    cleaned = process_msg(message)
    # Encode with the same Word2Vec vocabulary indices used for training; the
    # `tokenizer` used previously is not defined anywhere in the visible notebook.
    message_pad = get_word2vec_sequences([cleaned])

    prediction = trained_model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(prediction_class)[0]


message = input("Sınıflandırmak istediğiniz mesajı girin: ")
prediction = classify_message(message)
print(f"Tahmin: {prediction}")

In [None]:
#attention CNN+word2vec
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, GlobalMaxPooling1D, Dense, Dropout
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_self_attention import SeqSelfAttention
from gensim.models import Word2Vec


_STOP_WORD_CACHE = {}


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, reading the corpus only on first use."""
    if 'turkish' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['turkish'] = set(stopwords.words('turkish'))
    return _STOP_WORD_CACHE['turkish']


def process_msg(text):
    """Clean one raw message and return its kept tokens joined by single spaces.

    Steps: remove @mentions, http(s) URLs and '#' signs; replace punctuation
    with spaces; lowercase and strip; tokenize with NLTK; keep only tokens that
    are purely alphabetic and are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message, or an empty string when no token survives.
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    filters = '@!"\'#$%&()*+,-./:;<=>[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans({c: " " for c in filters}))
    # str.lower() turns 'İ' (U+0130) into 'i' followed by the combining dot
    # U+0307; that mark is not alphabetic, so every word containing 'İ' would
    # be dropped by isalpha(). Map it to a plain 'i' before lowercasing.
    text = text.replace('İ', 'i').lower().strip()
    stop_words = _turkish_stop_words()
    return ' '.join(
        word for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    )


# Read the labelled messages, drop rows with any missing value and clean the text column.
xt = pd.read_excel('/content/drive/My Drive/TCK_hakaret_tespiti/half29072024_teknofest.xlsx').dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

X, y = xt['mesaj'], xt['label']

# Integer-encode the class labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train Word2Vec (sg=1) on the whitespace-tokenised cleaned messages.
X_processed = list(map(str.split, X))
word2vec_model = Word2Vec(
    sentences=X_processed,
    vector_size=128,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)
max_len = 100

# Row 0 stays all-zero; the vector of vocabulary entry k is stored in row k + 1.
embedding_matrix = np.zeros((len(word2vec_model.wv.key_to_index) + 1, 128))
for row, word in enumerate(word2vec_model.wv.index_to_key, start=1):
    embedding_matrix[row] = word2vec_model.wv[word]


def get_word2vec_sequences(texts):
    """Turn cleaned messages into padded sequences of vocabulary indices.

    Every whitespace-separated word found in the Word2Vec vocabulary is
    replaced by its vocabulary index plus one; words outside the vocabulary
    are skipped. The index lists are then handed to pad_sequences with
    maxlen=max_len.

    Args:
        texts: Iterable of cleaned message strings.

    Returns:
        Whatever pad_sequences returns for the index lists.
    """
    vocab = word2vec_model.wv.key_to_index
    sequences = [
        [vocab[token] + 1 for token in text.split() if token in vocab]
        for text in texts
    ]
    return pad_sequences(sequences, maxlen=max_len)

# Encode every cleaned message as a padded index sequence.
X_pad = get_word2vec_sequences(X)
# Vocabulary size plus one: get_word2vec_sequences shifts every index by +1,
# so the largest index equals the vocabulary size.
max_words = len(word2vec_model.wv.key_to_index) + 1
# Number of distinct encoded labels.
num_classes = len(np.unique(y_encoded))


def create_model(vocab_size=None, seq_len=None, n_classes=None):
    """Build and compile the 1-D CNN classifier with self-attention.

    All arguments are optional so that both call styles used in this notebook
    work: create_model() and create_model(max_words, max_len, num_classes).

    Args:
        vocab_size: Embedding input_dim. Defaults to the global max_words.
        seq_len: Input sequence length. Defaults to the global max_len.
        n_classes: Number of softmax units. Defaults to the number of
            distinct values in y_encoded.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        'adam' optimizer, accuracy metric).
    """
    if vocab_size is None:
        vocab_size = max_words
    if seq_len is None:
        seq_len = max_len
    if n_classes is None:
        n_classes = len(np.unique(y_encoded))

    model = Sequential()
    # NOTE(review): embedding_matrix (the Word2Vec vectors) is built earlier in
    # this cell but is not passed to this layer, so the embeddings are learned
    # from scratch - confirm whether weights=[embedding_matrix] was intended.
    model.add(Embedding(input_dim=vocab_size, output_dim=128, input_length=seq_len))
    model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))  # first convolution layer
    model.add(MaxPooling1D(pool_size=2))  # first max-pooling layer
    model.add(Conv1D(filters=64, kernel_size=5, activation='relu'))  # second convolution layer
    model.add(MaxPooling1D(pool_size=2))  # second max-pooling layer
    model.add(SeqSelfAttention(attention_activation='sigmoid'))  # attention layer
    model.add(GlobalMaxPooling1D())  # global max pooling over the sequence axis
    model.add(Dropout(0.5))  # dropout layer
    model.add(Dense(n_classes, activation='softmax'))  # output layer
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# 5-fold cross-validation: a fresh model is trained per fold and the held-out
# predictions of all folds are pooled for the confusion matrix and the report.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    # create_model reads the vocabulary size, sequence length and class count
    # from the notebook globals, so it is called without arguments.
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # scores is [loss, accuracy]

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])


conf_matrix = confusion_matrix(all_y_test, all_y_pred)
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")
print("\nClassification Report:")
print(class_report)

plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('CNN+Word2Vec Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

In [None]:
# Read a message from the user and classify it
def classify_message(message, trained_model=None):
    """Predict the class label of a single raw message.

    Args:
        message: Raw message text.
        trained_model: Fitted Keras model used for prediction. Defaults to the
            notebook-level `model`, i.e. the model left over from the last
            cross-validation fold (trained on that fold's training split only).

    Returns:
        The decoded (original) label of the most probable class.
    """
    if trained_model is None:
        # Building a new network here, as before, gave an untrained model
        # whose predictions were meaningless.
        trained_model = model

    cleaned = process_msg(message)
    # Encode with the same Word2Vec vocabulary indices used for training; the
    # `tokenizer` used previously is not defined anywhere in the visible notebook.
    message_pad = get_word2vec_sequences([cleaned])

    prediction = trained_model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)
    return label_encoder.inverse_transform(prediction_class)[0]


message = input("Sınıflandırmak istediğiniz mesajı girin: ")
prediction = classify_message(message)
print(f"Tahmin: {prediction}")

In [None]:
#attention BiLSTM+word2vec
import pandas as pd
import numpy as np
import re
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import seaborn as sns
import matplotlib.pyplot as plt
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from tensorflow.keras.preprocessing.sequence import pad_sequences
from keras_self_attention import SeqSelfAttention
from gensim.models import Word2Vec


_STOP_WORD_CACHE = {}


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, reading the corpus only on first use."""
    if 'turkish' not in _STOP_WORD_CACHE:
        _STOP_WORD_CACHE['turkish'] = set(stopwords.words('turkish'))
    return _STOP_WORD_CACHE['turkish']


def process_msg(text):
    """Clean one raw message and return its kept tokens joined by single spaces.

    Steps: remove @mentions, http(s) URLs and '#' signs; replace punctuation
    with spaces; lowercase and strip; tokenize with NLTK; keep only tokens that
    are purely alphabetic and are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The cleaned message, or an empty string when no token survives.
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    # '.' is part of the filter set so that "a.b" becomes two words instead of
    # one non-alphabetic token that the isalpha() check would discard.
    filters = '@!"\'#$%&()*+,-./:;<=>[\\]^_`{|}~\t\n'
    text = text.translate(str.maketrans({c: " " for c in filters}))
    # str.lower() turns 'İ' (U+0130) into 'i' followed by the combining dot
    # U+0307; that mark is not alphabetic, so every word containing 'İ' would
    # be dropped by isalpha(). Map it to a plain 'i' before lowercasing.
    text = text.replace('İ', 'i').lower().strip()
    stop_words = _turkish_stop_words()
    return ' '.join(
        word for word in word_tokenize(text)
        if word.isalpha() and word not in stop_words
    )


# Read the labelled messages, drop rows with any missing value and clean the text column.
xt = pd.read_excel('/content/drive/My Drive/TCK_hakaret_tespiti/half29072024_teknofest.xlsx').dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

X, y = xt['mesaj'], xt['label']

# Integer-encode the class labels.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Train Word2Vec (sg=1) on the whitespace-tokenised cleaned messages.
X_processed = list(map(str.split, X))
word2vec_model = Word2Vec(
    sentences=X_processed,
    vector_size=128,
    window=5,
    min_count=1,
    workers=4,
    sg=1,
)

max_len = 100

# Row 0 stays all-zero; the vector of vocabulary entry k is stored in row k + 1.
embedding_matrix = np.zeros((len(word2vec_model.wv.key_to_index) + 1, 128))
for row, word in enumerate(word2vec_model.wv.index_to_key, start=1):
    embedding_matrix[row] = word2vec_model.wv[word]


def get_word2vec_sequences(texts):
    """Turn cleaned messages into padded sequences of vocabulary indices.

    Every whitespace-separated word found in the Word2Vec vocabulary is
    replaced by its vocabulary index plus one; words outside the vocabulary
    are skipped. The index lists are then handed to pad_sequences with
    maxlen=max_len.

    Args:
        texts: Iterable of cleaned message strings.

    Returns:
        Whatever pad_sequences returns for the index lists.
    """
    vocab = word2vec_model.wv.key_to_index
    sequences = [
        [vocab[token] + 1 for token in text.split() if token in vocab]
        for text in texts
    ]
    return pad_sequences(sequences, maxlen=max_len)

# Encode every cleaned message as a padded index sequence; this is the array
# that is split into folds and fed to the model below.
X_pad = get_word2vec_sequences(X)


def create_model():
    """Build and compile the bidirectional-LSTM classifier with self-attention.

    The embedding layer is initialised from embedding_matrix and frozen
    (trainable=False). It is followed by Bidirectional(LSTM(128)),
    SeqSelfAttention, Dropout(0.5), Bidirectional(LSTM(64)), Dropout(0.5) and
    a softmax layer with one unit per distinct value in y_encoded.

    Returns:
        The compiled Sequential model (sparse categorical cross-entropy loss,
        'adam' optimizer, accuracy metric).
    """
    vocab_size = len(word2vec_model.wv.key_to_index) + 1
    n_classes = len(np.unique(y_encoded))
    layers = [
        Embedding(input_dim=vocab_size, output_dim=128, weights=[embedding_matrix], input_length=max_len, trainable=False),
        Bidirectional(LSTM(128, return_sequences=True)),  # BiLSTM layer returning the full sequence
        SeqSelfAttention(attention_activation='sigmoid'),  # attention layer
        Dropout(0.5),
        Bidirectional(LSTM(64)),  # BiLSTM layer returning only the final output
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),  # output layer
    ]
    model = Sequential()
    for layer in layers:
        model.add(layer)
    model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# 5-fold cross-validation: a fresh model is trained per fold and the held-out
# predictions of all folds are pooled for the confusion matrix and the report.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # scores is [loss, accuracy]

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")


conf_matrix = confusion_matrix(all_y_test, all_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# classification_report measures the length of every target name, so class
# labels that are not strings (e.g. integers) must be converted first.
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)


plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=target_names, yticklabels=target_names)
plt.title('BiLSTM+word2vec Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
# A stray "s" after this call used to make the whole cell a SyntaxError.
plt.show()