In [None]:
#one-hot encoding+Attention head LSTM classification
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from keras_self_attention import SeqSelfAttention
import matplotlib.pyplot as plt



# Punctuation that is replaced by a space before tokenizing. '?' and '.' are not
# in this table, but standalone '?' / '.' tokens are still discarded later by the
# isalpha() filter. Built once instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {c: " " for c in '@!"\'#$%&()*+,-/:;<=>[\\]^_`{|}~\t\n'}
)

# Turkish casing: str.lower() maps 'İ' to 'i' + U+0307 (a combining dot), which
# makes str.isalpha() False and silently drops the whole word, and maps 'I' to
# 'i' instead of the dotless 'ı'. Translate both explicitly before lower().
_TURKISH_CASE_MAP = str.maketrans({"İ": "i", "I": "ı"})

# Lazily filled cache so the NLTK stop-word list is read only once.
_STOP_WORDS_CACHE = None


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, loading it on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words('turkish'))
    return _STOP_WORDS_CACHE


def process_msg(text):
    """Clean one raw message for tokenization.

    Steps: strip @mentions, http/https links and '#' characters; replace
    punctuation with spaces; lowercase with Turkish-aware handling of 'İ'/'I';
    tokenize; keep only purely alphabetic tokens that are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    text = text.translate(_PUNCTUATION_TO_SPACE)
    text = text.translate(_TURKISH_CASE_MAP).lower()
    text = text.strip()

    stop_words = _turkish_stop_words()
    words = word_tokenize(text)
    words_filtered = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words_filtered)


# Location of the labelled message spreadsheet; change this single constant when
# the data lives somewhere else.
DATA_PATH = r'C:\Users\habib\OneDrive\Desktop\TeknoFest\half29072024_teknofest.xlsx'

# Load the sheet, drop rows with any missing value, then clean every message.
# (The former `xt['label'] = xt['label']` self-assignment was a no-op and is removed.)
xt = pd.read_excel(DATA_PATH)
xt = xt.dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

# Encode the class labels as integers for sparse categorical cross-entropy.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(xt['label'])

max_words = 10000  # vocabulary cap passed to the Tokenizer (num_words)
max_len = 100      # every sequence is padded / truncated to this length
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(xt['mesaj'])

# Convert the cleaned messages to integer sequences and pad them to max_len.
X_seq = tokenizer.texts_to_sequences(xt['mesaj'])
X_pad = pad_sequences(X_seq, maxlen=max_len)


def create_model():
    """Build and compile the stacked-LSTM classifier with self-attention.

    Layer order: embedding -> LSTM(128, sequences) -> self-attention -> dropout
    -> LSTM(64, sequences) -> dropout -> LSTM(64) -> dropout -> softmax output
    with one unit per distinct value in ``y_encoded``.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy, Adam,
        accuracy metric).
    """
    n_classes = len(np.unique(y_encoded))
    layer_stack = [
        Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
        LSTM(128, return_sequences=True),
        SeqSelfAttention(attention_activation='sigmoid'),
        Dropout(0.5),
        LSTM(64, return_sequences=True),
        Dropout(0.5),
        LSTM(64),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network


# 5-fold cross-validation: a fresh model is trained for every fold and scored on
# the held-out split; predictions from all folds are pooled for the reports.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # [loss, accuracy] -> keep the accuracy

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")

conf_matrix = confusion_matrix(all_y_test, all_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Report with the real class names (same order as the encoded ids) so the text
# report matches the heatmap axes instead of showing opaque integer ids.
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)

# seaborn was used below without ever being imported (NameError on `sns`).
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

#test ekranı
def classify_message(message):
    """Predict the class label of a single raw message.

    The text goes through the same cleaning, tokenizing and padding as the
    training data. Previously a brand-new, *untrained* model was created for
    each call, so the prediction was effectively random. Now a model is trained
    on the full data set on the first call and cached on this function object;
    later calls only run inference. Re-running the cell redefines the function
    and therefore resets the cache.

    Args:
        message: Raw message text.

    Returns:
        The decoded (original) label predicted for the message.
    """
    model = getattr(classify_message, "_trained_model", None)
    if model is None:
        model = create_model()
        model.fit(X_pad, y_encoded, epochs=10, batch_size=32, verbose=0)
        classify_message._trained_model = model

    message = process_msg(message)
    message_seq = tokenizer.texts_to_sequences([message])
    message_pad = pad_sequences(message_seq, maxlen=max_len)

    prediction = model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)

    return label_encoder.inverse_transform(prediction_class)[0]


# Read one message from the user, classify it and show the predicted label.
message = input("Sınıflandırmak istediğiniz mesajı girin: ")
prediction = classify_message(message)
print("Tahmin: {}".format(prediction))

In [None]:
#one-hot encoding+Attention head GRU classification
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense, Dropout
from keras_self_attention import SeqSelfAttention
from tensorflow.keras.layers import Bidirectional
import matplotlib.pyplot as plt

 
# Punctuation that is replaced by a space before tokenizing. '?' and '.' are not
# in this table, but standalone '?' / '.' tokens are still discarded later by the
# isalpha() filter. Built once instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {c: " " for c in '@!"\'#$%&()*+,-/:;<=>[\\]^_`{|}~\t\n'}
)

# Turkish casing: str.lower() maps 'İ' to 'i' + U+0307 (a combining dot), which
# makes str.isalpha() False and silently drops the whole word, and maps 'I' to
# 'i' instead of the dotless 'ı'. Translate both explicitly before lower().
_TURKISH_CASE_MAP = str.maketrans({"İ": "i", "I": "ı"})

# Lazily filled cache so the NLTK stop-word list is read only once.
_STOP_WORDS_CACHE = None


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, loading it on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words('turkish'))
    return _STOP_WORDS_CACHE


def process_msg(text):
    """Clean one raw message for tokenization.

    Steps: strip @mentions, http/https links and '#' characters; replace
    punctuation with spaces; lowercase with Turkish-aware handling of 'İ'/'I';
    tokenize; keep only purely alphabetic tokens that are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    text = text.translate(_PUNCTUATION_TO_SPACE)
    text = text.translate(_TURKISH_CASE_MAP).lower()
    text = text.strip()

    stop_words = _turkish_stop_words()
    words = word_tokenize(text)
    words_filtered = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words_filtered)

 
# Location of the labelled message spreadsheet; change this single constant when
# the data lives somewhere else.
DATA_PATH = r'C:\Users\habib\OneDrive\Desktop\TeknoFest\half29072024_teknofest.xlsx'

# Load the sheet, drop rows with any missing value, then clean every message.
# (The former `xt['label'] = xt['label']` self-assignment was a no-op and is removed.)
xt = pd.read_excel(DATA_PATH)
xt = xt.dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

# Encode the class labels as integers for sparse categorical cross-entropy.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(xt['label'])

max_words = 10000  # vocabulary cap passed to the Tokenizer (num_words)
max_len = 100      # every sequence is padded / truncated to this length
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(xt['mesaj'])

# Convert the cleaned messages to integer sequences and pad them to max_len.
X_seq = tokenizer.texts_to_sequences(xt['mesaj'])
X_pad = pad_sequences(X_seq, maxlen=max_len)

 
def create_model():
    """Build and compile the bidirectional-GRU classifier with self-attention.

    Layer order: embedding -> Bidirectional(GRU(128, sequences)) ->
    self-attention -> dropout -> GRU(64) -> dropout -> softmax output with one
    unit per distinct value in ``y_encoded``.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy, Adam,
        accuracy metric).
    """
    n_classes = len(np.unique(y_encoded))
    layer_stack = [
        Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
        Bidirectional(GRU(128, return_sequences=True)),
        SeqSelfAttention(attention_activation='sigmoid'),
        Dropout(0.5),
        GRU(64),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

 
# 5-fold cross-validation: a fresh model is trained for every fold and scored on
# the held-out split; predictions from all folds are pooled for the reports.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # [loss, accuracy] -> keep the accuracy

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")

conf_matrix = confusion_matrix(all_y_test, all_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# A stray, indented ` t` fragment used to sit here and stopped the cell from
# compiling (IndentationError); it has been removed.
# Report with the real class names (same order as the encoded ids) so the text
# report matches the heatmap axes instead of showing opaque integer ids.
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)

# seaborn was used below without ever being imported (NameError on `sns`).
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('GRU+One-hot Encoding Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()

# Test ekranı
def classify_message(message):
    """Predict the class label of a single raw message.

    The text goes through the same cleaning, tokenizing and padding as the
    training data. Previously the whole network was retrained from scratch
    (10 epochs on all data) for *every* message. Now the model is trained on the
    first call and cached on this function object; later calls only run
    inference. Re-running the cell redefines the function and therefore resets
    the cache.

    Args:
        message: Raw message text.

    Returns:
        The decoded (original) label predicted for the message.
    """
    model = getattr(classify_message, "_trained_model", None)
    if model is None:
        model = create_model()
        model.fit(X_pad, y_encoded, epochs=10, batch_size=32, verbose=0)
        classify_message._trained_model = model

    message = process_msg(message)
    message_seq = tokenizer.texts_to_sequences([message])
    message_pad = pad_sequences(message_seq, maxlen=max_len)

    prediction = model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)

    return label_encoder.inverse_transform(prediction_class)[0]

 
# Keep classifying user-entered messages until the user types 'q' (any case).
_prompt = "Sınıflandırmak istediğiniz metni girin (çıkış için 'q' girin): "
message = input(_prompt)
while message.lower() != 'q':
    prediction = classify_message(message)
    print("Tahmin: {}".format(prediction))
    message = input(_prompt)

In [None]:
#one-hot encoding+Attention head CNN classification
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, MaxPooling1D, Dense, Dropout
from keras_self_attention import SeqSelfAttention
from tensorflow.keras.layers import GlobalMaxPooling1D
import matplotlib.pyplot as plt

 
# Punctuation that is replaced by a space before tokenizing. '?' and '.' are not
# in this table, but standalone '?' / '.' tokens are still discarded later by the
# isalpha() filter. Built once instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {c: " " for c in '@!"\'#$%&()*+,-/:;<=>[\\]^_`{|}~\t\n'}
)

# Turkish casing: str.lower() maps 'İ' to 'i' + U+0307 (a combining dot), which
# makes str.isalpha() False and silently drops the whole word, and maps 'I' to
# 'i' instead of the dotless 'ı'. Translate both explicitly before lower().
_TURKISH_CASE_MAP = str.maketrans({"İ": "i", "I": "ı"})

# Lazily filled cache so the NLTK stop-word list is read only once.
_STOP_WORDS_CACHE = None


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, loading it on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words('turkish'))
    return _STOP_WORDS_CACHE


def process_msg(text):
    """Clean one raw message for tokenization.

    Steps: strip @mentions, http/https links and '#' characters; replace
    punctuation with spaces; lowercase with Turkish-aware handling of 'İ'/'I';
    tokenize; keep only purely alphabetic tokens that are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    text = text.translate(_PUNCTUATION_TO_SPACE)
    text = text.translate(_TURKISH_CASE_MAP).lower()
    text = text.strip()

    stop_words = _turkish_stop_words()
    words = word_tokenize(text)
    words_filtered = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words_filtered)

 
# Location of the labelled message spreadsheet; change this single constant when
# the data lives somewhere else.
DATA_PATH = r'C:\Users\habib\OneDrive\Desktop\TeknoFest\half29072024_teknofest.xlsx'

# Load the sheet, drop rows with any missing value, then clean every message.
# (The former `xt['label'] = xt['label']` self-assignment was a no-op and is removed.)
xt = pd.read_excel(DATA_PATH)
xt = xt.dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

# Encode the class labels as integers for sparse categorical cross-entropy.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(xt['label'])

max_words = 10000  # at most 10,000 words (vocabulary cap passed to the Tokenizer)
max_len = 100      # maximum length of every text sequence
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(xt['mesaj'])

# A stray, indented ` me` fragment used to sit here and stopped the cell from
# compiling (IndentationError); it has been removed.
# Convert the cleaned messages to integer sequences and pad them to max_len.
X_seq = tokenizer.texts_to_sequences(xt['mesaj'])
X_pad = pad_sequences(X_seq, maxlen=max_len)

 
def create_model():
    """Build and compile the 1-D CNN classifier with self-attention.

    Layer order: embedding -> two Conv1D(64, kernel 5, ReLU) + MaxPooling1D(2)
    stages -> self-attention -> global max pooling -> dropout -> softmax output
    with one unit per distinct value in ``y_encoded``.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy, Adam,
        accuracy metric).
    """
    n_classes = len(np.unique(y_encoded))
    layer_stack = [
        Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
        Conv1D(filters=64, kernel_size=5, activation='relu'),   # first convolution
        MaxPooling1D(pool_size=2),                               # first max pooling
        Conv1D(filters=64, kernel_size=5, activation='relu'),   # second convolution
        MaxPooling1D(pool_size=2),                               # second max pooling
        SeqSelfAttention(attention_activation='sigmoid'),        # attention layer
        GlobalMaxPooling1D(),                                    # collapse the time axis
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),                  # output layer
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

 
# 5-fold cross-validation: a fresh model is trained for every fold and scored on
# the held-out split; predictions from all folds are pooled for the reports.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # [loss, accuracy] -> keep the accuracy

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")

conf_matrix = confusion_matrix(all_y_test, all_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Report with the real class names (same order as the encoded ids) so the text
# report matches the heatmap axes instead of showing opaque integer ids.
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)

# seaborn was used below without ever being imported (NameError on `sns`).
import seaborn as sns

plt.figure(figsize=(10, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=label_encoder.classes_, yticklabels=label_encoder.classes_)
plt.title('CNN+One-hot Encoding Confusion Matrix')
plt.xlabel('Predicted')
plt.ylabel('Actual')
plt.show()


# test ekranı
def classify_message(message):
    """Predict the class label of a single raw message.

    The text goes through the same cleaning, tokenizing and padding as the
    training data. Previously the whole network was retrained from scratch
    (10 epochs on all data) for *every* message. Now the model is trained on the
    first call and cached on this function object; later calls only run
    inference. Re-running the cell redefines the function and therefore resets
    the cache.

    Args:
        message: Raw message text.

    Returns:
        The decoded (original) label predicted for the message.
    """
    model = getattr(classify_message, "_trained_model", None)
    if model is None:
        model = create_model()
        model.fit(X_pad, y_encoded, epochs=10, batch_size=32, verbose=0)
        classify_message._trained_model = model

    message = process_msg(message)
    message_seq = tokenizer.texts_to_sequences([message])
    message_pad = pad_sequences(message_seq, maxlen=max_len)

    prediction = model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)

    return label_encoder.inverse_transform(prediction_class)[0]


# Keep classifying user-entered messages until the user types 'q' (any case).
_prompt = "Sınıflandırmak istediğiniz metni girin (çıkış için 'q' girin): "
message = input(_prompt)
while message.lower() != 'q':
    prediction = classify_message(message)
    print("Tahmin: {}".format(prediction))
    message = input(_prompt)

In [None]:
#one-hot encoding+Attention head BiLSTM classification
import pandas as pd
import numpy as np
import re
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split, KFold
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout, Bidirectional
from keras_self_attention import SeqSelfAttention
import matplotlib.pyplot as plt
 
# Punctuation that is replaced by a space before tokenizing. '?' and '.' are not
# in this table, but standalone '?' / '.' tokens are still discarded later by the
# isalpha() filter. Built once instead of on every call.
_PUNCTUATION_TO_SPACE = str.maketrans(
    {c: " " for c in '@!"\'#$%&()*+,-/:;<=>[\\]^_`{|}~\t\n'}
)

# Turkish casing: str.lower() maps 'İ' to 'i' + U+0307 (a combining dot), which
# makes str.isalpha() False and silently drops the whole word, and maps 'I' to
# 'i' instead of the dotless 'ı'. Translate both explicitly before lower().
_TURKISH_CASE_MAP = str.maketrans({"İ": "i", "I": "ı"})

# Lazily filled cache so the NLTK stop-word list is read only once.
_STOP_WORDS_CACHE = None


def _turkish_stop_words():
    """Return the NLTK Turkish stop-word set, loading it on first use only."""
    global _STOP_WORDS_CACHE
    if _STOP_WORDS_CACHE is None:
        _STOP_WORDS_CACHE = set(stopwords.words('turkish'))
    return _STOP_WORDS_CACHE


def process_msg(text):
    """Clean one raw message for tokenization.

    Steps: strip @mentions, http/https links and '#' characters; replace
    punctuation with spaces; lowercase with Turkish-aware handling of 'İ'/'I';
    tokenize; keep only purely alphabetic tokens that are not Turkish stop words.

    Args:
        text: Raw message string.

    Returns:
        The surviving tokens joined by single spaces (may be an empty string).
    """
    text = re.sub(r'(@|https?)\S+|#', '', text)
    text = text.translate(_PUNCTUATION_TO_SPACE)
    text = text.translate(_TURKISH_CASE_MAP).lower()
    text = text.strip()

    stop_words = _turkish_stop_words()
    words = word_tokenize(text)
    words_filtered = [word for word in words if word.isalpha() and word not in stop_words]
    return ' '.join(words_filtered)

 
# Location of the labelled message spreadsheet; change this single constant when
# the data lives somewhere else.
DATA_PATH = r'C:\Users\habib\OneDrive\Desktop\TeknoFest\half29072024_teknofest.xlsx'

# Load the sheet, drop rows with any missing value, then clean every message.
# (The former `xt['label'] = xt['label']` self-assignment was a no-op and is removed.)
xt = pd.read_excel(DATA_PATH)
xt = xt.dropna()
xt['mesaj'] = xt['mesaj'].astype(str).apply(process_msg)

# Encode the class labels as integers for sparse categorical cross-entropy.
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(xt['label'])

max_words = 10000  # vocabulary cap passed to the Tokenizer (num_words)
max_len = 100      # every sequence is padded / truncated to this length
tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(xt['mesaj'])

# Convert the cleaned messages to integer sequences and pad them to max_len.
X_seq = tokenizer.texts_to_sequences(xt['mesaj'])
X_pad = pad_sequences(X_seq, maxlen=max_len)

 
def create_model():
    """Build and compile the bidirectional-LSTM classifier with self-attention.

    Layer order: embedding -> Bidirectional(LSTM(128, sequences)) ->
    self-attention -> dropout -> Bidirectional(LSTM(64)) -> dropout -> softmax
    output with one unit per distinct value in ``y_encoded``.

    Returns:
        A compiled Sequential model (sparse categorical cross-entropy, Adam,
        accuracy metric).
    """
    n_classes = len(np.unique(y_encoded))
    layer_stack = [
        Embedding(input_dim=max_words, output_dim=128, input_length=max_len),
        Bidirectional(LSTM(128, return_sequences=True)),
        SeqSelfAttention(attention_activation='sigmoid'),
        Dropout(0.5),
        Bidirectional(LSTM(64)),
        Dropout(0.5),
        Dense(n_classes, activation='softmax'),
    ]
    network = Sequential(layer_stack)
    network.compile(
        loss='sparse_categorical_crossentropy',
        optimizer='adam',
        metrics=['accuracy'],
    )
    return network

 
# 5-fold cross-validation: a fresh model is trained for every fold and scored on
# the held-out split; predictions from all folds are pooled for the reports.
kfold = KFold(n_splits=5, shuffle=True, random_state=42)
cv_scores = []
all_y_pred = []
all_y_test = []

for train, test in kfold.split(X_pad, y_encoded):
    model = create_model()
    model.fit(X_pad[train], y_encoded[train], epochs=10, batch_size=32, verbose=0)
    scores = model.evaluate(X_pad[test], y_encoded[test], verbose=0)
    cv_scores.append(scores[1])  # [loss, accuracy] -> keep the accuracy

    y_pred = model.predict(X_pad[test])
    y_pred_classes = np.argmax(y_pred, axis=1)
    all_y_pred.extend(y_pred_classes)
    all_y_test.extend(y_encoded[test])

print(f"Cross-validation scores: {cv_scores}")
print(f"Mean cross-validation score: {np.mean(cv_scores)}")

conf_matrix = confusion_matrix(all_y_test, all_y_pred)
print("Confusion Matrix:")
print(conf_matrix)

# Report with the real class names (same order as the encoded ids) instead of
# opaque integer ids, so the rows can be read without the encoder mapping.
target_names = [str(cls) for cls in label_encoder.classes_]
class_report = classification_report(all_y_test, all_y_pred, target_names=target_names)
print("\nClassification Report:")
print(class_report)

# Test ekranı
def classify_message(message):
    """Predict the class label of a single raw message.

    The text goes through the same cleaning, tokenizing and padding as the
    training data. Previously the whole network was retrained from scratch
    (10 epochs on all data) for *every* message. Now the model is trained on the
    first call and cached on this function object; later calls only run
    inference. Re-running the cell redefines the function and therefore resets
    the cache.

    Args:
        message: Raw message text.

    Returns:
        The decoded (original) label predicted for the message.
    """
    model = getattr(classify_message, "_trained_model", None)
    if model is None:
        model = create_model()
        model.fit(X_pad, y_encoded, epochs=10, batch_size=32, verbose=0)
        classify_message._trained_model = model

    message = process_msg(message)
    message_seq = tokenizer.texts_to_sequences([message])
    message_pad = pad_sequences(message_seq, maxlen=max_len)

    prediction = model.predict(message_pad)
    prediction_class = np.argmax(prediction, axis=1)

    return label_encoder.inverse_transform(prediction_class)[0]

 
# Keep classifying user-entered messages until the user types 'q' (any case).
_prompt = "Sınıflandırmak istediğiniz metni girin (çıkış için 'q' girin): "
message = input(_prompt)
while message.lower() != 'q':
    prediction = classify_message(message)
    print("Tahmin: {}".format(prediction))
    message = input(_prompt)