In [1]:
import gensim
from gensim.models import Word2Vec
import pandas as pd
import re

import nltk
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer, PorterStemmer

In [2]:
parameters = [
    {'model_type': 'cbow', 'window': 2, 'vector_size': 100},
    {'model_type': 'skipgram', 'window': 2, 'vector_size': 100},
    {'model_type': 'cbow', 'window': 4, 'vector_size': 100},
    {'model_type': 'skipgram', 'window': 4, 'vector_size': 100},
    {'model_type': 'cbow', 'window': 2, 'vector_size': 300},
    {'model_type': 'skipgram', 'window': 2, 'vector_size': 300},
    {'model_type': 'cbow', 'window': 4, 'vector_size': 300},
    {'model_type': 'skipgram', 'window': 4, 'vector_size': 300}
]

In [3]:
df1 = pd.read_csv("../data/lemmatized_sentences.csv")
df2 = pd.read_csv("../data/stemmed_sentences.csv")



In [4]:
df1.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df1 = df1.dropna()
df1 = df1[df1["0"].str.strip() != ""]

df2.columns = ["0"]

# NaN değerleri ve boş stringleri temizle
df2 = df2.dropna()
df2 = df2[df2["0"].str.strip() != ""]

In [5]:
# Doğru tokenizasyon fonksiyonu
def proper_tokenize(text):
    # Özel karakterleri kaldır ve küçük harfe çevir
    text = re.sub(r'[^a-zA-ZğüşıöçĞÜŞİÖÇ\s]', '', text.lower())
    # NLTK ile tokenize et
    tokens = word_tokenize(text)
    # Stopwords'leri ve tek karakterli kelimeleri kaldır
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words and len(word) > 1]

In [6]:
# Doğru tokenizasyon uygula
df1['tokens'] = df1['0'].apply(proper_tokenize)
df2['tokens'] = df2['0'].apply(proper_tokenize)



In [7]:
# Token listelerini oluştur
tokenized_corpus_lemmatized = df1['tokens'].tolist()
tokenized_corpus_stemmed = df2['tokens'].tolist()

In [8]:

def train_and_save_model(corpus, param, model_prefix):
    model_type = param['model_type']
    vector_size = param['vector_size']
    window = param['window']
    
    # CBOW (sg=0) veya Skip-gram (sg=1)
    sg = 0 if model_type == 'cbow' else 1

    model = Word2Vec(
        sentences=corpus,
        vector_size=vector_size,
        window=window,
        min_count=1,
        workers=4,
        sg=sg
    )

    model_filename = f"{model_prefix}_{model_type}_vs{vector_size}_w{window}.model"
    model.save(model_filename)
    print(f"Model saved as {model_filename}")

In [9]:
# Lemmatize edilmiş corpus ile modelleri eğitme ve kaydetme
for param in parameters:
    train_and_save_model(tokenized_corpus_lemmatized, param, "lemmatized_model")

# Stemlenmiş corpus ile modelleri eğitme ve kaydetme
for param in parameters:
    train_and_save_model(tokenized_corpus_stemmed, param, "stemmed_model")


Model saved as lemmatized_model_cbow_vs100_w2.model
Model saved as lemmatized_model_skipgram_vs100_w2.model
Model saved as lemmatized_model_cbow_vs100_w4.model
Model saved as lemmatized_model_skipgram_vs100_w4.model
Model saved as lemmatized_model_cbow_vs300_w2.model
Model saved as lemmatized_model_skipgram_vs300_w2.model
Model saved as lemmatized_model_cbow_vs300_w4.model
Model saved as lemmatized_model_skipgram_vs300_w4.model
Model saved as stemmed_model_cbow_vs100_w2.model
Model saved as stemmed_model_skipgram_vs100_w2.model
Model saved as stemmed_model_cbow_vs100_w4.model
Model saved as stemmed_model_skipgram_vs100_w4.model
Model saved as stemmed_model_cbow_vs300_w2.model
Model saved as stemmed_model_skipgram_vs300_w2.model
Model saved as stemmed_model_cbow_vs300_w4.model
Model saved as stemmed_model_skipgram_vs300_w4.model


In [10]:
# Model dosyalarını yüklemek
model_1 = Word2Vec.load("../models/lemmatized_model_cbow_vs100_w2.model")
model_2 = Word2Vec.load("../models/lemmatized_model_cbow_vs100_w4.model")
model_3 = Word2Vec.load("../models/lemmatized_model_cbow_vs300_w2.model")
model_4 = Word2Vec.load("../models/lemmatized_model_cbow_vs300_w4.model")
model_5 = Word2Vec.load("../models/lemmatized_model_skipgram_vs100_w2.model")
model_6 = Word2Vec.load("../models/lemmatized_model_skipgram_vs100_w4.model")
model_7 = Word2Vec.load("../models/lemmatized_model_skipgram_vs300_w2.model")
model_8 = Word2Vec.load("../models/lemmatized_model_skipgram_vs300_w4.model")
model_9  = Word2Vec.load("../models/stemmed_model_cbow_vs100_w2.model")
model_10 = Word2Vec.load("../models/stemmed_model_cbow_vs100_w4.model")
model_11 = Word2Vec.load("../models/stemmed_model_cbow_vs300_w2.model")
model_12 = Word2Vec.load("../models/stemmed_model_cbow_vs300_w4.model")
model_13 = Word2Vec.load("../models/stemmed_model_skipgram_vs100_w2.model")
model_14 = Word2Vec.load("../models/stemmed_model_skipgram_vs100_w4.model")
model_15 = Word2Vec.load("../models/stemmed_model_skipgram_vs300_w2.model")
model_16 = Word2Vec.load("../models/stemmed_model_skipgram_vs300_w4.model")

In [14]:
# 'error' kelimesi ile en benzer 3 kelimeyi ve skorlarını yazdırmak
def print_similar_words(model, model_name):
    similarity = model.wv.most_similar("error", topn=3)
    print(f"\n{model_name} Modeli - 'error' ile En Benzer 3 Kelime:")
    for word, score in similarity:
        print(f"Kelime: {word}, Benzerlik Skoru: {score}")

In [16]:
# 16 model için benzer kelimeleri yazdır
print_similar_words(model_1, "Lemmatized CBOW Window 2 Dim 100")
print_similar_words(model_2, "Stemmed Skipgram Window 4 Dim 100")
print_similar_words(model_3, "Lemmatized Skipgram Window 2 Dim 300")
print_similar_words(model_4, "lemmatized skipgram window 4 dim 100")
print_similar_words(model_5, "lemmatized cbow window 2 dim 300")
print_similar_words(model_6, "lemmatizedskipgramwindow 2 dim300")
print_similar_words(model_7, "lemmatized_cbow_window 4_dim300")
print_similar_words(model_8, "lemmatized_skipgram_window4_dim300.model")
print_similar_words(model_9, "stemmed_cbow_window2_dim100")
print_similar_words(model_10, "stemmed_skipgram_window2_dim100")
print_similar_words(model_11, "stemmed_cbow_window4_dim100")
print_similar_words(model_12, "stemmed_skipgram_window4_dim100")
print_similar_words(model_13, "stemmed_cbow_window2_dim300")
print_similar_words(model_14, "stemmed_skipgram_window2_dim300")
print_similar_words(model_15, "stemmed_cbow_window4_dim300")
print_similar_words(model_16, "stemmed_skipgram_window4_dim300")


Lemmatized CBOW Window 2 Dim 100 Modeli - 'error' ile En Benzer 3 Kelime:
Kelime: heater, Benzerlik Skoru: 0.27483057975769043
Kelime: necessary, Benzerlik Skoru: 0.26221245527267456
Kelime: distributor, Benzerlik Skoru: 0.2495647817850113

Stemmed Skipgram Window 4 Dim 100 Modeli - 'error' ile En Benzer 3 Kelime:
Kelime: heater, Benzerlik Skoru: 0.28107109665870667
Kelime: necessary, Benzerlik Skoru: 0.26783761382102966
Kelime: alarm, Benzerlik Skoru: 0.25009340047836304

Lemmatized Skipgram Window 2 Dim 300 Modeli - 'error' ile En Benzer 3 Kelime:
Kelime: heartbeat, Benzerlik Skoru: 0.1780272275209427
Kelime: output, Benzerlik Skoru: 0.1670265793800354
Kelime: high, Benzerlik Skoru: 0.15835227072238922

lemmatized skipgram window 4 dim 100 Modeli - 'error' ile En Benzer 3 Kelime:
Kelime: heartbeat, Benzerlik Skoru: 0.17919139564037323
Kelime: output, Benzerlik Skoru: 0.17486067116260529
Kelime: high, Benzerlik Skoru: 0.17238584160804749

lemmatized cbow window 2 dim 300 Modeli - 'er

In [17]:
# Veri setinizde en sık geçen 20 kelime
from collections import Counter
all_words = [word for sentence in tokenized_corpus_lemmatized for word in sentence]
print("En sık kullanılan 20 kelime:", Counter(all_words).most_common(20))

En sık kullanılan 20 kelime: [('repair', 47), ('check', 46), ('circuit', 41), ('error', 37), ('sensor', 37), ('low', 37), ('perform', 31), ('high', 30), ('fuel', 29), ('oil', 23), ('unit', 23), ('failure', 21), ('control', 21), ('necessary', 21), ('air', 18), ('malfunction', 18), ('pressure', 17), ('load', 17), ('temperature', 16), ('operating', 16)]
