In [1]:
import os
import pandas as pd
from gensim.models import Word2Vec,KeyedVectors
from gensim.utils import simple_preprocess
import numpy as np

In [2]:
df_main = pd.read_csv("HateSpeechTurkish.csv", encoding="utf-8-sig")

In [3]:
# 1. Load the existing KeyedVectors model
current_dir = os.getcwd()
model_path = os.path.join(current_dir, "turkishword2vec", "trmodel")

In [4]:
word_vectors = KeyedVectors.load_word2vec_format(model_path, binary=True)

In [5]:
print(word_vectors.most_similar(positive=["türkiye", "türk"], negative=["suriye"]))

[('atilla', 0.42598018050193787), ('itü', 0.41031748056411743), ('ayhan', 0.4089619219303131), ('sönmez', 0.40660181641578674), ('seyfi', 0.40342816710472107), ('nevzat', 0.3952556550502777), ('irfan', 0.3949110209941864), ('ülkü', 0.3936089277267456), ('şener', 0.3935551047325134), ('vedat', 0.392831951379776)]


In [6]:
# 2. Convert KeyedVectors to a trainable Word2Vec format
word2vec_model = Word2Vec(vector_size=word_vectors.vector_size, min_count=1)
word2vec_model.build_vocab([list(word_vectors.index_to_key)])  
word2vec_model.wv.vectors = word_vectors.vectors 

In [7]:
def custom_tokenize(sentence):
    """
    Cümleyi tokenize eder ve 3 harften az olan kelimeleri filtreler.
    """
    tokens = simple_preprocess(sentence)
    filtered_tokens = [token for token in tokens if len(token) > 3]
    return filtered_tokens

In [8]:
# Tokenize the 'tweet' column using the custom function
tokenized_sentences = df_main['correct_normalize_tweet'].apply(custom_tokenize).tolist()

In [9]:
print("Number of sentences:", len(tokenized_sentences))

Number of sentences: 10224


In [10]:
# Calculate the total tokens
tokens_per_sentence = [len(sentence) for sentence in tokenized_sentences]  # Tokens per sentence
total_tokens = sum(tokens_per_sentence)

# Display the total and average tokens
print(f"Total tokens: {total_tokens}")

Total tokens: 170330


In [11]:
# Step 5: Update the vocabulary with new training data and fine-tune the model
word2vec_model.build_vocab(tokenized_sentences, update=True)
word2vec_model.train(tokenized_sentences, total_examples=len(tokenized_sentences), epochs=4)

(655730, 681320)

In [12]:
# Step 6: Convert the fine-tuned model back to KeyedVectors format
fine_tuned_word_vectors = word2vec_model.wv  # Sadece kelime vektörleri alınır

In [13]:
# Step 8: Test the fine-tuned model
print("Fine-tune edilmiş modelin ilişkileri:")
print(fine_tuned_word_vectors.most_similar(positive=["türkiye", "türk"], negative=["suriye"]))

Fine-tune edilmiş modelin ilişkileri:
[('türkdemek', 0.9047353863716125), ('ülkede', 0.9018970727920532), ('hristiyan', 0.8950584530830383), ('yahudilerdin', 0.8918371200561523), ('şerefsizler', 0.8900843262672424), ('rterdoğan', 0.8895334005355835), ('türkmenlerdi', 0.8853127956390381), ('yemde', 0.8845551609992981), ('enineboyuna', 0.8845038414001465), ('deiste', 0.8841897249221802)]


In [14]:
# Step 7: Save the fine-tuned word vectors
output_path = 'trhatespeechmodel_finetuned.kv'
fine_tuned_word_vectors.save_word2vec_format(output_path, binary=True)