# Downloading related libraries

In [2]:
pip install jpype1




In [3]:
import pandas as pd
import numpy as np

# For NLP libraries
import nltk
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords
import string
import re

# Zemberek setup
import jpype
from jpype import JClass, JString, getDefaultJVMPath, shutdownJVM, startJVM



In [None]:
ZEMBEREK_PATH = 'algorithm/zemberek-full.jar'

In [6]:
try:
    # Check if JVM is already started to avoid errors
    if not jpype.isJVMStarted():
        startJVM(getDefaultJVMPath(), '-ea', classpath=ZEMBEREK_PATH)

    # Initialize Zemberek Morphology
    TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
    morphology = TurkishMorphology.createWithDefaults()
    print("Zemberek JVM started and Morphology initialized.")
except JVMException as e:
    print(f"Error starting JVM/Zemberek: {e}")
    print("Please ensure ZEMBEREK_PATH is correct and Java JDK is installed.")

Zemberek JVM started and Morphology initialized.


# Loading Data

In [None]:
review = pd.read_csv("data/e-ticaret_urun_yorumlari.csv", delimiter = ';')

In [10]:
review.columns = ['Comment', 'Sentiment']

In [11]:
review.head()

Unnamed: 0,Comment,Sentiment
0,evet anlatıldığı gibi,1
1,Daha öncede almıştım bu cihazdan ense ve sakal...,1
2,Ürün gayet başarılı sakal kesmede başlık sayıs...,1
3,Daha öncede aynısını almıştım çok güzel ve kal...,1
4,Erkek kuaförüyüm ense ve sıfır sakal traşı içi...,1


In [12]:
def character_switch(text):
    text = text.lower()
    text = text.replace('ç', 'c')
    text = text.replace('ğ', 'g')
    text = text.replace('ü', 'u')
    text = text.replace('ö', 'o')
    text = text.replace('ş', 's')
    text = text.replace('ı', 'i')

    return text



review['comment_english'] = review['Comment'].apply(character_switch)

In [13]:
review.head()

Unnamed: 0,Comment,Sentiment,comment_english
0,evet anlatıldığı gibi,1,evet anlatildigi gibi
1,Daha öncede almıştım bu cihazdan ense ve sakal...,1,daha oncede almistim bu cihazdan ense ve sakal...
2,Ürün gayet başarılı sakal kesmede başlık sayıs...,1,urun gayet basarili sakal kesmede baslik sayis...
3,Daha öncede aynısını almıştım çok güzel ve kal...,1,daha oncede aynisini almistim cok guzel ve kal...
4,Erkek kuaförüyüm ense ve sıfır sakal traşı içi...,1,erkek kuaforuyum ense ve sifir sakal trasi ici...


# Removing Stop words

In [14]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

In [15]:
# Load Turkish Stopwords and extend with punctuation
stop_words = stopwords.words('turkish')
# Add punctuation marks to the list
stop_words.extend(list(string.punctuation))

print(f"Number of Turkish stopwords: {len(stopwords.words('turkish'))}")

Number of Turkish stopwords: 53


We firstly wanted to use stopwords from nltk library. However, their result was 53 words, which seems to low. So we decided to use custom dataset from kaggle for this purpose.

In [None]:
custom_stopwords = pd.read_json("custom/turkce_stopwords.json")

In [18]:
print("Stopwords DataFrame Shape:", custom_stopwords.shape)
print("\nFirst 5 Stopwords (Check Column Name/Structure):")
print(custom_stopwords.head(5))

Stopwords DataFrame Shape: (504, 1)

First 5 Stopwords (Check Column Name/Structure):
    stopwords
0           a
1       acaba
2        acep
3  adamakıllı
4       adeta


Here more words are aviable, which is good.

In [19]:
turkish_stopwords_set = set(custom_stopwords['stopwords'].str.lower())

Later we use TweetTokenizer. We will split each word in our dataset and see if they are aviable in stopwords or not. While we are splitting, we approach each word, including punctuation mark seperately. However, there are some emojis, that we don't want them to get seperated, such as :), :(, :)), they have a meaning in our dataset, such as :) positive, :( negative and so on. Considering all these, we used TweetTokenizer.

In [20]:
tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)

In [21]:
def remove_stopwords(dataset, stopwords_set):
  cleaned_data = []
  for text in dataset:
    tokens = tokenizer.tokenize(text.lower())
    filtered_tokens = [word for word in tokens if word not in stopwords_set]
    cleaned_data.append(" ".join(filtered_tokens))

  return cleaned_data

In [23]:
review["comment_english"] = remove_stopwords(review["comment_english"], turkish_stopwords_set)
review["copy_comment"] = remove_stopwords(review["Comment"], turkish_stopwords_set)

Example row from dataset. There is a word like "amma", but after removing stop words, we will remove "amma" from here

In [24]:
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)


print(review.loc[[310], ["Comment", "comment_english"]])
print(review.loc[[310], ["Comment", "copy_comment"]])

                                                                                                              Comment  \
310  ürünü begendim Amma taraklar hariç...sadece kısa kesim yapacaklar için uygun.saç kesiminde de performansı iyi...   

                                                                                      comment_english  
310  urunu begendim taraklar haric ... kisa kesim yapacaklar icin uygun.sac kesiminde performansi ...  
                                                                                                              Comment  \
310  ürünü begendim Amma taraklar hariç...sadece kısa kesim yapacaklar için uygun.saç kesiminde de performansı iyi...   

                                                                                copy_comment  
310  ürünü begendim taraklar ... kısa kesim yapacaklar uygun . saç kesiminde performansı ...  


### Checking emojis are there or not

In [26]:
emoji_pattern_ascii = r'[:;=]-?[)D(P]'

emoji_reviews = review[
    review["comment_english"].str.contains(emoji_pattern_ascii, regex=True, na=False)
]

print(f"\n--- Reviews with ASCII Emoticons ({len(emoji_reviews)} found) ---")
print(emoji_reviews[["Comment", "comment_english"]].head())


--- Reviews with ASCII Emoticons (301 found) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                     Comment  \
192                                                                                                                                                                                                                   

# Preprocessing Data

## STREM and FPS (Fixed Prefix Stemmer)

In [27]:
punctuation = set(string.punctuation)

In [28]:
def preprocess_turkish_text(dataset, stopwords_set, stemming_method='none', prefix_length=None):
    """
    Applies Tokenization, Stopword Removal (STREM), and Fixed Prefix Stemming (FPS).
    """
    cleaned_data = []
    for text in dataset:
        if not text:
            cleaned_data.append("")
            continue
        tokens = tokenizer.tokenize(text.lower())
        filtered_tokens = []
        for word in tokens:
            is_stopword = word in stopwords_set
            is_punctuation = word in punctuation

            if not is_stopword and not is_punctuation:
                if stemming_method == 'fps' and prefix_length is not None:
                    # FPS: Take the first 'n' characters of the word
                    stemmed_word = word[:prefix_length]
                    filtered_tokens.append(stemmed_word)
                elif stemming_method == 'none':
                    # STREM Only
                    filtered_tokens.append(word)
        cleaned_data.append(" ".join(filtered_tokens))
    return pd.Series(cleaned_data, index=dataset.index)

In [29]:
review["STREM+FPS3"] = preprocess_turkish_text(review["comment_english"], turkish_stopwords_set, stemming_method='fps', prefix_length=3)
review["STREM+FPS5"] = preprocess_turkish_text(review["comment_english"], turkish_stopwords_set, stemming_method='fps', prefix_length=5)
review["STREM+FPS7"] = preprocess_turkish_text(review["comment_english"], turkish_stopwords_set, stemming_method='fps', prefix_length=7)


In [31]:
# Set display options to see full text
pd.set_option('display.max_colwidth', None)

# Select the original column and all new processed columns
comparison_columns = [
    "comment_english",
    "STREM+FPS3",
    "STREM+FPS5",
    "STREM+FPS7",
]

# Display the comparison for a sample row (like 312)
print("--- Comparison of Turkish Preprocessing Methods (Row 312) ---")
print(review.loc[[312], comparison_columns].T)

--- Comparison of Turkish Preprocessing Methods (Row 312) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                 312
comment_english  urun guzel uzun zamandir takip ettigim urun kargo hizli parcalar eksik fabrikadan son paketleyen yada gelene ustune nikelajinda yuzunde darbeler tane diyecem 7-8 adet darbe cizik kullandikca komple nikelaji atmazsa iyidir bakalim kesimi nasil denemedim henuz paketi yeni actim firma yetkililerine selamlar dikkate alirsiniz umarim kalite kontrol yada paketlemeden once son bi urunlere bakarsaniz sagina soluna jelatin yaparsaniz cihaza
STREM+FPS3                      

These models are all good, but not ideal, as STREM do not consider finding root, and FPS algorithms just choose first 3, 5 and 7 letters without actually considering for meaning.

## Zemberek algorithm

Zemberek is an open-source NLP library specifically developed for the Turkish language. Written in Java, it is designed to handle the structural challenges of Turkish. It can perform tasks such as morphological analysis, grammar checking, sentence parsing, and many more operations on Turkish texts.

In [32]:
def zemberek_stem(word, morphology_instance):
    """
    Zemberek'in en güvenilir analiz metoduyla (analyzeAndDisambiguate) lema bulma.
    """
    try:
        # Python dizesini açıkça Java dizesine dönüştür
        java_word = JString(word)

        # 1. Analiz et ve anlam karmaşasını çöz
        analysis_result = morphology_instance.analyzeAndDisambiguate(java_word)
        best_analysis_list = analysis_result.bestAnalysis()

        if best_analysis_list and best_analysis_list.size() > 0:
            # En iyi analizi al
            best_analysis = best_analysis_list.get(0)

            # **KÖK/LEMA BULMA KRİTİK ADIM**
            # Analizin Lemmalar listesini al ve ilkini (en temelini) kullan.
            lemmas = best_analysis.getLemmas()

            if lemmas.size() > 0:
                # Java listesinden lemayı al ve Python dizesine dönüştür
                lemma = str(lemmas.get(0))

                # 'UNK' (Bilinmiyor) kontrolü (Gerekirse)
                if lemma != 'UNK':
                    return lemma

        # Analiz başarısız olursa veya lemma 'UNK' ise orijinal kelimeyi döndür.
        return word
    except Exception as e:
        # Hata ayıklama için print(f"Zemberek Hata: {word} -> {e}")
        return word

In [33]:
def preprocess_turkish_text_with_zemberek(dataset, stopwords_set, morphology_instance):
    """
    Geliştirilmiş Zemberek kök bulma mantığı ile metin ön işleme.
    """
    # TweetTokenizer ve stopword/noktalama filtreleme mantığı aynı kalır.
    tokenizer = TweetTokenizer(preserve_case=False, strip_handles=True, reduce_len=True)
    punctuation = set(string.punctuation)
    cleaned_data = []

    for text in dataset:
        if not isinstance(text, str) or not text.strip():
            cleaned_data.append("")
            continue

        tokens = tokenizer.tokenize(text.lower())

        filtered_tokens = []
        for word in tokens:
            is_stopword = word in stopwords_set
            is_punctuation = word in punctuation

            if not is_stopword and not is_punctuation:

                if word.isalpha():
                    # YENİ ve GÜVENİLİR kök bulma fonksiyonunu çağır
                    stemmed_word = zemberek_stem(word, morphology_instance)
                    filtered_tokens.append(stemmed_word)
                else:
                    # Emoticon, sayı, vs. olduğu gibi bırakılır.
                    filtered_tokens.append(word)

        cleaned_data.append(" ".join(filtered_tokens))

    return pd.Series(cleaned_data, index=dataset.index)

In [34]:
from jpype import JString
from typing import List


In [35]:
# Lütfen bu kodu çalıştırmadan önce, yukarıdaki fonksiyonları tanımladığınızdan emin olun.

def verify_zemberek_stemming(words: List[str], morphology_instance):
    results = []

    for word in words:
        # zemberek_stem_final'ı çağır
        stemmed_word = zemberek_stem(word.lower(), morphology_instance)

        results.append({
            "Original Word": word,
            "Stemmed Word (ZEM)": stemmed_word
        })

    return pd.DataFrame(results)

test_words = [
    "Gelmek",        # Infinitive
    "Geliyorum",     # I am coming
    "Geldiler",      # They came
    "Gelmiştik",     # We had come
    "Geleceksin",    # You will come
    "ürünlerimizin", # Noun inflection
    "başarılı",      # Adjective
    "almıştım",
    "almistim",
]

verification_df = verify_zemberek_stemming(test_words, morphology)

print("--- Zemberek Stemming Verification Test (Final Attempt) ---")
print(verification_df)

--- Zemberek Stemming Verification Test (Final Attempt) ---
   Original Word Stemmed Word (ZEM)
0         Gelmek                gel
1      Geliyorum                gel
2       Geldiler                gel
3      Gelmiştik                gel
4     Geleceksin                gel
5  ürünlerimizin               ürün
6       başarılı             başarı
7       almıştım                 al
8       almistim           almistim


Here we can see that Zemberek algorithm succesfully found the root of the words. Also, we observe that some words, such as almıştım, are recognized and their root is extracted succesfully, compared to when their letters are switched to English alphabet, as almistim. Therefore, we will refer to original dataset with only stopwords removed version to get better results in root finding.

In [37]:
review["STREM+ZEM"] = preprocess_turkish_text_with_zemberek(
    review["copy_comment"],
    turkish_stopwords_set,
    morphology # <--- The initialized Zemberek object from your setup
)

In [38]:
# Set display options to see full text
pd.set_option('display.max_colwidth', None)

# Select the original column and all new processed columns
comparison_columns = [
    "comment_english",
    "copy_comment",
    "STREM+FPS3",
    "STREM+FPS5",
    "STREM+FPS7",
    "STREM+ZEM" # Placeholder
]

# Display the comparison for a sample row (like 312)
print("--- Comparison of Turkish Preprocessing Methods (Row 312) ---")
print(review.loc[[312], comparison_columns].T)

--- Comparison of Turkish Preprocessing Methods (Row 312) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                 312
comment_english  urun guzel uzun zamandir takip ettigim urun kargo hizli parcalar eksik fabrikadan son paketleyen yada gelene ustune nikelajinda yuzunde darbeler tane diyecem 7-8 adet darbe cizik kullandikca komple nikelaji atmazsa iyidir bakalim kesimi nasil denemedim henuz paketi yeni actim firma yetkililerine selamlar dikkate alirsiniz umarim kalite kontrol yada paketlemeden once son bi urunlere bakarsaniz sagina soluna jelatin yaparsaniz cihaza
copy_comment                   ü

In [39]:
from collections import defaultdict

def calculate_text_statistics(text_series):
    """Calculates the total dictionary size and average document length."""

    term_counts = defaultdict(int)
    doc_lengths = []

    for text in text_series:
        if not text:
            doc_lengths.append(0)
            continue

        words = text.split()
        doc_lengths.append(len(words))

        # Count term frequencies for dictionary size
        for word in words:
            term_counts[word] += 1

    total_unique_terms = len(term_counts)
    total_documents = len(text_series)
    avg_terms_per_document = sum(doc_lengths) / total_documents if total_documents else 0

    return {
        "N_terms": total_unique_terms,
        "Avg_terms_per_document": avg_terms_per_document,
    }

# --- Calculation ---
results = {}

# List of columns to analyze (including the original Metin for baseline)
preprocessing_columns = [
    "copy_comment",
    "STREM+FPS3",
    "STREM+FPS5",
    "STREM+FPS7",
    "STREM+ZEM" # Placeholder
]

# Calculate statistics for each column
for col in preprocessing_columns:
    if col in review.columns:
        results[col] = calculate_text_statistics(review[col])
    elif col == "copy_comment":
        # Calculate original data (no preprocessing)
        results["copy_comment (None)"] = calculate_text_statistics(review[col].astype(str).str.lower())
    elif col == "STREM+ZEM":
        # Placeholder for Zemberek, based on Milliyet_9c_1k corpus statistics
        # This gives a realistic target for Zemberek's effectiveness.
        results["STREM+ZEM (Target)"] = {
            "N_terms": "Approx. 17,000",
            "Avg_terms_per_document": "Approx. 225",
        }

# Convert results dictionary to a DataFrame for clean display
comparison_df = pd.DataFrame(results).T

print("\n--- Numerical Comparison of Preprocessing Methods ---")
print(comparison_df)


--- Numerical Comparison of Preprocessing Methods ---
              N_terms  Avg_terms_per_document
copy_comment  19331.0                9.930587
STREM+FPS3     1960.0                9.885761
STREM+FPS5     6451.0                9.885893
STREM+FPS7    11842.0                9.885893
STREM+ZEM      9451.0                8.953790


### Checking for emojis in STREM+ZEM column

In [41]:
emoji_pattern_ascii = r'[:;=]-?[)D(P]'

emoji_reviews = review[
    review["STREM+ZEM"].str.contains(emoji_pattern_ascii, regex=True, na=False)
]

print(f"\n--- Reviews with ASCII Emoticons ({len(emoji_reviews)} found) ---")
print(emoji_reviews[["copy_comment", "STREM+ZEM"]].head())


--- Reviews with ASCII Emoticons (301 found) ---
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                               copy_comment  \
192                                                                                                                                                                                                                                                                                                         motoru güçlü . kesimi çekme

# Saving Preprocessed Dataset

In [42]:
review.to_csv("preprocessed_reviews.csv", index=False, encoding='utf-8')


In [None]:
files.download("data/preprocessed_reviews.csv")

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>