# Preprocessing Turkish Tweets with Zemberek

This notebook implements the preprocessing step of the project. It includes tokenization, lowercasing, stemming using Zemberek, and optional stop-word removal. The processed tweets will be saved for later steps, such as TF-IDF transformation and classification.
    

## Step 1: Import Required Libraries

In [1]:
import os
import pickle
from py4j.java_gateway import JavaGateway, GatewayParameters, launch_gateway
from helpers import load_turkish_stop_words_from_csv

## Step 2: Initialize Zemberek for Stemming

In [2]:
from jpype import JClass, JString, getDefaultJVMPath, startJVM, shutdownJVM, java

ZEMBEREK_PATH = "zemberek-full.jar"

# Initialize JVM with Zemberek
def initialize_zemberek():
    startJVM(getDefaultJVMPath(), '-ea', f'-Djava.class.path={ZEMBEREK_PATH}')
    TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
    morphology = TurkishMorphology.createWithDefaults()
    return morphology

## Step 3: Define Tokenization, Lowercasing, and Stemming

In [3]:
def preprocess_text(text, morphology, stop_words=[]):
    """
    Preprocesses Turkish text using Zemberek: tokenizes, lemmatizes (stemming),
    and removes stop words. Excludes tokens with lemma "UNK".

    Args:
        text (str): Input text to preprocess.
        morphology: Initialized TurkishMorphology instance.
        stop_words (list): List of stop words to exclude.

    Returns:
        list: List of preprocessed tokens (lemmas or original words).
    """
    # Check for empty or whitespace-only text
    if not text.strip():
        return []  # Return an empty list if input is empty

    # Analyze and disambiguate the text
    analysis = morphology.analyzeAndDisambiguate(text).bestAnalysis()
    tokens = []

    for word_analysis in analysis:
        try:
            # Extract lemma
            lemmas = word_analysis.getLemmas()
            lemma = str(lemmas[0]) if lemmas else "UNK"  # Use "UNK" if no lemma is available
        except Exception:
            # In case of an unexpected error, set lemma as "UNK"
            lemma = "UNK"

        # Exclude "UNK" and stop words
        if lemma != "UNK" and lemma not in stop_words:
            tokens.append(lemma)

    return tokens


## Step 4: Process All Tweets in the Dataset

In [4]:
def process_tweets(dataset_folder, morphology, stop_words=[]):
    """
    Processes all tweets in the dataset.

    Args:
        dataset_folder (str): Path to the dataset folder.
        morphology: Initialized TurkishMorphology instance for stemming.
        stop_words (list): List of stop words to exclude.

    Returns:
        dict: Dictionary with processed tweets by label.
    """
    processed_data = {}

    # Resolve absolute path for dataset folder
    dataset_folder = os.path.abspath(dataset_folder)

    if not os.path.exists(dataset_folder):
        raise FileNotFoundError(f"Dataset folder not found: {dataset_folder}")

    # Iterate over label folders
    for label in os.listdir(dataset_folder):
        label_folder = os.path.join(dataset_folder, label)

        # Skip files like `.DS_Store` and only process directories
        if not os.path.isdir(label_folder):
            print(f"Skipping non-directory item: {label_folder}")
            continue

        tweets = []

        # Iterate over files in label folder
        for filename in os.listdir(label_folder):
            file_path = os.path.join(label_folder, filename)

            # Skip non-text files if needed
            if not file_path.endswith(".txt"):
                print(f"Skipping non-text file: {file_path}")
                continue

            try:
                with open(file_path, "r", encoding="ISO-8859-9") as file:
                    text = file.read().strip()  # Strip leading/trailing whitespace
                    if not text:  # Skip empty files
                        print(f"Skipping empty file: {file_path}")
                        continue

                    # Use the updated preprocess_text function with morphology
                    processed = preprocess_text(text, morphology, stop_words)
                    tweets.append(processed)
            except FileNotFoundError:
                print(f"File not found: {file_path}")
                continue
            except UnicodeDecodeError:
                print(f"Encoding error in file: {file_path}")
                continue

        if tweets:  # Only add if there are tweets
            processed_data[label] = tweets
        else:
            print(f"No valid tweets found in folder: {label_folder}")

    return processed_data


## Step 5: Initialize and Run

In [5]:
# Initialize Zemberek
morphology = initialize_zemberek()


stop_words_csv_path = "data/stop_words.csv"
#custom_stop_words = ["turkcell", "fizy"]
stop_words = load_turkish_stop_words_from_csv(stop_words_csv_path)
print(stop_words)

['neden', '', 've', 'bir', 'her', 'şu', 'ama', 'ne', 'o', 'bu', 'çok', 'nasıl', 'çünkü', '...', '.']


I|14:06:23.848|Root lexicon created in 268 ms.                                                                     | DictionarySerializer#getDictionaryItems
I|14:06:23.850|Dictionary generated in 338 ms                                                                      | RootLexicon#defaultBinaryLexicon
I|14:06:24.028|Initialized in 554 ms.                                                                              | TurkishMorphology#createWithDefaults


In [6]:
# Path to the dataset
dataset_path = "data/raw_texts"

# Process the tweets
processed_tweets = process_tweets(dataset_path,morphology, stop_words)

# Print a sample
for label, tweets in processed_tweets.items():
    print(f"\nLabel: {label}")
    print("Sample Processed Tweet:", tweets[0])

# Save processed_tweets to a pickle file
with open("data/processed_tweets.pkl", "wb") as file:
    pickle.dump(processed_tweets, file)

print("Processed tweets saved to data/processed_tweets.pkl")
    

Skipping non-directory item: /Users/egebilge/Documents/Lectures/SE-4475 NLP/SE-4475-NLP-Assignment/data/raw_texts/.DS_Store
Skipping empty file: /Users/egebilge/Documents/Lectures/SE-4475 NLP/SE-4475-NLP-Assignment/data/raw_texts/3/969.txt

Label: 1
Sample Processed Tweet: ['biraz', 'daha', 'geliş', 'fizy', 'yi', 'tamamen', 'söz']

Label: 3
Sample Processed Tweet: ['16', 'aralık', 'turkcell', '!']

Label: 2
Sample Processed Tweet: ['a', 'şimdi', 'derin', 'nefes', 'al', ',', 'beyin', 'oksijen', 'git', 'ayrıca', 'turkcell', 'ben', 'bok', 'yap', '?']
Processed tweets saved to data/processed_tweets.pkl
