# Preprocessing Turkish Tweets with Zemberek

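This notebook implements the preprocessing step of the project. The planned pipeline includes tokenization, lowercasing, stemming using Zemberek, and optional stop-word removal; note that the code below currently performs sentence extraction, tokenization, and stop-word removal only — lowercasing and stemming are not yet applied. The processed tweets are saved for later steps, such as TF-IDF transformation and classification.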
    

## Step 1: Import Required Libraries

In [8]:
import os
import pickle
from py4j.java_gateway import JavaGateway, GatewayParameters, launch_gateway
from helpers import load_turkish_stop_words_from_csv

## Step 2: Initialize Zemberek for Stemming

In [2]:
def initialize_zemberek(jar_path="zemberek-full.jar"):
    """Start a JVM with Zemberek on its classpath and return its NLP helpers.

    A Py4J gateway process is launched with ``jar_path`` as the Java
    classpath, and the default Zemberek tokenizer and sentence extractor are
    looked up through it.

    Args:
        jar_path (str or os.PathLike): Java classpath handed to the gateway.
            Usually the path of the Zemberek "full" jar; several entries may
            be joined with ``os.pathsep``.

    Returns:
        tuple: ``(tokenizer, extractor)`` -- the ``TurkishTokenizer.DEFAULT``
        and ``TurkishSentenceExtractor.DEFAULT`` Java objects.

    Raises:
        FileNotFoundError: If a classpath entry does not exist on disk.
    """
    classpath = os.fspath(jar_path)

    # Fail fast on a wrong path. The JVM silently ignores missing classpath
    # entries, and Py4J then resolves `zemberek...` to a JavaPackage
    # placeholder, so the mistake would only surface later as a confusing
    # "'JavaPackage' object is not callable" error.
    # Wildcard entries ("lib/*") are expanded by the JVM, so they are skipped.
    missing = [
        entry
        for entry in classpath.split(os.pathsep)
        if entry and "*" not in entry and not os.path.exists(entry)
    ]
    if missing:
        raise FileNotFoundError(
            f"Zemberek jar not found: {', '.join(os.path.abspath(m) for m in missing)}"
        )

    # die_on_exit=True ties the JVM's lifetime to this Python process, so
    # restarting the kernel does not leave orphaned Java processes behind.
    port = launch_gateway(classpath=classpath, die_on_exit=True)
    gateway = JavaGateway(gateway_parameters=GatewayParameters(port=port))

    tokenization = gateway.jvm.zemberek.tokenization
    tokenizer = tokenization.TurkishTokenizer.DEFAULT
    extractor = tokenization.TurkishSentenceExtractor.DEFAULT
    return tokenizer, extractor

## Step 3: Define Tokenization, Lowercasing, and Stemming

In [3]:
def preprocess_text(text, tokenizer, extractor, stop_words=None):
    """
    Split text into sentences, tokenize each sentence, and drop stop words.

    Tokens are returned exactly as the tokenizer produces them: this function
    does not lowercase, stem, or lemmatize, so stop-word matching is
    case-sensitive.

    Args:
        text (str): Input text to preprocess.
        tokenizer: Object exposing ``tokenizeToStrings(sentence)``
            (e.g. Zemberek's TurkishTokenizer).
        extractor: Object exposing ``fromDocument(text)``
            (e.g. Zemberek's TurkishSentenceExtractor).
        stop_words (iterable of str, optional): Tokens to exclude. ``None``
            or an empty collection disables filtering.

    Returns:
        list: Tokens in document order with stop words removed. Empty or
        ``None`` input yields an empty list.
    """
    # Nothing to tokenize; also avoids a needless round trip to the JVM.
    if not text:
        return []

    # Build the lookup set once so each membership test is O(1) instead of a
    # scan over the whole stop-word list. An existing set is reused as-is.
    if isinstance(stop_words, (set, frozenset)):
        excluded = stop_words
    else:
        excluded = frozenset(() if stop_words is None else stop_words)

    return [
        token
        for sentence in extractor.fromDocument(text)
        for token in tokenizer.tokenizeToStrings(sentence)
        if token not in excluded
    ]



## Step 4: Process All Tweets in the Dataset

In [4]:
def process_tweets(dataset_folder, tokenizer, extractor, stop_words=None, encoding="ISO-8859-9"):
    """
    Processes all tweets in the dataset.

    The dataset folder is expected to contain one sub-directory per label,
    each holding ``.txt`` files. Every text file is read, passed through
    ``preprocess_text`` and collected under its label.

    Labels and files are visited in sorted name order, so the result is
    reproducible across runs and platforms (``os.listdir`` alone returns
    entries in arbitrary order).

    Args:
        dataset_folder (str): Path to the dataset folder.
        tokenizer: Instance of TurkishTokenizer.
        extractor: Instance of TurkishSentenceExtractor.
        stop_words (iterable of str, optional): Stop words to exclude.
            ``None`` disables filtering.
        encoding (str): Text encoding of the tweet files. Defaults to
            ISO-8859-9, the value this function previously hard-coded.

    Returns:
        dict: Maps each label (sub-directory name) to a list of processed
        tweets, where each tweet is a list of tokens. Labels with no
        readable ``.txt`` file are omitted.

    Raises:
        FileNotFoundError: If ``dataset_folder`` does not exist.
    """
    # Resolve absolute path for dataset folder
    dataset_folder = os.path.abspath(dataset_folder)

    if not os.path.exists(dataset_folder):
        raise FileNotFoundError(f"Dataset folder not found: {dataset_folder}")

    # Convert once to a set so the per-token stop-word test is O(1) for every
    # file instead of rescanning a list.
    if stop_words is None:
        stop_words = frozenset()
    elif not isinstance(stop_words, (set, frozenset)):
        stop_words = frozenset(stop_words)

    processed_data = {}

    # Iterate over label folders
    for label in sorted(os.listdir(dataset_folder)):
        label_folder = os.path.join(dataset_folder, label)

        # Skip files like `.DS_Store` and only process directories
        if not os.path.isdir(label_folder):
            print(f"Skipping non-directory item: {label_folder}")
            continue

        tweets = []

        # Iterate over files in label folder
        for filename in sorted(os.listdir(label_folder)):
            file_path = os.path.join(label_folder, filename)

            # Skip non-text files if needed
            if not file_path.endswith(".txt"):
                print(f"Skipping non-text file: {file_path}")
                continue

            # Keep the try block limited to file I/O: the handle is closed
            # before the (slower) JVM call, and errors raised by
            # preprocessing are not mistaken for read errors.
            try:
                with open(file_path, "r", encoding=encoding) as file:
                    text = file.read()
            except FileNotFoundError:
                print(f"File not found: {file_path}")
                continue
            except UnicodeDecodeError:
                print(f"Encoding error in file: {file_path}")
                continue

            tweets.append(preprocess_text(text, tokenizer, extractor, stop_words))

        if tweets:  # Only add if there are tweets
            processed_data[label] = tweets
        else:
            print(f"No valid tweets found in folder: {label_folder}")

    return processed_data

## Step 5: Example Usage

In [None]:
# Initialize Zemberek: start the Py4J gateway (using the default jar path,
# "zemberek-full.jar") and obtain the tokenizer and sentence extractor that
# the processing cell below relies on.
tokenizer, extractor = initialize_zemberek()

# CSV file the stop-word list is loaded from (relative to the working directory).
stop_words_csv_path = "data/stop_words.csv"
# Example of project-specific stop words (currently disabled):
#custom_stop_words = ["turkcell", "fizy"]
stop_words = load_turkish_stop_words_from_csv(stop_words_csv_path)
# Print the loaded stop words as a quick sanity check.
print(stop_words)

In [None]:

# Path to the dataset. process_tweets treats every sub-directory as a label
# and reads the .txt files inside it.
dataset_path = "data/raw_texts"

# Process the tweets (result: dict mapping label -> list of token lists)
processed_tweets = process_tweets(dataset_path, tokenizer, extractor, stop_words)

# Print a sample per label. tweets[0] is safe here because process_tweets only
# stores labels for which at least one tweet was processed.
for label, tweets in processed_tweets.items():
    print(f"\nLabel: {label}")
    print("Sample Processed Tweet:", tweets[0])

# Save processed_tweets to a pickle file so later steps can reload it.
# NOTE: only unpickle files you trust -- pickle.load can execute arbitrary code.
with open("data/processed_tweets.pkl", "wb") as file:
    pickle.dump(processed_tweets, file)

print("Processed tweets saved to data/processed_tweets.pkl")
    