# Preprocessing Turkish Tweets with Zemberek

This notebook implements the preprocessing step of the project. It includes tokenization, lowercasing, stemming using Zemberek, and optional stop-word removal. The processed tweets will be saved for later steps, such as TF-IDF transformation and classification.
    

## Step 1: Import Required Libraries

In [1]:
import sys
import os

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)
import pickle
import logging
import re
import emoji
from jpype import JClass, JString, getDefaultJVMPath, startJVM
from helpers import load_turkish_stop_words_from_csv

## Step 2: Initialize Zemberek for Stemming

In [2]:
# Import Zemberek
ZEMBEREK_PATH = "../zemberek-full.jar"

# Configure logging to print the final summary
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')

# Step 2: Initialize Zemberek

def initialize_zemberek():
    startJVM(getDefaultJVMPath(), '-ea', f'-Djava.class.path={ZEMBEREK_PATH}')
    TurkishMorphology = JClass('zemberek.morphology.TurkishMorphology')
    morphology = TurkishMorphology.createWithDefaults()
    return morphology

print("Zemberek initialized successfully.")

Zemberek initialized successfully.


## Step 3: Define Tokenization, Lowercasing, and Stemming

In [None]:
def advanced_preprocess_text(text, morphology, stop_words=[], custom_entities=[]):
    """
    Advanced preprocessing for Turkish text using Zemberek: tokenizes, lemmatizes, removes stop words,
    and handles mentions, hashtags, URLs, and emojis.

    Args:
        text (str): Input text to preprocess.
        morphology: Initialized TurkishMorphology instance.
        stop_words (list): List of stop words to exclude.
        custom_entities (list): List of custom named entities to be removed.

    Returns:
        list: List of preprocessed tokens.
        dict: Dictionary containing the word/token count summary.
    """
    initial_word_count = len(text.split())

    # Step 1: Lowercase Conversion
    text = text.lower()

    # Step 2: Remove URLs, Mentions, Numbers, and Excess Whitespace
    text = re.sub(r'http\S+|www\S+|@[A-Za-z0-9_]+|\d+', '', text)
    text = re.sub(r'\s+', ' ', text).strip()

    # Step 3: Remove Hashtags (Keep text without '#')
    text = re.sub(r'#', '', text)

    # Step 4: Remove Custom Entities
    for entity in custom_entities:
        text = re.sub(r'\b{}\b'.format(re.escape(entity)), '', text)

    # Step 5: Handle Emojis (Convert to Descriptive Text)
    text = emoji.demojize(text)

    # Step 6: Tokenization (Basic Splitting)
    tokens = text.split()

    # Step 7: Remove Short or Invalid Tokens
    tokens = [t for t in tokens if len(t) > 2 and t.isalpha()]

    # Step 8: Stop-word Removal
    token_count_before_stop_word_removal = len(tokens)
    tokens = [t for t in tokens if t not in stop_words]
    token_count_after_stop_word_removal = len(tokens)

    # Step 9: Lemmatization with Zemberek
    lemmatized_tokens = []
    for token in tokens:
        # Perform analysis
        analysis = morphology.analyzeAndDisambiguate(JString(token)).bestAnalysis()

        # Extract all possible lemmas
        lemmas = [str(item.getLemmas()[0]) for item in analysis if item.getLemmas()]
        if lemmas:
            lemmatized_tokens.append(lemmas[0])  # Append the first lemma as a Python string
        else:
            lemmatized_tokens.append(token)  # Append the original token if no lemma is found

    final_token_count = len(lemmatized_tokens)

    # Step 10: Collect Summary Report
    report = {
        "initial_word_count": initial_word_count,
        "token_count_before_stop_word_removal": token_count_before_stop_word_removal,
        "token_count_after_stop_word_removal": token_count_after_stop_word_removal,
        "final_token_count": final_token_count
    }

    return lemmatized_tokens, report


## Step 4: Process All Tweets in the Dataset

In [None]:
def process_tweets(dataset_folder, morphology, stop_words=[], custom_entities=[]):
    """
    Processes all tweets in the dataset.

    Args:
        dataset_folder (str): Path to the dataset folder.
        morphology: Initialized TurkishMorphology instance for stemming.
        stop_words (list): List of stop words to exclude.
        custom_entities (list): List of custom named entities to be removed.

    Returns:
        dict: Dictionary with processed tweets by label.
    """
    processed_data = {}
    summary_reports = []

    # Resolve absolute path for dataset folder
    dataset_folder = os.path.abspath(dataset_folder)

    if not os.path.exists(dataset_folder):
        raise FileNotFoundError(f"Dataset folder not found: {dataset_folder}")

    # Iterate over label folders
    for label in os.listdir(dataset_folder):
        label_folder = os.path.join(dataset_folder, label)

        # Skip non-directory entries
        if not os.path.isdir(label_folder):
            continue

        tweets = []

        # Iterate over files in the label folder
        for filename in os.listdir(label_folder):
            file_path = os.path.join(label_folder, filename)

            # Skip non-text files
            if not file_path.endswith(".txt"):
                continue

            try:
                with open(file_path, "r", encoding="ISO-8859-9") as file:
                    text = file.read().strip()
                    if not text:
                        continue

                    # Preprocess text
                    processed, report = advanced_preprocess_text(text, morphology, stop_words, custom_entities)

                    # Convert all tokens to Python-native strings
                    tweets.append([str(token) for token in processed])
                    summary_reports.append(report)

            except (FileNotFoundError, UnicodeDecodeError) as e:
                logging.error(f"Error in file: {file_path}, {str(e)}")
                continue

        if tweets:
            # Ensure label is a Python-native string
            processed_data[str(label)] = tweets

    # Generate summary report after processing all tweets
    total_initial_words = sum([r["initial_word_count"] for r in summary_reports])
    total_tokens_before_stop_word_removal = sum([r["token_count_before_stop_word_removal"] for r in summary_reports])
    total_tokens_after_stop_word_removal = sum([r["token_count_after_stop_word_removal"] for r in summary_reports])
    total_final_tokens = sum([r["final_token_count"] for r in summary_reports])

    logging.info(f"Summary Report for Preprocessing:")
    logging.info(f"Total initial word count: {total_initial_words}")
    logging.info(f"Total tokens before stop-word removal: {total_tokens_before_stop_word_removal}")
    logging.info(f"Total tokens after stop-word removal: {total_tokens_after_stop_word_removal}")
    logging.info(f"Total final token count after lemmatization: {total_final_tokens}")

    return processed_data


## Step 5: Initialize and Run

In [5]:
# Initialize Zemberek
morphology = initialize_zemberek()

# Load stop words
stop_words_csv_path = "../data/stop_words.csv"
stop_words = load_turkish_stop_words_from_csv(stop_words_csv_path)

# Custom entities to filter out
custom_entities = ['nokia', 'panasonic', 'pepsi', 'istanbul', 'lig', 'kocaeli']

I|21:38:05.162|Root lexicon created in 230 ms.                                                                     | DictionarySerializer#getDictionaryItems
I|21:38:05.163|Dictionary generated in 301 ms                                                                      | RootLexicon#defaultBinaryLexicon
I|21:38:05.334|Initialized in 506 ms.                                                                              | TurkishMorphology#createWithDefaults


In [6]:
# Path to the dataset
dataset_path = "../data/raw_texts"

# Process the tweets
processed_tweets = process_tweets(dataset_path, morphology, stop_words, custom_entities)

# Save processed tweets to a pickle file
with open("../data/processed_tweets.pkl", "wb") as file:
    pickle.dump(processed_tweets, file)

print("Processed tweets saved to data/processed_tweets.pkl")

2024-11-28 21:38:07,421 - INFO - Summary Report for Preprocessing:
2024-11-28 21:38:07,421 - INFO - Total initial word count: 35370
2024-11-28 21:38:07,421 - INFO - Total tokens before stop-word removal: 25098
2024-11-28 21:38:07,421 - INFO - Total tokens after stop-word removal: 19535
2024-11-28 21:38:07,422 - INFO - Total final token count after lemmatization: 19535


PicklingError: Can't pickle <java class 'java.lang.String'>: attribute lookup java.lang.String on jpype._jstring failed