# Preprocessing Turkish Tweets with Zemberek

This notebook implements the preprocessing step of the project: text cleaning, lowercasing, whitespace tokenization, and stemming with Zemberek. A stop-word list is also loaded, but stop-word removal is optional and is not yet applied inside the preprocessing function. The processed tweets and their labels are saved for later steps, such as TF-IDF transformation and classification.
    

## Step 1: Import Required Libraries

In [1]:
import sys
import os

# Add the project root directory to the Python path
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
sys.path.append(project_root)
from zemberek import TurkishMorphology
import pickle
import logging
import re
import emoji
from jpype import JClass, JString, getDefaultJVMPath, startJVM
from helpers import load_turkish_stop_words_from_csv

## Step 2: Define Tokenization, Lowercasing, and Stemming

In [None]:
def preprocess_text(text, morphology):
    """Clean a raw tweet and reduce each remaining word to its stem.

    The steps run in this order: strip URLs and @mentions, strip punctuation,
    strip digits, collapse whitespace, lowercase, drop tokens that are not
    purely alphabetic or are shorter than three characters, then stem.

    Args:
        text (str): Raw text to clean.
        morphology: Object whose ``analyze(word)`` returns a result exposing
            ``analysis_results``; each entry must provide ``get_stem()``.

    Returns:
        str: The processed tokens joined by single spaces.
    """
    # Remove URLs and mentions FIRST. If punctuation is stripped beforehand the
    # "@" is already gone and the handle survives as an ordinary word.
    # The lookbehind keeps the "@" inside e-mail-like tokens from being
    # treated as the start of a mention.
    text = re.sub(r'http\S+|www\S+|(?<!\w)@[A-Za-z0-9_]+', '', text)

    # Remove punctuation (anything that is neither a word char nor whitespace)
    text = re.sub(r"[^\w\s]", "", text)

    # Remove numbers
    text = re.sub(r'\d+', '', text)

    # Normalize whitespace
    text = re.sub(r'\s+', ' ', text).strip()

    # Convert to lowercase. str.lower() turns "İ" (U+0130) into "i" followed by
    # a combining dot (U+0307); that extra mark makes isalpha() fail and the
    # whole word would be discarded below, so map it to a plain "i" first.
    text = text.replace("İ", "i").lower()

    # Keep only alphabetic words longer than 2 characters
    words = [w for w in text.split() if len(w) > 2 and w.isalpha()]

    processed_words = []
    for word in words:
        results = morphology.analyze(word)
        if results.analysis_results:
            # The analyzer recognised the word: use the stem of its first analysis.
            processed_words.append(results.analysis_results[0].get_stem())
        else:
            # No analysis available: keep the cleaned word unchanged.
            processed_words.append(word)

    return ' '.join(processed_words)

## Step 3: Process All Tweets in the Dataset

In [3]:
def process_dataset(dataset_folder, morphology, encoding="ISO-8859-9"):
    """
    Preprocess every ``.txt`` file found one level below ``dataset_folder``.

    Each immediate sub-directory name is used as the label for the files it
    contains. Files whose content is empty after stripping are skipped.

    Args:
        dataset_folder (str): Path to the dataset folder.
        morphology: Zemberek morphology instance, forwarded to
            ``preprocess_text``.
        encoding (str): Text encoding used to read the files. Defaults to
            "ISO-8859-9", the value previously hard-coded here.

    Returns:
        tuple: (list of preprocessed texts, list of labels), index-aligned.
    """
    texts, labels = [], []

    # os.listdir yields entries in arbitrary order; sorting makes the order of
    # the returned lists (and anything saved from them) reproducible.
    for label in sorted(os.listdir(dataset_folder)):
        label_folder = os.path.join(dataset_folder, label)
        if not os.path.isdir(label_folder):
            # Stray files next to the label folders are ignored.
            continue

        for filename in sorted(os.listdir(label_folder)):
            if not filename.endswith(".txt"):
                continue

            file_path = os.path.join(label_folder, filename)
            with open(file_path, "r", encoding=encoding) as handle:
                text = handle.read().strip()

            # The file is closed before the (slower) preprocessing runs.
            if not text:
                continue

            texts.append(preprocess_text(text, morphology))
            labels.append(label)

    logging.info(f"Processed {len(texts)} documents with {len(set(labels))} unique labels.")
    return texts, labels


## Step 4: Initialize and Run

In [4]:
# Initialize Zemberek (the default morphology; creation is logged by the library)
morphology = TurkishMorphology.create_with_defaults()

# Load stop words
# NOTE(review): stop_words is loaded here but is not passed to process_dataset
# below, so it has no effect on the output of this cell.
stop_words_csv_path = "../data/stop_words.csv"
stop_words = load_turkish_stop_words_from_csv(stop_words_csv_path)

# Define custom entities
# NOTE(review): custom_entities is likewise not used anywhere in this cell.
custom_entities = ["nokia", "panasonic", "pepsi", "istanbul", "lig", "kocaeli"]

# Path to the dataset (one sub-folder per label, see process_dataset)
dataset_folder = "../data/raw_texts"

# Process tweets: `text` is the list of preprocessed documents and `labels`
# the index-aligned list of label names.
text,labels = process_dataset(dataset_folder, morphology)

# Save texts and labels as a dictionary in a pickle file
# (keys: "texts" and "labels")
output_path = "../data/processed_tweets.pkl"
with open(output_path, "wb") as file:
    pickle.dump({"texts": text, "labels": labels}, file)

logging.info(f"Processed tweets and labels saved to {output_path}")

2024-11-28 23:55:19,636 - zemberek.morphology.turkish_morphology - INFO
Msg: TurkishMorphology instance initialized in 3.315598964691162



TypeError: preprocess_text() takes 1 positional argument but 2 were given