In [1]:
import pandas as pd
import pyarrow.parquet as pq
import re
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
import joblib
import numpy as np
import os
from scipy.sparse import save_npz, vstack, hstack

# Make sure the NLTK English stopword corpus is available; it is fetched only
# when the lookup fails.
try:
    stopwords.words("english")
except LookupError:
    print("NLTK stopwords indiriliyor...")
    nltk.download("stopwords")

# Input dataset and root folder for every processed artifact.
INPUT_PARQUET_PATH = "../data/raw/labeled_reviews.parquet"
OUTPUT_DIR = "../data/processed/"

# One sub-folder per dataset variant; both are created up front.
TEXT_ONLY_DIR, HYBRID_DIR = (
    os.path.join(OUTPUT_DIR, subdir) for subdir in ("text_data", "hybrid")
)
for _variant_dir in (TEXT_ONLY_DIR, HYBRID_DIR):
    os.makedirs(_variant_dir, exist_ok=True)

# Location of the cached, fitted TF-IDF vectorizer.
VECTORIZER_PATH = os.path.join(OUTPUT_DIR, "tfidf_vectorizer.joblib")

# Stopwords are kept in a set so membership checks in clean_text are cheap.
stop_words = {*stopwords.words("english")}
def clean_text(text):
    """Normalize a review string for TF-IDF vectorization.

    The text is lower-cased, ``<...>`` spans are removed, every character
    other than ``a``-``z`` and whitespace is dropped, and tokens found in the
    module-level ``stop_words`` set are filtered out.

    Args:
        text: The raw text; must be a ``str`` (``.lower()`` is called on it).

    Returns:
        The remaining tokens joined by single spaces.
    """
    lowered = text.lower()
    without_tags = re.sub(r'<.*?>', '', lowered)
    letters_only = re.sub(r'[^a-z\s]', '', without_tags)
    return " ".join(
        token for token in letters_only.split() if token not in stop_words
    )

def create_behavioral_features(df):
    """Build the dense behavioral feature matrix for a chunk of reviews.

    The ``helpful`` column holds a ``[helpful_votes, total_votes]`` pair per
    row, either as a string such as ``"[1, 2]"`` or as an actual sequence.
    String values are parsed with ``ast.literal_eval`` instead of ``eval`` so
    that data read from disk can never execute code.

    Side effect: the columns ``helpful_votes``, ``total_votes``,
    ``helpfulness_ratio`` and ``text_length`` are added to ``df`` in place.

    Args:
        df: DataFrame with the columns ``helpful``, ``overall`` and
            ``full_text``.

    Returns:
        A NumPy array with one row per input row and the columns
        ``overall``, ``helpfulness_ratio``, ``text_length`` (missing values
        replaced by 0).

    Raises:
        ValueError, SyntaxError: if a string in ``helpful`` is not a Python
            literal.
    """
    # Function-scope import: `ast` is not part of the notebook's import cell.
    import ast

    def _parse_votes(raw):
        """Return ``(helpful_votes, total_votes)`` for one ``helpful`` cell."""
        # Missing entries (None / NaN) are treated as "no votes at all".
        if raw is None or (isinstance(raw, float) and raw != raw):
            return (0, 0)
        if isinstance(raw, str):
            raw = ast.literal_eval(raw)
        return (raw[0], raw[1])

    vote_pairs = df["helpful"].apply(_parse_votes)
    df['helpful_votes'] = vote_pairs.apply(lambda pair: pair[0])
    df['total_votes'] = vote_pairs.apply(lambda pair: pair[1])
    # The small constant keeps the ratio defined when a review has no votes.
    df['helpfulness_ratio'] = df['helpful_votes'] / (df['total_votes'] + 0.001)

    df['text_length'] = df['full_text'].str.len()

    feature_df = df[['overall', 'helpfulness_ratio', 'text_length']].fillna(0)
    return feature_df.to_numpy()

# Last statement of the setup cell: reaching it means the imports, path
# constants and helper definitions above all executed without raising.
print("Kurulum tamamlandÄ±.")

Kurulum tamamlandı.


In [2]:
# Fit the TF-IDF vectorizer once and cache it on disk; later runs reuse the
# cached file and skip the fit entirely.
if os.path.exists(VECTORIZER_PATH):
    print("VektÃ¶rleÅŸtirici zaten mevcut, bu adÄ±m atlanÄ±yor.")
else:
    print("VektÃ¶rleÅŸtirici oluÅŸturuluyor...")

    # Only the first batch (up to 200000 rows) is used to fit the vocabulary.
    parquet_file_vec = pq.ParquetFile(INPUT_PARQUET_PATH)
    sample_batch_vec = next(parquet_file_vec.iter_batches(batch_size=200000))
    df_sample = sample_batch_vec.to_pandas()

    # Summary and review body are concatenated, then cleaned with clean_text.
    summary_text = df_sample["summary"].astype(str)
    review_text = df_sample["reviewText"].astype(str)
    df_sample["full_text"] = summary_text + " " + review_text
    df_sample["cleaned_text"] = df_sample["full_text"].map(clean_text)

    # Unigrams and bigrams, capped at 20000 features.
    tfidf_vectorizer = TfidfVectorizer(
        max_features=20000, ngram_range=(1, 2)
    ).fit(df_sample["cleaned_text"])

    joblib.dump(tfidf_vectorizer, VECTORIZER_PATH)
    print(f"VektÃ¶rleÅŸtirici '{VECTORIZER_PATH}' adresine kaydedildi.")

Vektörleştirici zaten mevcut, bu adım atlanıyor.


In [3]:
# Number of reviews per reviewer over the whole input file; only the
# reviewerID column is read from the parquet file.
all_reviewer_ids = pd.read_parquet(INPUT_PARQUET_PATH, columns=["reviewerID"])
reviewer_counts = all_reviewer_ids.loc[:, "reviewerID"].value_counts()

In [None]:
# Paths of the final artifacts for each dataset variant, ordered as
# [X_train, X_test, y_train, y_test]; the save cell indexes into these lists.
final_text_files = [
    os.path.join(TEXT_ONLY_DIR, "X_train.npz"), os.path.join(TEXT_ONLY_DIR, "X_test.npz"),
    os.path.join(TEXT_ONLY_DIR, "y_train.npy"), os.path.join(TEXT_ONLY_DIR, "y_test.npy")
]
final_hybrid_files = [
    os.path.join(HYBRID_DIR, "X_train.npz"), os.path.join(HYBRID_DIR, "X_test.npz"),
    os.path.join(HYBRID_DIR, "y_train.npy"), os.path.join(HYBRID_DIR, "y_test.npy")
]

text_files_exist = all(os.path.exists(p) for p in final_text_files)
hybrid_files_exist = all(os.path.exists(p) for p in final_hybrid_files)

# Nothing to do when every artifact of both variants is already on disk.
if text_files_exist and hybrid_files_exist:
    print("âœ… TÃ¼m text-only ve hybrid veri setleri zaten mevcut. HiÃ§bir iÅŸlem yapÄ±lmadÄ±.")
else:
    print("Bir veya daha fazla veri seti eksik. Veri iÅŸleme adÄ±mlarÄ± baÅŸlatÄ±lÄ±yor...")

    # Read only the label column so the split is computed without loading text.
    print("Etiketler okunuyor ve eÄŸitim/test indeksleri oluÅŸturuluyor...")
    y_final = pd.read_parquet(INPUT_PARQUET_PATH, columns=['class'])['class'].to_numpy()
    indices = np.arange(len(y_final))

    train_indices, test_indices, _y_train_shuffled, _y_test_shuffled = train_test_split(
        indices, y_final, test_size=0.20, random_state=42, stratify=y_final
    )
    # train_test_split returns its outputs in shuffled order, whereas the loop
    # below collects feature rows in ascending file order. Sort the indices and
    # rebuild the labels from them so that row k of X_train matches y_train[k].
    # Datasets saved by an earlier version of this cell have misaligned labels
    # and should be regenerated.
    train_indices = np.sort(train_indices)
    test_indices = np.sort(test_indices)
    y_train = y_final[train_indices]
    y_test = y_final[test_indices]

    # Boolean membership mask over all rows; each batch takes a slice of it.
    is_train_row = np.zeros(len(y_final), dtype=bool)
    is_train_row[train_indices] = True

    print("Veri parÃ§a parÃ§a iÅŸleniyor ve setlere ayrÄ±lÄ±yor...")
    tfidf_vectorizer = joblib.load(VECTORIZER_PATH)
    parquet_file = pq.ParquetFile(INPUT_PARQUET_PATH)
    # Re-read here so this cell does not depend on the previous cell's state.
    all_reviewer_ids = pd.read_parquet(INPUT_PARQUET_PATH, columns=['reviewerID'])
    reviewer_counts = all_reviewer_ids['reviewerID'].value_counts()

    # Accumulators exist only for the variants that still have to be written.
    if not text_files_exist:
        X_train_text_chunks, X_test_text_chunks = [], []
    if not hybrid_files_exist:
        X_train_hybrid_chunks, X_test_hybrid_chunks = [], []

    current_row_index = 0
    batch_iterator = parquet_file.iter_batches(batch_size=100000)
    for i, batch in enumerate(batch_iterator):
        print(f"{i+1}", end="-")
        df_chunk = batch.to_pandas()

        # Text preparation shared by both dataset variants.
        df_chunk['full_text'] = df_chunk['summary'].astype(str) + ' ' + df_chunk['reviewText'].astype(str)
        df_chunk['cleaned_text'] = df_chunk['full_text'].apply(clean_text)

        chunk_end = current_row_index + len(df_chunk)
        train_mask = is_train_row[current_row_index:chunk_end]
        test_mask = ~train_mask

        # Always vectorize the current batch. Both variants need it, and
        # reusing a matrix left over from a previous batch (all full batches
        # have the same row count) would silently attach the wrong text
        # features to every later batch.
        X_chunk_tfidf = tfidf_vectorizer.transform(df_chunk['cleaned_text'])

        if not text_files_exist:
            X_train_text_chunks.append(X_chunk_tfidf[train_mask])
            X_test_text_chunks.append(X_chunk_tfidf[test_mask])

        if not hybrid_files_exist:
            X_chunk_behavioral_part1 = create_behavioral_features(df_chunk)
            # Reviewers absent from the counts table default to a single review.
            df_chunk['reviewer_review_count'] = df_chunk['reviewerID'].map(reviewer_counts).fillna(1)
            X_chunk_behavioral_part2 = df_chunk[['reviewer_review_count']].to_numpy()
            # Request CSR explicitly: with dense blocks hstack otherwise
            # returns COO, which cannot be row-indexed with a mask.
            X_chunk_hybrid = hstack(
                [X_chunk_tfidf, X_chunk_behavioral_part1, X_chunk_behavioral_part2],
                format="csr",
            )

            X_train_hybrid_chunks.append(X_chunk_hybrid[train_mask])
            X_test_hybrid_chunks.append(X_chunk_hybrid[test_mask])

        current_row_index = chunk_end

Bir veya daha fazla veri seti eksik. Veri işleme adımları başlatılıyor...
-> Part 1 -> Part 2 

KeyboardInterrupt: 

In [None]:
# Stack the per-batch chunks collected by the processing cell and write the
# missing dataset variants to disk. All statements sit at one consistent
# indentation level; the previous mix of 0/8/4-space indents (left over from
# an enclosing `else:` block) made this cell an IndentationError.
if not text_files_exist:
    print("\nSadece metin verisi birleÅŸtiriliyor ve kaydediliyor...")
    X_train_text = vstack(X_train_text_chunks)
    X_test_text = vstack(X_test_text_chunks)
    # Refuse to save when the chunk lists are incomplete (for example the
    # processing loop was interrupted): features and labels would not match.
    if X_train_text.shape[0] != len(y_train) or X_test_text.shape[0] != len(y_test):
        raise ValueError(
            "Text-only feature rows do not match the label counts "
            f"(train {X_train_text.shape[0]} vs {len(y_train)}, "
            f"test {X_test_text.shape[0]} vs {len(y_test)}); "
            "re-run the processing cell to completion before saving."
        )
    save_npz(final_text_files[0], X_train_text)
    save_npz(final_text_files[1], X_test_text)
    np.save(final_text_files[2], y_train)
    np.save(final_text_files[3], y_test)
    print("âœ… Sadece metin verisi kaydedildi.")

if not hybrid_files_exist:
    print("\nHibrit veri birleÅŸtiriliyor ve kaydediliyor...")
    X_train_hybrid = vstack(X_train_hybrid_chunks)
    X_test_hybrid = vstack(X_test_hybrid_chunks)
    if X_train_hybrid.shape[0] != len(y_train) or X_test_hybrid.shape[0] != len(y_test):
        raise ValueError(
            "Hybrid feature rows do not match the label counts "
            f"(train {X_train_hybrid.shape[0]} vs {len(y_train)}, "
            f"test {X_test_hybrid.shape[0]} vs {len(y_test)}); "
            "re-run the processing cell to completion before saving."
        )
    save_npz(final_hybrid_files[0], X_train_hybrid)
    save_npz(final_hybrid_files[1], X_test_hybrid)
    np.save(final_hybrid_files[2], y_train)
    np.save(final_hybrid_files[3], y_test)
    print("âœ… Hibrit veri kaydedildi.")

# Mirrors the original placement inside the `else:` branch: the completion
# message is shown only when at least one variant had to be produced.
if not (text_files_exist and hybrid_files_exist):
    print("\nðŸš€ TÃ¼m iÅŸlemler tamamlandÄ±.")