In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
from sentence_transformers import SentenceTransformer
from sklearn.decomposition import TruncatedSVD
from scipy import sparse
import joblib
import nltk
from nltk.corpus import stopwords, opinion_lexicon
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Download the required NLTK resources once (uncomment for the first run only)
# nltk.download('punkt')
# nltk.download('wordnet')
# nltk.download('omw-1.4')
# nltk.download('stopwords')
# nltk.download('opinion_lexicon')

# Shared text-processing resources, built once and reused by the helper
# functions below.

# Positive / negative vocabularies from the NLTK opinion lexicon, stored as
# sets so membership tests are cheap.
positive_words = set(opinion_lexicon.positive())
negative_words = set(opinion_lexicon.negative())

# English stop words that are filtered out of every review.
stop_words = set(stopwords.words('english'))

# WordNet lemmatizer instance used by lemmatize_text.
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    """Return *text* as a space-joined string of lemmatized tokens.

    The text is lower-cased and tokenized; tokens that are not purely
    alphabetic or that are stop words are discarded, and every remaining
    token is lemmatized as a verb (``pos='v'``).
    """
    lemmas = []
    for token in word_tokenize(text.lower()):
        # Skip punctuation, numbers, mixed tokens and stop words.
        if not token.isalpha() or token in stop_words:
            continue
        lemmas.append(lemmatizer.lemmatize(token, 'v'))
    return ' '.join(lemmas)

def count_sentiment_words(text):
    """Count opinion-lexicon hits in *text*.

    The text is lower-cased and tokenized; only alphabetic tokens that are
    not stop words are considered.

    Returns:
        A ``(pos_count, neg_count)`` tuple with the number of tokens found in
        ``positive_words`` and ``negative_words`` respectively.
    """
    pos_count = 0
    neg_count = 0
    for token in word_tokenize(text.lower()):
        if not token.isalpha() or token in stop_words:
            continue
        # Two independent checks (not elif): a token that appears in both
        # vocabularies is counted in both totals.
        if token in positive_words:
            pos_count += 1
        if token in negative_words:
            neg_count += 1
    return pos_count, neg_count

# Read the merged review/metadata export (semicolon-separated CSV).
df = pd.read_csv("../raw/merged_reviews_metadata_renamed.csv", sep=";")

# Columns carried into the feature pipeline.
selected_columns = [
    'review_rating',
    'review_title',
    'review_text',
    'helpful_votes',
    'is_verified_purchase',
    'product_title',
    'product_average_rating',
    'product_rating_count',
    'products_bought_together',
]

# Restrict to the "Musical Instruments" category and the selected columns in
# one step, then discard reviews that have no title or no body text.
is_musical_instrument = df['product_main_category'] == 'Musical Instruments'
df_clean = df.loc[is_musical_instrument, selected_columns].dropna(
    subset=['review_title', 'review_text']
)

# Lemmatize title and body separately, then join them into one text field
# that all text-based features are derived from.
df_clean['review_title_lemmatized'] = df_clean['review_title'].apply(lemmatize_text)
df_clean['review_text_lemmatized'] = df_clean['review_text'].apply(lemmatize_text)
df_clean['full_review'] = df_clean['review_title_lemmatized'] + ' ' + df_clean['review_text_lemmatized']

# Length statistics of the lemmatized review.
df_clean['word_count'] = df_clean['full_review'].apply(lambda x: len(x.split()))
df_clean['char_count'] = df_clean['full_review'].apply(len)
# A review whose tokens were all filtered out has word_count == 0 (only the
# joining space remains), so this division yields inf for that row.
df_clean['avg_word_length'] = df_clean['char_count'] / df_clean['word_count']

# Sentiment features: collect the (positive, negative) tuples in one pass and
# assign both columns from a single integer array. This avoids constructing
# one pd.Series per review, and the reshape keeps the two-column layout even
# when the frame is empty.
sentiment_counts = np.array(
    [count_sentiment_words(review) for review in df_clean['full_review']],
    dtype=np.int64,
).reshape(-1, 2)
df_clean['positive_word_count'] = sentiment_counts[:, 0]
df_clean['negative_word_count'] = sentiment_counts[:, 1]

# Drop products_bought_together (not enough rows)
df_clean = df_clean.drop(columns=['products_bought_together'])

# log(1 + x) transform of the helpful-vote counts.
log_votes = np.log1p(df_clean['helpful_votes'])
df_clean['log_helpful_votes'] = log_votes

# Numeric columns that are checked, imputed and standardized below.
numerical_features = [
    'word_count',
    'char_count',
    'avg_word_length',
    'positive_word_count',
    'negative_word_count',
    'product_average_rating',
    'product_rating_count',
    'log_helpful_votes',
]

# Report how many infinite and missing values each numeric column contains.
for feature in numerical_features:
    column = df_clean[feature]
    num_inf = np.isinf(column).sum()
    num_nan = column.isna().sum()
    print(f"Feature '{feature}': {num_inf} Inf-Werte, {num_nan} NaN-Werte")

# Turn +/-inf into NaN so it is treated like any other missing value.
df_clean = df_clean.replace([np.inf, -np.inf], np.nan)

# Report and remove reviews that contain no words after preprocessing.
has_no_words = df_clean['word_count'] == 0
zero_word_count = has_no_words.sum()
print(f"Sum word_count == 0: {zero_word_count}")
df_clean = df_clean[~has_no_words]

# Fill remaining gaps in the numeric columns with the per-column median.
feature_medians = df_clean[numerical_features].median()
df_clean[numerical_features] = df_clean[numerical_features].fillna(feature_medians)

# Final check: print a line for every column that still has inf or NaN.
for feature in numerical_features:
    column = df_clean[feature]
    if np.isinf(column).any():
        print(f"Feature '{feature}' Inf-Rows.")
    if column.isna().any():
        print(f"Feature '{feature}' NaN-Rows.")

# Encode the verified-purchase flag as 0.0 / 1.0 in float32.
verified_as_int = df_clean['is_verified_purchase'].astype(int)
df_clean['is_verified_purchase'] = verified_as_int.astype(np.float32)

# Standardize the numeric columns and store them as float32.
scaler = StandardScaler()
scaled_values = scaler.fit_transform(df_clean[numerical_features])
df_clean[numerical_features] = scaled_values.astype(np.float32)

# TF-IDF over unigrams and bigrams; the vocabulary is capped at 500 terms
# to keep storage requirements low.
tfidf = TfidfVectorizer(max_features=500, ngram_range=(1, 2))
tfidf_features = tfidf.fit_transform(df_clean['full_review']).astype(np.float32)

# Sentence embeddings of the full review text, returned as a dense
# float32 NumPy array.
model = SentenceTransformer('bert-base-nli-mean-tokens')
embeddings = model.encode(df_clean['full_review'].tolist(), show_progress_bar=True, convert_to_numpy=True).astype(np.float32)

# Dimensionality reduction of the embeddings to 150 components (storage).
# TruncatedSVD accepts dense arrays directly, so the embeddings are NOT
# wrapped in a CSR matrix first: for a fully dense array that would only add
# an index per value and slow down the decomposition.
svd_embeddings = TruncatedSVD(n_components=150, random_state=42)
embeddings_reduced = svd_embeddings.fit_transform(embeddings)
# Stored as CSR only so it can be stacked with the other feature blocks.
embeddings_reduced_sparse = sparse.csr_matrix(embeddings_reduced.astype(np.float32))

# Dimensionality reduction of the TF-IDF features to 150 components (storage).
svd_tfidf = TruncatedSVD(n_components=150, random_state=42)
tfidf_reduced = svd_tfidf.fit_transform(tfidf_features)
tfidf_reduced_sparse = sparse.csr_matrix(tfidf_reduced.astype(np.float32))

# Pull the standardized numeric columns and the verified-purchase flag out
# of the frame as float32 arrays; the flag is shaped as a single column.
numeric_features_values = df_clean[numerical_features].to_numpy(dtype=np.float32, copy=True)
categorical_features = df_clean[['is_verified_purchase']].to_numpy(dtype=np.float32, copy=True)

# Wrap both arrays as CSR matrices so they can be stacked with the text blocks.
numeric_sparse = sparse.csr_matrix(numeric_features_values)
categorical_sparse = sparse.csr_matrix(categorical_features)

# Feature matrix: reduced TF-IDF | reduced embeddings | numeric | categorical.
feature_blocks = [
    tfidf_reduced_sparse,
    embeddings_reduced_sparse,
    numeric_sparse,
    categorical_sparse,
]
X = sparse.hstack(feature_blocks).astype(np.float32)

# Target: the review rating as an int32 array.
y = np.asarray(df_clean['review_rating'].astype(np.int32))

# Save features, target and the fitted preprocessing objects
def save_processed_data(X, y, tfidf_vectorizer, svd_tfidf, svd_embeddings,
                        X_path='X_features_reduced.joblib',
                        y_path='y_target_reduced.joblib',
                        tfidf_path='tfidf_vectorizer_reduced.joblib',
                        svd_tfidf_path='svd_tfidf_reduced.joblib',
                        svd_embeddings_path='svd_embeddings_reduced.joblib',
                        scaler=None,
                        scaler_path='scaler_reduced.joblib'):
    """Persist the feature matrix, target and fitted transformers with joblib.

    Every object is written with ``compress=9`` (maximum compression).

    Args:
        X: Feature matrix to store at ``X_path``.
        y: Target values to store at ``y_path``.
        tfidf_vectorizer: Fitted TF-IDF vectorizer, stored at ``tfidf_path``.
        svd_tfidf: Fitted SVD for the TF-IDF features, stored at
            ``svd_tfidf_path``.
        svd_embeddings: Fitted SVD for the sentence embeddings, stored at
            ``svd_embeddings_path``.
        scaler: Optional fitted scaler for the numeric features. When given,
            it is stored at ``scaler_path`` so the same standardization can
            be applied to new data; when ``None`` nothing extra is written.
        scaler_path: Destination file for ``scaler``.
    """
    artifacts = [
        (X, X_path),
        (y, y_path),
        (tfidf_vectorizer, tfidf_path),
        (svd_tfidf, svd_tfidf_path),
        (svd_embeddings, svd_embeddings_path),
    ]
    # Without the scaler the numeric features cannot be reproduced for
    # unseen data, so it is saved alongside the other fitted objects.
    if scaler is not None:
        artifacts.append((scaler, scaler_path))

    for artifact, path in artifacts:
        joblib.dump(artifact, path, compress=9)

    print("Alle verarbeiteten Daten und Modelle wurden erfolgreich gespeichert.")


save_processed_data(
    X, y, tfidf, svd_tfidf, svd_embeddings,
    X_path='X_features_reduced.joblib',
    y_path='y_target_reduced.joblib',
    tfidf_path='tfidf_vectorizer_reduced.joblib',
    svd_tfidf_path='svd_tfidf_reduced.joblib',
    svd_embeddings_path='svd_embeddings_reduced.joblib',
    scaler=scaler,
    scaler_path='scaler_reduced.joblib',
)


[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>
[nltk_data] Error loading omw-1.4: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>
[nltk_data] Error loading stopwords: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>
[nltk_data] Error loading opinion_lexicon: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:1129)>


Feature 'word_count': 0 Inf-Werte, 0 NaN-Werte
Feature 'char_count': 0 Inf-Werte, 0 NaN-Werte
Feature 'avg_word_length': 27 Inf-Werte, 0 NaN-Werte
Feature 'positive_word_count': 0 Inf-Werte, 0 NaN-Werte
Feature 'negative_word_count': 0 Inf-Werte, 0 NaN-Werte
Feature 'product_average_rating': 0 Inf-Werte, 0 NaN-Werte
Feature 'product_rating_count': 0 Inf-Werte, 0 NaN-Werte
Feature 'log_helpful_votes': 0 Inf-Werte, 0 NaN-Werte
Anzahl der Zeilen mit word_count == 0: 27


Batches: 100%|██████████| 810/810 [03:30<00:00,  3.85it/s]


Alle verarbeiteten Daten und Modelle wurden erfolgreich gespeichert.
