In [2]:
import pandas as pd
import json
from tqdm import tqdm 
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize
import numpy as np
import nltk
# bash
# pip install optuna
import optuna
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.metrics import mean_absolute_error, mean_squared_error


  from .autonotebook import tqdm as notebook_tqdm


In [2]:

nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ela\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

part 1

In [1]:
# 1.1
import pandas as pd
import json
from tqdm import tqdm
from collections import defaultdict

# Function to check if a business is a restaurant
def is_restaurant(categories):
    if not isinstance(categories, str):
        return False
    return 'Restaurants' in categories

# Load businesses and filter for restaurants
businesses = pd.read_json('yelp_academic_dataset_business.json', lines=True)
restaurants = businesses[businesses['categories'].apply(is_restaurant)]
restaurant_ids = set(restaurants['business_id'])

# Process reviews
user_restaurants = defaultdict(set)   # user_id -> set of restaurant IDs reviewed
biz_users = defaultdict(set)          # business_id -> set of users who reviewed it
valid_reviews = []

with open('yelp_academic_dataset_review.json', 'r', encoding='utf-8') as f:
    for line in tqdm(f, desc="Processing reviews"):
        review = json.loads(line)
        if review['business_id'] in restaurant_ids:
            valid_reviews.append(review)
            user = review['user_id']
            biz = review['business_id']
            user_restaurants[user].add(biz)
            biz_users[biz].add(user)   # ⬅️ Important fix: collect users for each business

# User filter: must have reviewed at least 100 different restaurants
qualified_users = {u for u, biz_set in user_restaurants.items() if len(biz_set) >= 100}

# Restaurant filter: must be reviewed by at least 1000 unique users
qualified_biz = {b for b, user_set in biz_users.items() if len(user_set) >= 1000}

# Final review filtering
final_reviews = [
    r for r in valid_reviews
    if r['user_id'] in qualified_users and r['business_id'] in qualified_biz
]

# Save filtered reviews
with open('filtered_reviews.json', 'w', encoding='utf-8') as f:
    for r in final_reviews:
        f.write(json.dumps(r) + '\n')

# Save number of unique restaurants reviewed per user
user_stats = pd.DataFrame([
    {'user_id': user, 'unique_restaurant_count': len(biz_ids)}
    for user, biz_ids in user_restaurants.items()
])

# Save to CSV file
user_stats.to_csv('user_restaurant_counts.csv', index=False)

print("📄 File 'user_restaurant_counts.csv' saved. It contains the number of unique restaurants each user has reviewed.")

# Summary report
print("Users:", len(qualified_users))
print("Restaurants:", len(qualified_biz))
print("Reviews:", len(final_reviews))

Processing reviews: 6990280it [05:55, 19665.21it/s]


📄 File 'user_restaurant_counts.csv' saved. It contains the number of unique restaurants each user has reviewed.
Users: 2121
Restaurants: 296
Reviews: 23924


In [None]:
# 1.2
filtered_reviews = pd.read_json('filtered_reviews.json', lines=True)

# حالا می‌تونی ادامه بدی
filtered_reviews['date'] = pd.to_datetime(filtered_reviews['date'])
sorted_reviews = filtered_reviews.sort_values('date')

train = sorted_reviews.iloc[:20000]
rest = sorted_reviews.iloc[20000:]
valid = rest.iloc[:len(rest)//2]
test = rest.iloc[len(rest)//2:]

# ذخیره فایل‌ها
train.to_json('train.json', lines=True, orient='records')
valid.to_json('valid.json', lines=True, orient='records')
test.to_json('test.json', lines=True, orient='records')

print("Train size:", len(train))
print("Validation size:", len(valid))
print("Test size:", len(test))


Train size: 20000
Validation size: 1962
Test size: 1962


part 2

In [3]:

# 2.1
import pandas as pd
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize

def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

# بارگذاری و پیش‌پردازش
train_reviews = pd.read_json('train.json', lines=True)
sentences = train_reviews['text'].apply(preprocess).tolist()

# آموزش مدل‌ها
w2v_model = Word2Vec(sentences, seed=1234)
ft_model = FastText(sentences, seed=1234)

# تابع برای تبدیل لیست مشابه‌ها به DataFrame مرتب
def format_similar_words(model, word, topn=15):
    similar = model.wv.most_similar(word, topn=topn)
    return pd.DataFrame(similar, columns=['Word', 'Similarity'])

# ایجاد جدول نهایی
df_tasty = pd.concat([
    format_similar_words(ft_model, 'tasty').rename(columns={'Word': 'FastText', 'Similarity': 'FT_Sim'}),
    format_similar_words(w2v_model, 'tasty').rename(columns={'Word': 'Word2Vec', 'Similarity': 'W2V_Sim'})
], axis=1)

df_give = pd.concat([
    format_similar_words(ft_model, 'give').rename(columns={'Word': 'FastText', 'Similarity': 'FT_Sim'}),
    format_similar_words(w2v_model, 'give').rename(columns={'Word': 'Word2Vec', 'Similarity': 'W2V_Sim'})
], axis=1)

# نمایش زیباتر
print("\n🔍 Top 15 Similar Words to 'tasty'")
print(df_tasty.to_string(index=False))

print("\n🔍 Top 15 Similar Words to 'give'")
print(df_give.to_string(index=False))



🔍 Top 15 Similar Words to 'tasty'
  FastText   FT_Sim      Word2Vec  W2V_Sim
    tasty- 0.957964         yummy 0.833816
     good- 0.839100     delicious 0.813541
 delicious 0.824517          good 0.807784
 flavorful 0.821973     flavorful 0.733744
      good 0.821693        delish 0.700442
delicious- 0.805538    satisfying 0.698690
   goodbye 0.793263       filling 0.669034
     -good 0.792252 disappointing 0.657894
flavorfull 0.790689         bland 0.622572
  delicacy 0.790058    refreshing 0.619355
     pasty 0.788990        hearty 0.618211
flavourful 0.785256   outstanding 0.614990
suspicious 0.784993        strong 0.610984
  delicous 0.779898      tempting 0.610952
     yummy 0.777956   scrumptious 0.605299

🔍 Top 15 Similar Words to 'give'
FastText   FT_Sim Word2Vec  W2V_Sim
 forgive 0.823551    bring 0.604657
   agave 0.748712 consider 0.583525
   given 0.746449    allow 0.577139
   gives 0.745981      add 0.566688
    gave 0.703354  deserve 0.566082
     ive 0.694540     rate 

In [8]:
# 2.1
import pandas as pd
from gensim.models import Word2Vec, FastText
from nltk.tokenize import word_tokenize

# --- Preprocessing Function ---
def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

# --- Load and Preprocess Training Reviews ---
train_reviews = pd.read_json('train.json', lines=True)
sentences = train_reviews['text'].apply(preprocess).tolist()


# --- Train Embedding Models ---
w2v_model = Word2Vec(sentences, vector_size=100, window=5, min_count=5, sg=1, seed=1234)  # SGNS
ft_model = FastText(sentences, vector_size=100, window=5, min_count=5, sg=1, seed=1234)   # SGNS with subword info

# --- Function to Format Similar Words ---
def format_similar_words(model, word, topn=15):
    similar = model.wv.most_similar(word, topn=topn)
    return pd.DataFrame(similar, columns=['Word', 'Similarity'])

# --- Generate Tables for 'tasty' and 'give' ---
df_tasty = pd.concat([
    format_similar_words(ft_model, 'tasty').rename(columns={'Word': 'FastText', 'Similarity': 'FT_Sim'}),
    format_similar_words(w2v_model, 'tasty').rename(columns={'Word': 'Word2Vec', 'Similarity': 'W2V_Sim'})
], axis=1)

df_give = pd.concat([
    format_similar_words(ft_model, 'give').rename(columns={'Word': 'FastText', 'Similarity': 'FT_Sim'}),
    format_similar_words(w2v_model, 'give').rename(columns={'Word': 'Word2Vec', 'Similarity': 'W2V_Sim'})
], axis=1)

# --- Display Results ---
print("\n🔍 Top 15 Similar Words to 'tasty'")
print(df_tasty.to_string(index=False))

print("\n🔍 Top 15 Similar Words to 'give'")
print(df_give.to_string(index=False))

# --- Optional: Try OOV Word ---
print("\n❓ OOV Example (word: 'givez'):")
try:
    print("FastText:", ft_model.wv['givez'][:5])  # shows vector head
except KeyError:
    print("FastText: OOV not handled.")
try:
    print("Word2Vec:", w2v_model.wv['givez'][:5])
except KeyError:
    print("Word2Vec: ❌ OOV not in vocabulary")



🔍 Top 15 Similar Words to 'tasty'
     FastText   FT_Sim      Word2Vec  W2V_Sim
       tasty- 0.926242     delicious 0.868959
    delicious 0.857806         yummy 0.817968
   delicious- 0.837358          good 0.784924
    flavorful 0.803806     flavorful 0.772891
   flavorfull 0.801029         delic 0.769139
        yummy 0.796259      passable 0.745104
   flavourful 0.792423  unimpressive 0.736847
         good 0.775880  over-dressed 0.724674
    delicioso 0.773783 well-prepared 0.721848
        good- 0.757103     portioned 0.720370
     delicous 0.757069        tastey 0.718796
well-flavored 0.757052         good- 0.716557
        delic 0.753661   scrumptious 0.716499
  deliciously 0.744429        delish 0.714353
        yummo 0.743477      absurdly 0.711759

🔍 Top 15 Similar Words to 'give'
 FastText   FT_Sim  Word2Vec  W2V_Sim
   giving 0.739561    giving 0.705823
  forgive 0.684235      bump 0.703141
  deserve 0.681480   deserve 0.672059
     gave 0.677911      gave 0.663112
forgi

In [4]:
# --- Document Embedding Function ---
def doc_embedding(text, model, aggregation='mean'):
    words = preprocess(text)
    vectors = [model.wv[w] for w in words if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    if aggregation == 'sum':
        return np.sum(vectors, axis=0)
    return np.mean(vectors, axis=0)

# مثال استفاده:
# X_train = np.array([doc_embedding(text, w2v_model, aggregation='mean') for text in train_reviews['text']])


paer 3

In [None]:
# 3.1
from sklearn.linear_model import LinearRegression, Ridge
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import r2_score
import numpy as np

def build_embedding_model(sentences, method='word2vec', **kwargs):
    if method == 'word2vec':
        model = Word2Vec(sentences, **kwargs)
    elif method == 'fasttext':
        model = FastText(sentences, **kwargs)
    else:
        raise ValueError("Unknown embedding method")
    return model

def tfidf_weighted(text, model, vectorizer):
    words = preprocess(text)
    vec = vectorizer.transform([text])
    vec_size = model.vector_size
    result = np.zeros(vec_size)
    total_weight = 0
    for word in words:
        if word in model.wv and word in vectorizer.vocabulary_:
            tfidf = vec[0, vectorizer.vocabulary_[word]]
            result += model.wv[word] * tfidf
            total_weight += tfidf
    return result / total_weight if total_weight else result

def get_doc_embedding_func(model, aggregation='mean', tfidf_vectorizer=None):
    def embed(text):
        words = preprocess(text)
        if aggregation == 'sum':
            vecs = [model.wv[w] for w in words if w in model.wv]
            return np.sum(vecs, axis=0) if vecs else np.zeros(model.vector_size)
        elif aggregation == 'mean':
            vecs = [model.wv[w] for w in words if w in model.wv]
            return np.mean(vecs, axis=0) if vecs else np.zeros(model.vector_size)
        elif aggregation == 'tfidf':
            return tfidf_weighted(text, model, tfidf_vectorizer)
        else:
            raise ValueError("Unknown aggregation")
    return embed

def evaluate_model(model, X_train, y_train, X_valid, y_valid):
    model.fit(X_train, y_train)
    preds = model.predict(X_valid)
    return r2_score(y_valid, preds)

def run_pipeline(embedding_type='word2vec',
                 aggregation='mean',
                 regression_model='ridge',
                 use_scaling=False,
                 embedding_params={},
                 reg_params={}):
    
    # Step 1: prepare data
    texts = train['text'].tolist()
    tokenized = [preprocess(t) for t in texts]
    model = build_embedding_model(tokenized, method=embedding_type, **embedding_params)

    if aggregation == 'tfidf':
        from sklearn.feature_extraction.text import TfidfVectorizer
        vectorizer = TfidfVectorizer(tokenizer=preprocess)
        vectorizer.fit(texts)
    else:
        vectorizer = None

    embed_func = get_doc_embedding_func(model, aggregation, vectorizer)

    # Step 2: embed train and valid
    X_train = np.array([embed_func(t) for t in train['text']])
    X_valid = np.array([embed_func(t) for t in valid['text']])
    y_train = train['stars'].values
    y_valid = valid['stars'].values

    # Step 3: Feature scaling
    if use_scaling:
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_valid = scaler.transform(X_valid)

    # Step 4: Choose regressor
    if regression_model == 'ridge':
        reg = Ridge(**reg_params)
    elif regression_model == 'linear':
        reg = LinearRegression()
    elif regression_model == 'rf':
        reg = RandomForestRegressor(**reg_params)
    else:
        raise ValueError("Unknown regression model")

    r2 = evaluate_model(reg, X_train, y_train, X_valid, y_valid)
    print(f"[{embedding_type.upper()}-{aggregation}] → {regression_model} R² score: {r2:.4f}")
    return r2

# Example usage
run_pipeline(
    embedding_type='word2vec',
    aggregation='mean',
    regression_model='ridge',
    embedding_params={'vector_size': 100, 'window': 5, 'sg': 1, 'min_count': 5, 'epochs': 10, 'seed': 1234},
    reg_params={'alpha': 1.0},
    use_scaling=True
)


[WORD2VEC-mean] → ridge R² score: 0.3586


0.35863232612609863

In [None]:
# 3.1
from gensim.models import Word2Vec, FastText
import numpy as np
from gensim.models.doc2vec import Doc2Vec, TaggedDocument
from sklearn.linear_model import Ridge, LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split


def train_embedding_model(sentences, method='word2vec', vector_size=100, window=5, epochs=10, sg=1, seed=1234):
    if method == 'word2vec':
        model = Word2Vec(sentences, vector_size=vector_size, window=window, epochs=epochs, sg=sg, seed=seed)
    elif method == 'fasttext':
        model = FastText(sentences, vector_size=vector_size, window=window, epochs=epochs, sg=sg, seed=seed)
    else:
        raise ValueError("Invalid embedding type. Choose 'word2vec' or 'fasttext'")
    return model


def get_doc_embedding(tokens, model, aggregation='mean'):
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    if not vectors:
        return np.zeros(model.vector_size)
    if aggregation == 'sum':
        return np.sum(vectors, axis=0)
    elif aggregation == 'mean':
        return np.mean(vectors, axis=0)
    else:
        raise ValueError("Aggregation must be 'mean' or 'sum'")


def train_doc2vec_model(texts, vector_size=100, window=5, dm=1, epochs=20, seed=1234):
    tagged_data = [TaggedDocument(words=preprocess(text), tags=[str(i)]) for i, text in enumerate(texts)]
    model = Doc2Vec(tagged_data, vector_size=vector_size, window=window, dm=dm, epochs=epochs, seed=seed)
    return model

def build_features(data, embedding_model, method='word2vec', aggregation='mean', use_doc2vec=False, doc2vec_model=None):
    features = []
    for i, row in data.iterrows():
        tokens = row['tokens']
        if use_doc2vec and doc2vec_model:
            vec = doc2vec_model.dv[str(i)]
        else:
            vec = get_doc_embedding(tokens, embedding_model, aggregation=aggregation)
        features.append(vec)
    return np.array(features)


# --- انتخاب مدل‌های مختلف برای آزمایش ---
regressors = {
    "LinearRegression": LinearRegression(),
    "Ridge": Ridge(alpha=1.0),
    "SVR": SVR(kernel='rbf'),
    "RandomForest": RandomForestRegressor(n_estimators=100, random_state=1234)
}

# تنظیمات
embedding_type = 'word2vec'  # یا 'fasttext'
aggregation_type = 'mean'    # یا 'sum'
use_doc2vec = False           # True اگر بخوای از doc2vec استفاده کنی

# آموزش مدل embedding
embedding_model = train_embedding_model(train_reviews['tokens'], method=embedding_type)

# ساخت ویژگی‌ها
X = build_features(train_reviews, embedding_model, method=embedding_type, aggregation=aggregation_type)

# هدف
y = train_reviews['stars'].values  # فرض: ستون ستاره‌ها نامش "stars" هست

# تقسیم‌بندی داده‌ها
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state= 1234)

# ارزیابی مدل‌های مختلف
for name, reg in regressors.items():
    reg.fit(X_train, y_train)
    preds = reg.predict(X_val)
    rmse = np.sqrt(mean_squared_error(y_val, preds))
    print(f"🔎 {name}: RMSE = {rmse:.4f}")


KeyError: 'tokens'

In [None]:
# optuna_tuning.py

import pandas as pd
import numpy as np
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from nltk.tokenize import word_tokenize
import optuna
from optuna.samplers import TPESampler
import multiprocessing

# ------------------------
# Preprocessing
# ------------------------
def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

def build_features(df, model, aggregation='mean'):
    features = []
    for text in df['text']:
        words = preprocess(text)
        vectors = [model.wv[w] for w in words if w in model.wv]
        if not vectors:
            vec = np.zeros(model.vector_size)
        elif aggregation == 'sum':
            vec = np.sum(vectors, axis=0)
        else:
            vec = np.mean(vectors, axis=0)
        features.append(vec)
    return np.array(features)

def build_doc2vec_features(df, model):
    return np.array([model.infer_vector(preprocess(text)) for text in df['text']])

# ------------------------
# Objective Template
# ------------------------
def get_objective(model_type, mode):
    def objective(trial):
        vector_size = trial.suggest_int('vector_size', 50, 300)
        window = trial.suggest_int('window', 2, 10)
        epochs = trial.suggest_int('epochs', 5, 30)
        min_count = trial.suggest_int('min_count', 1, 10)
        aggregation = trial.suggest_categorical('aggregation', ['mean', 'sum'])

        if model_type == 'word2vec':
            sg = 1 if mode == 'sgns' else 0
            model = Word2Vec(
                sentences=df['tokens'], vector_size=vector_size,
                window=window, epochs=epochs, min_count=min_count,
                sg=sg, seed=1234
            )
            X = build_features(df, model, aggregation)

        elif model_type == 'fasttext':
            sg = 1 if mode == 'sgns' else 0
            model = FastText(
                sentences=df['tokens'], vector_size=vector_size,
                window=window, epochs=epochs, min_count=min_count,
                sg=sg, seed=1234
            )
            X = build_features(df, model, aggregation)

        elif model_type == 'doc2vec':
            dm = 1 if mode == 'dm' else 0
            tagged = [TaggedDocument(words=toks, tags=[i]) for i, toks in enumerate(df['tokens'])]
            model = Doc2Vec(
                documents=tagged, vector_size=vector_size, window=window,
                epochs=epochs, min_count=min_count, dm=dm, seed=1234
            )
            X = build_doc2vec_features(df, model)

        else:
            raise ValueError("Unknown model type")

        y = df['stars'].values
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=1234)
        reg = Ridge()
        reg.fit(X_train, y_train)
        y_pred = reg.predict(X_val)
        return r2_score(y_val, y_pred)
    return objective

# ------------------------
# Main Execution
# ------------------------
if __name__ == '__main__':
    import nltk
    nltk.download('punkt')

    df = pd.read_json("train.json", lines=True)
    df['tokens'] = df['text'].apply(preprocess)

    configs = [
        ('word2vec', 'sgns'),
        ('word2vec', 'cbow'),
        ('fasttext', 'sgns'),
        ('fasttext', 'cbow'),
        ('doc2vec', 'dm'),
        ('doc2vec', 'dbow')
    ]

    for model_type, mode in configs:
        print(f"\n🔧 Optimizing: {model_type.upper()} - {mode.upper()}")
        study = optuna.create_study(direction='maximize', sampler=TPESampler(seed= 1234 ))
        objective = get_objective(model_type, mode)
        study.optimize(objective, n_trials=50, n_jobs=multiprocessing.cpu_count())

        print("Best R2:", study.best_value)
        print("Best Params:", study.best_params)

        fig = optuna.visualization.matplotlib.plot_param_importances(study)
        fig.set_size_inches(8, 5)
        fig.savefig(f"importance_{model_type}_{mode}.png")


part 4

3

In [1]:
import pandas as pd
import numpy as np
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.svm import SVR
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler
from sklearn.feature_extraction.text import TfidfVectorizer
import nltk
# nltk.download('punkt')
from nltk.tokenize import word_tokenize
import random

# ---------------------------------
# تنظیمات اولیه و فایل‌ها
# ---------------------------------
SEED = 1234
random.seed(SEED)
np.random.seed(SEED)

embedding_type = 'word2vec'     # 'fasttext' یا 'doc2vec'
aggregation_type = 'mean'       # 'mean', 'tfidf', 'doc2vec'
regressor_type = 'RandomForest' # 'LinearRegression', 'SVR', 'RandomForest'

vector_size = 100
window = 5
epochs = 20
sg = 1  # 1=SGNS, 0=CBOW

# ---------------------------------
# 1. Load Train Data and Preprocess
# ---------------------------------
train_df = pd.read_json('train.json', lines=True)

def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

train_df['tokens'] = train_df['text'].apply(preprocess)

# ---------------------------------
# 2. Train Word Embedding
# ---------------------------------
sentences = train_df['tokens'].tolist()

def train_embedding(sentences, method='word2vec'):
    if method == 'word2vec':
        model = Word2Vec(vector_size=vector_size, window=window, sg=sg, seed=SEED, min_count=2)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=epochs)
    elif method == 'fasttext':
        model = FastText(vector_size=vector_size, window=window, sg=sg, seed=SEED, min_count=2)
        model.build_vocab(sentences)
        model.train(sentences, total_examples=model.corpus_count, epochs=epochs)
    else:
        model = None
    return model

if embedding_type in ['word2vec', 'fasttext']:
    embedding_model = train_embedding(sentences, method=embedding_type)

# ---------------------------------
# 3. Document Embedding Aggregation
# ---------------------------------
def mean_embedding(tokens, model):
    vectors = [model.wv[w] for w in tokens if w in model.wv]
    return np.mean(vectors, axis=0) if vectors else np.zeros(vector_size)

def tfidf_weighted_embedding(df, model):
    tfidf = TfidfVectorizer()
    tfidf.fit(df['text'])
    idf_dict = dict(zip(tfidf.get_feature_names_out(), tfidf.idf_))
    embeddings = []
    for tokens in df['tokens']:
        weighted = [model.wv[w] * idf_dict.get(w, 0.0) for w in tokens if w in model.wv]
        weights = [idf_dict.get(w, 0.0) for w in tokens if w in model.wv]
        if weighted:
            doc_vec = np.sum(weighted, axis=0) / np.sum(weights)
        else:
            doc_vec = np.zeros(vector_size)
        embeddings.append(doc_vec)
    return np.array(embeddings)

def doc2vec_embedding(df):
    tagged = [TaggedDocument(words=t, tags=[str(i)]) for i, t in enumerate(df['tokens'])]
    model = Doc2Vec(tagged, vector_size=vector_size, window=window, epochs=epochs, seed=SEED)
    return np.array([model.dv[str(i)] for i in range(len(df))])

if aggregation_type == 'doc2vec' or embedding_type == 'doc2vec':
    X = doc2vec_embedding(train_df)
else:
    if aggregation_type == 'mean':
        X = np.vstack(train_df['tokens'].apply(lambda x: mean_embedding(x, embedding_model)))
    elif aggregation_type == 'tfidf':
        X = tfidf_weighted_embedding(train_df, embedding_model)

y = train_df['stars'].values  # هدف: امتیاز 1 تا 5

# ---------------------------------
# 4. Feature Manipulation (Optional)
# ---------------------------------
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# ---------------------------------
# 5. Regression Model Training
# ---------------------------------
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.2, random_state=SEED)

regressors = {
    'LinearRegression': LinearRegression(),
    'SVR': SVR(kernel='rbf', C=1.0),
    'RandomForest': RandomForestRegressor(n_estimators=100, max_depth=5, random_state=SEED)
}

regressor = regressors[regressor_type]
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)

mse = mean_squared_error(y_test, y_pred)
rmse = mse ** 0.5

print(f" Model: {regressor_type} | Embedding: {embedding_type} | Aggregation: {aggregation_type} | RMSE: {rmse:.4f}")


 Model: RandomForest | Embedding: word2vec | Aggregation: mean | RMSE: 0.8326


In [None]:
# optuna_hyperparameter_tuning.py
import optuna
from optuna.samplers import TPESampler
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

# Preprocess
train_reviews = pd.read_json('train.json', lines=True)
valid_reviews = pd.read_json('valid.json', lines=True)

def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

sentences = train_reviews['text'].apply(preprocess).tolist()
val_texts = valid_reviews['text'].apply(preprocess)
y_val = valid_reviews['stars'].values

# Word2Vec & FastText Embedding Function
def embed_docs(model, docs, vector_size):
    result = []
    for tokens in docs:
        vectors = [model.wv[w] for w in tokens if w in model.wv]
        result.append(np.mean(vectors, axis=0) if vectors else np.zeros(vector_size))
    return np.array(result)

# Doc2Vec Embedding Function
def embed_docs_d2v(model, docs, vector_size):
    return np.array([model.infer_vector(toks) for toks in docs])

# Template for study

def run_study(model_type):
    def objective(trial):
        vector_size = trial.suggest_categorical("vector_size", [100, 150, 200, 250, 300])
        epochs = trial.suggest_int("epochs", 10, 30)
        min_count = trial.suggest_int("min_count", 1, 5)

        if model_type.startswith("word2vec") or model_type.startswith("fastText"):
            window = trial.suggest_int("window", 3, 10)
            sg = 1 if "SGNS" in model_type else 0

            if model_type.startswith("word2vec"):
                model = Word2Vec(sentences, vector_size=vector_size, window=window, 
                                 epochs=epochs, min_count=min_count, sg=sg, seed=SEED)
            else:
                min_n = trial.suggest_int("min_n", 2, 4)
                max_n = trial.suggest_int("max_n", 5, 6)
                model = FastText(sentences, vector_size=vector_size, window=window, 
                                 epochs=epochs, min_count=min_count, sg=sg, 
                                 min_n=min_n, max_n=max_n, seed=SEED)

            X = embed_docs(model, val_texts, vector_size)

        elif model_type.startswith("doc2vec"):
            tagged_docs = [TaggedDocument(words=t, tags=[i]) for i, t in enumerate(sentences)]
            dm = 1 if "DM" in model_type else 0
            model = Doc2Vec(tagged_docs, vector_size=vector_size, epochs=epochs, 
                            min_count=min_count, dm=dm, seed=SEED)
            X = embed_docs_d2v(model, val_texts, vector_size)

        reg = Ridge()
        reg.fit(X, y_val)
        preds = reg.predict(X)
        return r2_score(y_val, preds)

    sampler = TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=50)
    print(f"\n===== {model_type} =====")
    print("Best trial:", study.best_trial)
    try:
        import optuna.visualization as vis
        fig = vis.plot_param_importances(study)
        fig.show()
    except:
        pass
    return study

if __name__ == '__main__':
    settings = [
        'word2vec-SGNS', 'word2vec-CBOW',
        'fastText-SGNS', 'fastText-CBOW',
        'doc2vec-DM', 'doc2vec-DBOW'
    ]

    results = {}
    for setting in settings:
        results[setting] = run_study(setting)

    # Optional: Save studies
    for name, study in results.items():
        study.trials_dataframe().to_csv(f'{name}_optuna_trials.csv', index=False)


[I 2025-06-28 00:36:23,175] A new study created in memory with name: no-name-72f2a10b-78f4-40d5-bfb7-e7ef1fe509e9
[I 2025-06-28 00:45:39,841] Trial 0 finished with value: 0.4077486991882324 and parameters: {'vector_size': 150, 'epochs': 13, 'min_count': 1, 'window': 9}. Best is trial 0 with value: 0.4077486991882324.
[I 2025-06-28 00:52:01,057] Trial 1 finished with value: 0.43179863691329956 and parameters: {'vector_size': 250, 'epochs': 14, 'min_count': 1, 'window': 4}. Best is trial 1 with value: 0.43179863691329956.
[I 2025-06-28 00:57:51,547] Trial 2 finished with value: 0.43681877851486206 and parameters: {'vector_size': 300, 'epochs': 12, 'min_count': 2, 'window': 5}. Best is trial 2 with value: 0.43681877851486206.
[I 2025-06-28 01:01:08,969] Trial 3 finished with value: 0.391482949256897 and parameters: {'vector_size': 150, 'epochs': 10, 'min_count': 4, 'window': 4}. Best is trial 2 with value: 0.43681877851486206.
[I 2025-06-28 01:05:37,580] Trial 4 finished with value: 0.421


===== word2vec-SGNS =====
Best trial: FrozenTrial(number=44, state=1, values=[0.46813344955444336], datetime_start=datetime.datetime(2025, 6, 28, 9, 30, 39, 891179), datetime_complete=datetime.datetime(2025, 6, 28, 9, 51, 21, 857919), params={'vector_size': 300, 'epochs': 30, 'min_count': 1, 'window': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1), 'window': IntDistribution(high=10, log=False, low=3, step=1)}, trial_id=44, value=None)


[I 2025-06-28 11:16:29,960] A new study created in memory with name: no-name-a6c22af1-1122-401a-911e-8fec781bdc39
[I 2025-06-28 11:18:05,664] Trial 0 finished with value: 0.41857314109802246 and parameters: {'vector_size': 150, 'epochs': 13, 'min_count': 1, 'window': 9}. Best is trial 0 with value: 0.41857314109802246.
[I 2025-06-28 11:19:49,306] Trial 1 finished with value: 0.47096413373947144 and parameters: {'vector_size': 250, 'epochs': 14, 'min_count': 1, 'window': 4}. Best is trial 1 with value: 0.47096413373947144.
[I 2025-06-28 11:21:16,704] Trial 2 finished with value: 0.47289347648620605 and parameters: {'vector_size': 300, 'epochs': 12, 'min_count': 2, 'window': 5}. Best is trial 2 with value: 0.47289347648620605.
[I 2025-06-28 11:22:13,914] Trial 3 finished with value: 0.4083157181739807 and parameters: {'vector_size': 150, 'epochs': 10, 'min_count': 4, 'window': 4}. Best is trial 2 with value: 0.47289347648620605.
[I 2025-06-28 11:23:24,665] Trial 4 finished with value: 0.

In [2]:
# test 1 trial
# optuna_hyperparameter_tuning.py
import optuna
from optuna.samplers import TPESampler
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

SEED = 1234

# Preprocess
train_reviews = pd.read_json('train.json', lines=True)
valid_reviews = pd.read_json('valid.json', lines=True)

def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

sentences = train_reviews['text'].apply(preprocess).tolist()
val_texts = valid_reviews['text'].apply(preprocess)
y_val = valid_reviews['stars'].values

# Word2Vec & FastText Embedding Function
def embed_docs(model, docs, vector_size):
    result = []
    for tokens in docs:
        vectors = [model.wv[w] for w in tokens if w in model.wv]
        result.append(np.mean(vectors, axis=0) if vectors else np.zeros(vector_size))
    return np.array(result)

# Doc2Vec Embedding Function
def embed_docs_d2v(model, docs, vector_size):
    return np.array([model.infer_vector(toks) for toks in docs])

# Template for study

def run_study(model_type):
    def objective(trial):
        vector_size = trial.suggest_categorical("vector_size", [100, 150, 200, 250, 300])
        epochs = trial.suggest_int("epochs", 10, 30)
        min_count = trial.suggest_int("min_count", 1, 5)

        if model_type.startswith("word2vec") or model_type.startswith("fastText"):
            window = trial.suggest_int("window", 3, 10)
            sg = 1 if "SGNS" in model_type else 0

            if model_type.startswith("word2vec"):
                model = Word2Vec(sentences, vector_size=vector_size, window=window, 
                                 epochs=epochs, min_count=min_count, sg=sg, seed=SEED)
            else:
                min_n = trial.suggest_int("min_n", 2, 4)
                max_n = trial.suggest_int("max_n", 5, 6)
                model = FastText(sentences, vector_size=vector_size, window=window, 
                                 epochs=epochs, min_count=min_count, sg=sg, 
                                 min_n=min_n, max_n=max_n, seed=SEED)

            X = embed_docs(model, val_texts, vector_size)

        elif model_type.startswith("doc2vec"):
            tagged_docs = [TaggedDocument(words=t, tags=[i]) for i, t in enumerate(sentences)]
            dm = 1 if "DM" in model_type else 0
            model = Doc2Vec(tagged_docs, vector_size=vector_size, epochs=epochs, 
                            min_count=min_count, dm=dm, seed=SEED)
            X = embed_docs_d2v(model, val_texts, vector_size)

        reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=SEED)
        reg.fit(X, y_val)
        preds = reg.predict(X)
        return r2_score(y_val, preds)

    sampler = TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)
    study.optimize(objective, n_trials=1)
    print(f"\n===== {model_type} =====")
    print("Best trial:", study.best_trial)
    try:
        import optuna.visualization as vis
        fig = vis.plot_param_importances(study)
        fig.show()
    except:
        pass
    return study

if __name__ == '__main__':
    settings = [
        'word2vec-SGNS', 'word2vec-CBOW',
        'fastText-SGNS', 'fastText-CBOW',
        'doc2vec-DM', 'doc2vec-DBOW'
    ]

    results = {}
    for setting in settings:
        results[setting] = run_study(setting)

    # Optional: Save studies
    for name, study in results.items():
        study.trials_dataframe().to_csv(f'{name}_optuna_trials.csv', index=False)


  from .autonotebook import tqdm as notebook_tqdm
[I 2025-06-28 16:32:49,077] A new study created in memory with name: no-name-33f02c8b-de1a-4e34-acd6-f5e75d7dff2d
[I 2025-06-28 16:42:45,544] Trial 0 finished with value: 0.5433725575800599 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9}. Best is trial 0 with value: 0.5433725575800599.



===== word2vec-SGNS =====
Best trial: FrozenTrial(number=0, state=1, values=[0.5433725575800599], datetime_start=datetime.datetime(2025, 6, 28, 16, 32, 49, 94888), datetime_complete=datetime.datetime(2025, 6, 28, 16, 42, 45, 544643), params={'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1), 'window': IntDistribution(high=10, log=False, low=3, step=1)}, trial_id=0, value=None)


[I 2025-06-28 16:42:48,027] A new study created in memory with name: no-name-fa2002f1-fc10-448f-b5b0-ef54c0e59140
[I 2025-06-28 16:45:18,698] Trial 0 finished with value: 0.539526243525664 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9}. Best is trial 0 with value: 0.539526243525664.
[I 2025-06-28 16:45:18,698] A new study created in memory with name: no-name-516fed96-7bd4-4490-a109-37763308064a



===== word2vec-CBOW =====
Best trial: FrozenTrial(number=0, state=1, values=[0.539526243525664], datetime_start=datetime.datetime(2025, 6, 28, 16, 42, 48, 43753), datetime_complete=datetime.datetime(2025, 6, 28, 16, 45, 18, 698860), params={'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1), 'window': IntDistribution(high=10, log=False, low=3, step=1)}, trial_id=0, value=None)


[I 2025-06-28 17:04:21,046] Trial 0 finished with value: 0.5462658916667551 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9, 'min_n': 4, 'max_n': 6}. Best is trial 0 with value: 0.5462658916667551.
[I 2025-06-28 17:04:21,052] A new study created in memory with name: no-name-c7f71af0-bf68-4691-9636-878d3b12c609



===== fastText-SGNS =====
Best trial: FrozenTrial(number=0, state=1, values=[0.5462658916667551], datetime_start=datetime.datetime(2025, 6, 28, 16, 45, 18, 698860), datetime_complete=datetime.datetime(2025, 6, 28, 17, 4, 21, 46447), params={'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9, 'min_n': 4, 'max_n': 6}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1), 'window': IntDistribution(high=10, log=False, low=3, step=1), 'min_n': IntDistribution(high=4, log=False, low=2, step=1), 'max_n': IntDistribution(high=6, log=False, low=5, step=1)}, trial_id=0, value=None)


[I 2025-06-28 17:15:58,268] Trial 0 finished with value: 0.4692528567840909 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9, 'min_n': 4, 'max_n': 6}. Best is trial 0 with value: 0.4692528567840909.
[I 2025-06-28 17:15:58,271] A new study created in memory with name: no-name-d313bdba-3365-4b54-800e-8acfb09995ff



===== fastText-CBOW =====
Best trial: FrozenTrial(number=0, state=1, values=[0.4692528567840909], datetime_start=datetime.datetime(2025, 6, 28, 17, 4, 21, 54450), datetime_complete=datetime.datetime(2025, 6, 28, 17, 15, 58, 267711), params={'vector_size': 250, 'epochs': 15, 'min_count': 2, 'window': 9, 'min_n': 4, 'max_n': 6}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1), 'window': IntDistribution(high=10, log=False, low=3, step=1), 'min_n': IntDistribution(high=4, log=False, low=2, step=1), 'max_n': IntDistribution(high=6, log=False, low=5, step=1)}, trial_id=0, value=None)


[I 2025-06-28 17:19:58,380] Trial 0 finished with value: 0.39839579446547113 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 2}. Best is trial 0 with value: 0.39839579446547113.
[I 2025-06-28 17:19:58,384] A new study created in memory with name: no-name-37697dca-40a6-40aa-84cf-4e8612e3a832



===== doc2vec-DM =====
Best trial: FrozenTrial(number=0, state=1, values=[0.39839579446547113], datetime_start=datetime.datetime(2025, 6, 28, 17, 15, 58, 272703), datetime_complete=datetime.datetime(2025, 6, 28, 17, 19, 58, 380108), params={'vector_size': 250, 'epochs': 15, 'min_count': 2}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1)}, trial_id=0, value=None)


[I 2025-06-28 17:22:38,938] Trial 0 finished with value: 0.45027486687144924 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 2}. Best is trial 0 with value: 0.45027486687144924.



===== doc2vec-DBOW =====
Best trial: FrozenTrial(number=0, state=1, values=[0.45027486687144924], datetime_start=datetime.datetime(2025, 6, 28, 17, 19, 58, 386114), datetime_complete=datetime.datetime(2025, 6, 28, 17, 22, 38, 938821), params={'vector_size': 250, 'epochs': 15, 'min_count': 2}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1)}, trial_id=0, value=None)


In [None]:
# test 1 trial
# optuna_hyperparameter_tuning.py
import optuna
from optuna.samplers import TPESampler
from gensim.models import Word2Vec, FastText, Doc2Vec
from gensim.models.doc2vec import TaggedDocument
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize

SEED = 1234

# Preprocess
train_reviews = pd.read_json('train.json', lines=True)
valid_reviews = pd.read_json('valid.json', lines=True)

def preprocess(text):
    return [w.lower() for w in word_tokenize(text)]

sentences = train_reviews['text'].apply(preprocess).tolist()
val_texts = valid_reviews['text'].apply(preprocess)
y_val = valid_reviews['stars'].values

# Word2Vec & FastText Embedding Function
def embed_docs(model, docs, vector_size):
    result = []
    for tokens in docs:
        vectors = [model.wv[w] for w in tokens if w in model.wv]
        result.append(np.mean(vectors, axis=0) if vectors else np.zeros(vector_size))
    return np.array(result)

# Doc2Vec Embedding Function
def embed_docs_d2v(model, docs, vector_size):
    return np.array([model.infer_vector(toks) for toks in docs])

# Template for study

def run_study(model_type):
    def objective(trial):
        vector_size = trial.suggest_categorical("vector_size", [100, 150, 200, 250, 300])
        epochs = trial.suggest_int("epochs", 10, 30)
        min_count = trial.suggest_int("min_count", 1, 5)

        if model_type.startswith("word2vec") or model_type.startswith("fastText"):
            window = trial.suggest_int("window", 3, 10)
            sg = 1 if "SGNS" in model_type else 0

            if model_type.startswith("word2vec"):
                model = Word2Vec(sentences, vector_size=vector_size, window=window, 
                                 epochs=epochs, min_count=min_count, sg=sg, seed=SEED)
            else:
                min_n = trial.suggest_int("min_n", 2, 4)
                max_n = trial.suggest_int("max_n", 5, 6)
                model = FastText(sentences, vector_size=vector_size, window=window, 
                                 epochs=epochs, min_count=min_count, sg=sg, 
                                 min_n=min_n, max_n=max_n, seed=SEED)

            X = embed_docs(model, val_texts, vector_size)

        elif model_type.startswith("doc2vec"):
            tagged_docs = [TaggedDocument(words=t, tags=[i]) for i, t in enumerate(sentences)]
            dm = 1 if "DM" in model_type else 0
            model = Doc2Vec(tagged_docs, vector_size=vector_size, epochs=epochs, 
                            min_count=min_count, dm=dm, seed=SEED)
            X = embed_docs_d2v(model, val_texts, vector_size)

        reg = RandomForestRegressor(n_estimators=100, max_depth=5, random_state=SEED)
        reg.fit(X, y_val)
        preds = reg.predict(X)
        return r2_score(y_val, preds)

    sampler = TPESampler(seed=SEED)
    study = optuna.create_study(direction="maximize", sampler=sampler)

    # n_trials رو بیشتر کن که چند trial داشته باشی
    # n_jobs رو بزار بیشتر از 1 تا از multiprocessing داخلی Optuna استفاده کنه
    study.optimize(objective, n_trials=10, n_jobs=4)  # مثلا 4 تا پردازش همزمان

    print(f"\n===== {model_type} =====")
    print("Best trial:", study.best_trial)
    try:
        import optuna.visualization as vis
        fig = vis.plot_param_importances(study)
        fig.show()
    except:
        pass
    return study

if __name__ == '__main__':
    settings = [
        'word2vec-SGNS', 'word2vec-CBOW',
        'fastText-SGNS', 'fastText-CBOW',
        'doc2vec-DM', 'doc2vec-DBOW'
    ]

    results = {}
    for setting in settings:
        results[setting] = run_study(setting)

    # Optional: Save studies
    for name, study in results.items():
        study.trials_dataframe().to_csv(f'{name}_optuna_trials.csv', index=False)


[I 2025-06-28 20:27:04,660] A new study created in memory with name: no-name-b32acb13-c13d-4f85-b370-652144b27ffe
[I 2025-06-28 20:50:14,719] Trial 0 finished with value: 0.5220784989700811 and parameters: {'vector_size': 150, 'epochs': 17, 'min_count': 2, 'window': 4}. Best is trial 0 with value: 0.5220784989700811.
[I 2025-06-28 21:02:21,995] Trial 2 finished with value: 0.5517016801320288 and parameters: {'vector_size': 300, 'epochs': 12, 'min_count': 1, 'window': 9}. Best is trial 2 with value: 0.5517016801320288.
[I 2025-06-28 21:03:53,277] Trial 3 finished with value: 0.5265013118366382 and parameters: {'vector_size': 300, 'epochs': 14, 'min_count': 4, 'window': 9}. Best is trial 2 with value: 0.5517016801320288.
[I 2025-06-28 21:08:19,134] Trial 4 finished with value: 0.5153044969624514 and parameters: {'vector_size': 250, 'epochs': 15, 'min_count': 3, 'window': 3}. Best is trial 2 with value: 0.5517016801320288.
[I 2025-06-28 21:10:12,623] Trial 1 finished with value: 0.4939772


===== word2vec-SGNS =====
Best trial: FrozenTrial(number=2, state=1, values=[0.5517016801320288], datetime_start=datetime.datetime(2025, 6, 28, 20, 27, 4, 737889), datetime_complete=datetime.datetime(2025, 6, 28, 21, 2, 21, 995943), params={'vector_size': 300, 'epochs': 12, 'min_count': 1, 'window': 9}, user_attrs={}, system_attrs={}, intermediate_values={}, distributions={'vector_size': CategoricalDistribution(choices=(100, 150, 200, 250, 300)), 'epochs': IntDistribution(high=30, log=False, low=10, step=1), 'min_count': IntDistribution(high=5, log=False, low=1, step=1), 'window': IntDistribution(high=10, log=False, low=3, step=1)}, trial_id=2, value=None)


[I 2025-06-28 21:35:11,173] A new study created in memory with name: no-name-ea92a809-ba95-421e-9eb4-cf702846514d
[I 2025-06-28 21:41:33,300] Trial 0 finished with value: 0.5332306365266628 and parameters: {'vector_size': 200, 'epochs': 11, 'min_count': 3, 'window': 7}. Best is trial 0 with value: 0.5332306365266628.
[I 2025-06-28 21:41:43,535] Trial 1 finished with value: 0.5135399307075488 and parameters: {'vector_size': 200, 'epochs': 11, 'min_count': 1, 'window': 9}. Best is trial 0 with value: 0.5332306365266628.
[I 2025-06-28 21:44:27,644] Trial 3 finished with value: 0.5232170879261189 and parameters: {'vector_size': 200, 'epochs': 20, 'min_count': 2, 'window': 8}. Best is trial 0 with value: 0.5332306365266628.
[I 2025-06-28 21:44:42,638] Trial 2 finished with value: 0.5187079655193989 and parameters: {'vector_size': 250, 'epochs': 17, 'min_count': 4, 'window': 7}. Best is trial 0 with value: 0.5332306365266628.
[I 2025-06-28 21:50:21,462] Trial 7 finished with value: 0.4484876