In [1]:
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, accuracy_score
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import MultinomialNB
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from catboost import CatBoostClassifier
from sklearn.model_selection import RandomizedSearchCV
from scipy.stats import uniform
from scipy.stats import randint
import pickle
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import StandardScaler
from scipy.sparse import hstack
from sentence_transformers import SentenceTransformer
from scipy.sparse import csr_matrix
from sklearn.preprocessing import MinMaxScaler

In [11]:
# 1. Temizleme fonksiyonu 
def temizle(text):
    text = str(text).lower()
    text = re.sub(r'\d+\s*(kg|g|gr|ml|l|lt)\b', '', text)
    text = re.sub(r'\d+\s*(li|lı|lu|lü|lik)\b', '', text)
    text = re.sub(r'\d+\s*(adet|paket|cc|cl|pcs)\b', '', text)
    text = re.sub(r'\d+\s*', '', text)
    text = re.sub(r'%\s*\d+', '', text)
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

In [3]:
# 2. Veri oku ve temizle
df = pd.read_csv("./data.csv", encoding="utf-8-sig")
df['urun_adi_temiz'] = df['urun_adi'].apply(temizle)
X = df['urun_adi_temiz']
y = df['alaka']

In [4]:
# 3. Vektörleştirme
#  TF-IDF
vectorizer = TfidfVectorizer(ngram_range=(1,2), max_features=1384)
X_tfidf = vectorizer.fit_transform(df['urun_adi_temiz'])

#  Embedding
model = SentenceTransformer('all-MiniLM-L6-v2')
X_embed = model.encode(df['urun_adi_temiz'].tolist())

#  Normalleştir
scaler = MinMaxScaler()  # MultinomialNB negatif değer kabul etmez
X_embed_scaled = scaler.fit_transform(X_embed)

# 5. hstack (TF-IDF + Embedding birleşimi)
X_combined = hstack([X_tfidf, csr_matrix(X_embed_scaled)])


In [None]:
# 4. Eğitim/test ayrımı 
X_train, X_test, y_train, y_test = train_test_split(X_combined, y, test_size=0.2, random_state=42)

#Naive Bayes için sadece TF-IDF ile ayrı bir set oluştur
X_train_nb, X_test_nb, y_train_nb, y_test_nb = train_test_split(X_tfidf, y, test_size=0.2, random_state=42)

# 5. Modeller
modeller = {
    "Logistic Regression": LogisticRegression(max_iter=1000),
    "Naive Bayes": MultinomialNB(),
    "K-Nearest Neighbors": KNeighborsClassifier(),
    "Support Vector Machine": SVC(),
    "Random Forest": RandomForestClassifier(random_state=42),
    "Gradient Boosting": GradientBoostingClassifier(random_state=42),
    "XGBoost": XGBClassifier(use_label_encoder=False, eval_metric='logloss', random_state=42),
    "LightGBM": LGBMClassifier(random_state=42),
    "CatBoost": CatBoostClassifier(random_state=42, logging_level='Silent', save_snapshot=False, allow_writing_files=False)
}

# 6. Model karşılaştırmaları
for ad, model in modeller.items():
    print(f"--- Model: {ad} ---")

    if ad == "Naive Bayes":
        model.fit(X_train_nb, y_train_nb)
        y_pred = model.predict(X_test_nb)
    else:
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)

    acc = accuracy_score(y_test, y_pred)
    print(f"Accuracy: {acc:.4f}")
    print("Sınıflandırma Raporu:")
    print(classification_report(y_test, y_pred, digits=4, zero_division=0))
    print("-" * 50 + "\n")

--- Model: Logistic Regression ---
Accuracy: 0.8475
Sınıflandırma Raporu:
              precision    recall  f1-score   support

           0     0.8448    0.7313    0.7840        67
           1     0.8487    0.9182    0.8821       110

    accuracy                         0.8475       177
   macro avg     0.8468    0.8248    0.8330       177
weighted avg     0.8473    0.8475    0.8450       177

--------------------------------------------------

--- Model: Naive Bayes ---
Accuracy: 0.8870
Sınıflandırma Raporu:
              precision    recall  f1-score   support

           0     0.9796    0.7164    0.8276        67
           1     0.8516    0.9909    0.9160       110

    accuracy                         0.8870       177
   macro avg     0.9156    0.8537    0.8718       177
weighted avg     0.9000    0.8870    0.8825       177

--------------------------------------------------

--- Model: K-Nearest Neighbors ---
Accuracy: 0.8475
Sınıflandırma Raporu:
              precision    r

Parameters: { "use_label_encoder" } are not used.

  bst.update(dtrain, iteration=i, fobj=obj)


Accuracy: 0.8418
Sınıflandırma Raporu:
              precision    recall  f1-score   support

           0     0.8421    0.7164    0.7742        67
           1     0.8417    0.9182    0.8783       110

    accuracy                         0.8418       177
   macro avg     0.8419    0.8173    0.8262       177
weighted avg     0.8418    0.8418    0.8389       177

--------------------------------------------------

--- Model: LightGBM ---
[LightGBM] [Info] Number of positive: 467, number of negative: 238
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.011755 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 90534
[LightGBM] [Info] Number of data points in the train set: 705, number of used features: 416
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.662411 -> initscore=0.674059
[LightGBM] [Info] Start training from score 0.674059




Accuracy: 0.8305
Sınıflandırma Raporu:
              precision    recall  f1-score   support

           0     0.8491    0.6716    0.7500        67
           1     0.8226    0.9273    0.8718       110

    accuracy                         0.8305       177
   macro avg     0.8358    0.7995    0.8109       177
weighted avg     0.8326    0.8305    0.8257       177

--------------------------------------------------

--- Model: CatBoost ---
Accuracy: 0.8418
Sınıflandırma Raporu:
              precision    recall  f1-score   support

           0     0.8679    0.6866    0.7667        67
           1     0.8306    0.9364    0.8803       110

    accuracy                         0.8418       177
   macro avg     0.8493    0.8115    0.8235       177
weighted avg     0.8448    0.8418    0.8373       177

--------------------------------------------------



In [None]:
# 7. SVM için parametre aralığını belirle
svm_params = {
    'C': uniform(0.1, 100),  
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': ['scale', 'auto'] + list(uniform(0.001, 1).rvs(5)) 
}

svm_random_search = RandomizedSearchCV(
    estimator=SVC(),
    param_distributions=svm_params,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

svm_random_search.fit(X_train, y_train)

print("\nEn iyi SVM parametreleri:")
print(svm_random_search.best_params_)

best_svm = svm_random_search.best_estimator_
y_pred_best_svm = best_svm.predict(X_test)

print("\nTuned Support Vector Machine Performansı:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_svm):.4f}")
print(classification_report(y_test, y_pred_best_svm, digits=4))
print("-" * 50 + "\n")


Fitting 5 folds for each of 50 candidates, totalling 250 fits

En iyi SVM parametreleri:
{'C': np.float64(42.61558744912447), 'gamma': np.float64(0.049237173421451175), 'kernel': 'rbf'}

Tuned Support Vector Machine Performansı:
Accuracy: 0.8870
              precision    recall  f1-score   support

           0     0.8983    0.7910    0.8413        67
           1     0.8814    0.9455    0.9123       110

    accuracy                         0.8870       177
   macro avg     0.8898    0.8682    0.8768       177
weighted avg     0.8878    0.8870    0.8854       177

--------------------------------------------------



In [None]:
# 8. KNN için parametre aralığını belirle
knn_params = {
    'n_neighbors': randint(3, 30),
    'weights': ['uniform', 'distance'],
    'metric': ['euclidean', 'manhattan', 'minkowski']
}


knn_random_search = RandomizedSearchCV(
    estimator=KNeighborsClassifier(),
    param_distributions=knn_params,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

knn_random_search.fit(X_train, y_train)


print("\nEn iyi KNN parametreleri:")
print(knn_random_search.best_params_)

best_knn = knn_random_search.best_estimator_
y_pred_best_knn = best_knn.predict(X_test)

print("\nTuned K-Nearest Neighbors Performansı:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_knn):.4f}")
print(classification_report(y_test, y_pred_best_knn, digits=4))
print("-" * 50 + "\n")

Fitting 5 folds for each of 50 candidates, totalling 250 fits

En iyi KNN parametreleri:
{'metric': 'manhattan', 'n_neighbors': 3, 'weights': 'distance'}

Tuned K-Nearest Neighbors Performansı:
Accuracy: 0.8814
              precision    recall  f1-score   support

           0     0.9423    0.7313    0.8235        67
           1     0.8560    0.9727    0.9106       110

    accuracy                         0.8814       177
   macro avg     0.8992    0.8520    0.8671       177
weighted avg     0.8887    0.8814    0.8777       177

--------------------------------------------------



In [None]:
# 9. Naive Bayes için parametre aralığını belirle
nb_params = {
    'alpha': uniform(0.007, 0.02)
}

nb_random_search = RandomizedSearchCV(
    estimator=MultinomialNB(),
    param_distributions=nb_params,
    n_iter=50,
    cv=5,
    verbose=1,
    n_jobs=-1,
    random_state=42
)

nb_random_search.fit(X_train, y_train)

print("\nEn iyi Naive Bayes parametreleri:")
print(nb_random_search.best_params_)

best_nb = nb_random_search.best_estimator_
y_pred_best_nb = best_nb.predict(X_test)

print("\nTuned Naive Bayes Performansı:")
print(f"Accuracy: {accuracy_score(y_test, y_pred_best_nb):.4f}")
print(classification_report(y_test, y_pred_best_nb, digits=4))
print("-" * 50 + "\n")


Fitting 5 folds for each of 50 candidates, totalling 250 fits

En iyi Naive Bayes parametreleri:
{'alpha': np.float64(0.017495128632644758)}

Tuned Naive Bayes Performansı:
Accuracy: 0.9153
              precision    recall  f1-score   support

           0     0.9062    0.8657    0.8855        67
           1     0.9204    0.9455    0.9327       110

    accuracy                         0.9153       177
   macro avg     0.9133    0.9056    0.9091       177
weighted avg     0.9150    0.9153    0.9149       177

--------------------------------------------------



En iyi Naive Bayes parametreleri:
{'alpha': 0.015808361216819947}

Tuned Naive Bayes Performansı:
Accuracy: 0.9096

In [None]:
best_params = {'alpha': 0.015808361216819947}
best_model = MultinomialNB(**best_params)

best_model.fit(X_train, y_train)

In [10]:

with open('vectorizer.pkl', 'wb') as f:
    pickle.dump(vectorizer, f)

with open('model.pkl', 'wb') as file:
    pickle.dump(best_model, file)

with open("scaler.pkl", "wb") as f:
    pickle.dump(scaler, f)

print(f"Modeller başarıyla models klasörüne kaydedildi.")

Modeller başarıyla models klasörüne kaydedildi.
