In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [2]:
data = pd.read_csv('/Users/voronik/DeepLearning/Anti_duplicate/cleaned.csv')
data = data.dropna()

In [3]:
y = data['is_duplicate'].values
X = data.drop('is_duplicate', axis=1)

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=10, stratify=y)

In [5]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity 

In [6]:
train_q1 = X_train['name_1'].apply(lambda x: x).tolist()
train_q2 = X_train['name_2'].apply(lambda x: x).tolist()
train_list = train_q1 + train_q2

In [7]:
tfidf_veczr = TfidfVectorizer(ngram_range=(1,3))    
tf_corpus=tfidf_veczr.fit_transform(train_list)

In [8]:
train_term_doc1 = tfidf_veczr.transform(X_train['name_1'].apply(lambda x: x).tolist())
train_term_doc2 = tfidf_veczr.transform(X_train['name_2'].apply(lambda x: x).tolist())
X_train = train_term_doc1 + train_term_doc2

In [9]:
X_train

<397768x74314 sparse matrix of type '<class 'numpy.float64'>'
	with 5768586 stored elements in Compressed Sparse Row format>

In [10]:
from sklearn.linear_model import LogisticRegression

log_clf = LogisticRegression(C=0.5, max_iter=1000, class_weight='balanced')
log_clf.fit(X_train, y_train)

In [11]:
test_term_doc1 = tfidf_veczr.transform(X_test['name_1'].apply(lambda x: x).tolist())
test_term_doc2 = tfidf_veczr.transform(X_test['name_2'].apply(lambda x: x).tolist())
X_test = test_term_doc1 + test_term_doc2

In [12]:
y_test_pred = log_clf.predict(X_test)

In [13]:
from sklearn.metrics import f1_score
print('f1 macro -',f1_score(y_test, y_test_pred, average='macro'))
print('f1 micro -',f1_score(y_test, y_test_pred, average='micro'))
print('f1 weighted -',f1_score(y_test, y_test_pred, average='weighted'))

f1 macro - 0.6170072508719464
f1 micro - 0.9619275557611472
f1 weighted - 0.9751147175714706


In [14]:
from sklearn.metrics import precision_score
print('precision macro -',precision_score(y_test, y_test_pred, average='macro'))
print('precision micro -',precision_score(y_test, y_test_pred, average='micro'))
print('precision weighted -',precision_score(y_test, y_test_pred, average='weighted'))

precision macro - 0.5736104223580658
precision micro - 0.9619275557611472
precision weighted - 0.9928005690776964


In [15]:
from sklearn.metrics import recall_score
print('recall macro -',recall_score(y_test, y_test_pred, average='macro'))
print('recall micro -',recall_score(y_test, y_test_pred, average='micro'))
print('recall weighted -',recall_score(y_test, y_test_pred, average='weighted'))

recall macro - 0.9204810774842462
recall micro - 0.9619275557611472
recall weighted - 0.9619275557611472


In [16]:
from sklearn.metrics import roc_auc_score
print('roc_auc_score -',recall_score(y_test, y_test_pred, average='macro'))

roc_auc_score - 0.9204810774842462


In [17]:
from sklearn.model_selection import GridSearchCV

In [20]:
!pip install optuna

Collecting optuna
  Downloading optuna-3.0.3-py3-none-any.whl (348 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m348.5/348.5 kB[0m [31m528.9 kB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
Collecting PyYAML
  Using cached PyYAML-6.0-cp38-cp38-macosx_10_9_x86_64.whl (192 kB)
Collecting sqlalchemy>=1.3.0
  Downloading SQLAlchemy-1.4.42-cp38-cp38-macosx_10_15_x86_64.whl (1.6 MB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.6/1.6 MB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0mm eta [36m0:00:01[0m[36m0:00:01[0m0m
[?25hCollecting colorlog
  Downloading colorlog-6.7.0-py2.py3-none-any.whl (11 kB)
Collecting alembic>=1.5.0
  Downloading alembic-1.8.1-py3-none-any.whl (209 kB)
[2K     [38;2;114;156;31m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m209.8/209.8 kB[0m [31m1.1 MB/s[0m eta [36m0:00:00[0m kB/s[0m eta [36m0:00:01[0m:01[0m
[?25hCollecting cmaes>=0.8.2
  Downloading cmaes-0.8.2-py3-non

In [30]:
import sklearn
from sklearn import model_selection
import optuna
from sklearn.metrics import f1_score

def objective(trial):    
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    classifier_obj = LogisticRegression(C=logreg_c)
    score = model_selection.cross_val_score(classifier_obj, X_train, y_train, n_jobs=-1, cv=3)
    accuracy = score.mean()
    return accuracy

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)

[32m[I 2022-10-19 17:28:55,620][0m A new study created in memory with name: no-name-37022585-8569-40d2-9d39-a057832a70fc[0m
[32m[I 2022-10-19 17:29:10,319][0m Trial 0 finished with value: 0.9977901692765213 and parameters: {'logreg_c': 18.656961834528058}. Best is trial 0 with value: 0.9977901692765213.[0m
[32m[I 2022-10-19 17:29:19,505][0m Trial 1 finished with value: 0.9977901692385994 and parameters: {'logreg_c': 48812713.84259153}. Best is trial 0 with value: 0.9977901692765213.[0m
[32m[I 2022-10-19 17:29:29,430][0m Trial 2 finished with value: 0.9977951973457176 and parameters: {'logreg_c': 4206112869.379557}. Best is trial 2 with value: 0.9977951973457176.[0m
[32m[I 2022-10-19 17:29:40,995][0m Trial 3 finished with value: 0.9978203375968944 and parameters: {'logreg_c': 10819.399670557741}. Best is trial 3 with value: 0.9978203375968944.[0m
[32m[I 2022-10-19 17:29:52,025][0m Trial 4 finished with value: 0.9977977113423938 and parameters: {'logreg_c': 5855.60471957

In [None]:
import sklearn
from sklearn import model_selection
import optuna
from sklearn.metrics import f1_score

def objective(trial):    
    logreg_c = trial.suggest_float("logreg_c", 1e-10, 1e10, log=True)
    classifier_obj = LogisticRegression(C=logreg_c)
    classifier_obj.fit(X_train, y_train)
    y_pred = classifier_obj.predict(X_test)
    accuracy = 
    return accuracy

study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=10)