In [3]:
import pandas as pd
import numpy as np
import Levenshtein as lev
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

In [4]:
matched_train_data = pd.read_csv("../data/preprocessed/matched_train_data.csv", low_memory=False, lineterminator='\n')
matched_train_data = matched_train_data.dropna(subset=["id_y"]).reset_index()

false_train_data = pd.read_csv("../data/preprocessed/false_train_data.csv", low_memory=False, lineterminator='\n')
false_train_data = false_train_data.dropna(subset=["id_y"]).reset_index()

matched_test_data = pd.read_csv("../data/preprocessed/matched_test_data.csv", low_memory=False, lineterminator='\n')
matched_test_data = matched_test_data.dropna(subset=["id_y"]).reset_index()

false_test_data = pd.read_csv("../data/preprocessed/false_test_data.csv", low_memory=False, lineterminator='\n')
false_test_data = false_test_data.dropna(subset=["id_y"]).reset_index()

In [5]:
def get_strings_similarity(google_strings, apple_strings):
    similarities = []

    for string_index in range(len(google_strings)):
        google_string, apple_string = google_strings[string_index], apple_strings[string_index]
        
        if not isinstance(google_string, str) or not isinstance(apple_string, str):
            similarities.append(0)
            continue
        
        similarities.append(lev.ratio(google_string, apple_string))
    
    return similarities

In [6]:
def get_tfidf_embeddings(documents):
    vectorizer = TfidfVectorizer(lowercase=False)

    return vectorizer.fit_transform(documents)

In [7]:
from scipy import spatial

def get_description_similarity(embeddings):
    similarities = []

    for embeddings_index in range(0, embeddings.shape[0], 2):
        similarities.append(1 - spatial.distance.cosine(embeddings[embeddings_index].toarray().flatten(), embeddings[embeddings_index + 1].toarray().flatten()))
    
    return similarities

In [8]:
def get_matched_dataframe(data, label):
    title_similarities = get_strings_similarity(data['title_x'], data['title_y'])
    author_similarities = get_strings_similarity(data['author_x'], data['author_y'])
    devsite_similarities = get_strings_similarity(data['devsite_x'], data['devsite_y'])

    # combine the description one by one
    x_y_descriptipns = [None]*(len(data['description_x'])+len(data['description_y']))
    x_y_descriptipns[::2] = data['description_x']
    x_y_descriptipns[1::2] = data['description_y']

    embeddings = get_tfidf_embeddings(x_y_descriptipns)

    description_similarities = get_description_similarity(embeddings)
    labels = [label] * len(data)

    data = list(zip(title_similarities, author_similarities, devsite_similarities, description_similarities, labels))
    columns = ['title_similarity', 'author_similarity', 'devsite_similarity', 'description_similarity', 'label']

    return pd.DataFrame(data=data, columns=columns)

In [9]:
%%time 

false_train_data_matched = get_matched_dataframe(false_train_data, label=0)
train_data_matched = get_matched_dataframe(matched_train_data, label=1)

CPU times: user 12min 5s, sys: 11.3 s, total: 12min 16s
Wall time: 12min 29s


In [10]:
X_train = pd.concat([false_train_data_matched, train_data_matched])
X_train = X_train.sample(frac=1)

In [11]:
false_train_data_matched.head()

Unnamed: 0,title_similarity,author_similarity,devsite_similarity,description_similarity,label
0,0.333333,0.315789,0.315789,0.027542,0
1,0.105263,0.0,0.0,0.066865,0
2,0.181818,0.266667,0.571429,0.037541,0
3,0.153846,0.285714,0.285714,0.025978,0
4,0.285714,0.307692,0.333333,0.003082,0


In [12]:
%%time 

false_test_data_matched = get_matched_dataframe(false_test_data, label=0)
test_data_matched = get_matched_dataframe(matched_test_data, label=1)

CPU times: user 1min 12s, sys: 1.43 s, total: 1min 13s
Wall time: 1min 16s


In [13]:
X_test = pd.concat([false_test_data_matched, test_data_matched])
X_test = X_test.sample(frac=1)

In [16]:
%%time
from xgboost import XGBClassifier

model = XGBClassifier()
model.fit(X_train.drop(columns=["label"]), X_train["label"])

CPU times: user 36.2 s, sys: 1.22 s, total: 37.5 s
Wall time: 5.67 s


XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.300000012, max_delta_step=0, max_depth=6,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=100, n_jobs=8, num_parallel_tree=1, random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, subsample=1,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [17]:
y_pred = model.predict(X_test.drop(columns=["label"]))



In [53]:
y_pred_sorted_labels_true_index = np.where(y_pred[y_pred == 1])[0]
y_test_sorted_labels_true_index = np.sort(np.array(X_test[X_test['label'] == 1].index))

In [19]:
from sklearn.metrics import balanced_accuracy_score
balanced_accuracy_score(X_test['label'], y_pred)

0.9988231445350284