In [2]:
import pandas as pd
import numpy as np
import Levenshtein as lev
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics import balanced_accuracy_score
from xgboost import XGBClassifier

In [3]:
matched_train_data = pd.read_csv("../data/preprocessed/matched_train_data.csv", low_memory=False, lineterminator='\n')
matched_train_data = matched_train_data.dropna(subset=["id_y"]).reset_index()

false_train_data = pd.read_csv("../data/preprocessed/false_train_data.csv", low_memory=False, lineterminator='\n')
false_train_data = false_train_data.dropna(subset=["id_y"]).reset_index()

matched_test_data = pd.read_csv("../data/preprocessed/matched_test_data.csv", low_memory=False, lineterminator='\n')
matched_test_data = matched_test_data.dropna(subset=["id_y"]).reset_index()

false_test_data = pd.read_csv("../data/preprocessed/false_test_data.csv", low_memory=False, lineterminator='\n')
false_test_data = false_test_data.dropna(subset=["id_y"]).reset_index()

crossed_all_data = pd.read_csv("../data/preprocessed/crossed_all_data.csv", low_memory=False, lineterminator='\n')
crossed_all_data = crossed_all_data.dropna(subset=["id_y"]).reset_index()

In [4]:
matched_train_data = matched_train_data.fillna('')
false_train_data = false_train_data.fillna('')
matched_test_data = matched_test_data.fillna('')
false_test_data = false_test_data.fillna('')
crossed_all_data = crossed_all_data.fillna('')

In [5]:
print(matched_train_data.shape)
print(false_train_data.shape)

(62095, 19)
(155285, 15)


In [6]:
print(matched_test_data.shape)
print(false_test_data.shape)

(16864, 19)
(38330, 15)


In [7]:
def get_strings_similarity(google_strings, apple_strings):
    similarities = []

    for string_index in range(len(google_strings)):
        google_string, apple_string = google_strings[string_index], apple_strings[string_index]
        
        if not isinstance(google_string, str) or not isinstance(apple_string, str):
            similarities.append(0)
            continue
        
        similarities.append(lev.ratio(google_string, apple_string))
    
    return similarities

In [8]:
def get_tfidf_embeddings(documents):
    vectorizer = TfidfVectorizer(lowercase=False)

    return vectorizer.fit_transform(documents)

In [9]:
from scipy import spatial

def get_description_similarity(embeddings):
    similarities = []

    for embeddings_index in range(0, embeddings.shape[0], 2):
        similarities.append(1 - spatial.distance.cosine(embeddings[embeddings_index].toarray().flatten(), embeddings[embeddings_index + 1].toarray().flatten()))
    
    return similarities

In [10]:
def get_matched_dataframe(data, label):
    title_similarities = get_strings_similarity(data['title_x'], data['title_y'])
    author_similarities = get_strings_similarity(data['author_x'], data['author_y'])
    devsite_similarities = get_strings_similarity(data['devsite_x'], data['devsite_y'])

    # combine the description one by one
    x_y_descriptipns = [None]*(len(data['description_x'])+len(data['description_y']))
    x_y_descriptipns[::2] = data['description_x']
    x_y_descriptipns[1::2] = data['description_y']

    embeddings = get_tfidf_embeddings(x_y_descriptipns)

    description_similarities = get_description_similarity(embeddings)
    labels = [label] * len(data)

    data = list(zip(title_similarities, author_similarities, devsite_similarities, description_similarities, labels))
    columns = ['title_similarity', 'author_similarity', 'devsite_similarity', 'description_similarity', 'label']

    return pd.DataFrame(data=data, columns=columns)

In [10]:
%%time 

false_train_data_matched = get_matched_dataframe(false_train_data, label=0)
train_data_matched = get_matched_dataframe(matched_train_data, label=1)

CPU times: user 44min 23s, sys: 39.9 s, total: 45min 3s
Wall time: 45min 39s


In [15]:
X_train = pd.concat([false_train_data_matched, train_data_matched])
X_train = X_train.sample(frac=1)

X_train.shape

(682455, 5)

In [12]:
%%time 

false_test_data_matched = get_matched_dataframe(false_test_data, label=0)
test_data_matched = get_matched_dataframe(matched_test_data, label=1)

  dist = 1.0 - uv / np.sqrt(uu * vv)
CPU times: user 3min 35s, sys: 14.4 s, total: 3min 50s
Wall time: 3min 53s


In [14]:
X_test = pd.concat([false_test_data_matched, test_data_matched])
X_test = X_test.sample(frac=1)

X_test.shape

(170154, 5)

In [103]:
%%time

def get_predictions(X, X_t, labels_to_drop):
    model = XGBClassifier()
    model.fit(X.drop(columns=labels_to_drop), X["label"])
    
    y_pred = model.predict(X_t.drop(columns=labels_to_drop))

    return y_pred

CPU times: user 12 µs, sys: 0 ns, total: 12 µs
Wall time: 15 µs


In [48]:
soft_pred = get_predictions(X_train, X_test, ["label"])
average_precision = balanced_accuracy_score(X_test['label'], soft_pred)
average_precision



0.998719267102962

In [1]:
def get_hard_data(data):
    data_sum = data.copy()
    data_sum['sum'] = data_sum.drop(columns=["label"]).sum(axis=1)
    data_sum_top_25 = data_sum.nlargest(int(data.shape[0] * 0.25), 'sum')
    print(data_sum_top_25["sum"].mean(), data.drop(columns=["label"]).sum(axis=1).mean())
    return data_sum_top_25

In [2]:
false_train_data_matched_hard = get_hard_data(false_train_data_matched)
false_test_data_matched_hard = get_hard_data(false_test_data_matched)

NameError: name 'false_train_data_matched' is not defined

In [60]:
def concat_df(first, second):
    conc = pd.concat([first, second])
    conc = conc.sample(frac=1)

    return conc.reset_index()

In [61]:
X_train_hard = concat_df(false_train_data_matched_hard, train_data_matched)
X_test_hard = concat_df(false_test_data_matched_hard, test_data_matched)

In [7]:
X_train_hard[X_train_hard["label"] == 0].sort_values(by=["sum"], ascending=False).head(50)

NameError: name 'X_train_hard' is not defined

In [53]:
print(X_train_hard.shape)
print(X_test_hard.shape)

(217140, 6)
(55104, 6)


In [55]:
hard_pred = get_predictions(X_train_hard, X_test_hard, ["label", "sum"])
average_precision = balanced_accuracy_score(X_test_hard["label"], hard_pred)
average_precision



0.9982792410273427

In [106]:
from itertools import chain, combinations

features = ["title_similarity", "author_similarity", "devsite_similarity", "description_similarity"]

def powerset(iterable):
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

all_features_combinations = list(powerset(features))

for combination in all_features_combinations:
    l = list(combination)
    l.append("label")
    print(f'LABELS --> {l}')
    if(len(l) > 1):
        soft_pred = get_predictions(X_train[l], X_test[l], ["label"])
        average_precision = balanced_accuracy_score(X_test['label'], soft_pred)
        print(f'SCORE --> {average_precision}')

LABELS --> ['label']
LABELS --> ['title_similarity', 'label']
SCORE --> 0.9762907120275155
LABELS --> ['author_similarity', 'label']
SCORE --> 0.8650000871573762
LABELS --> ['devsite_similarity', 'label']
SCORE --> 0.8883062201262257
LABELS --> ['description_similarity', 'label']
SCORE --> 0.9860792110358669
LABELS --> ['title_similarity', 'author_similarity', 'label']
SCORE --> 0.9888927639767358
LABELS --> ['title_similarity', 'devsite_similarity', 'label']
SCORE --> 0.9914831485104104
LABELS --> ['title_similarity', 'description_similarity', 'label']
SCORE --> 0.9984298925520754
LABELS --> ['author_similarity', 'devsite_similarity', 'label']
SCORE --> 0.9533695870405697
LABELS --> ['author_similarity', 'description_similarity', 'label']
SCORE --> 0.9952446698206223
LABELS --> ['devsite_similarity', 'description_similarity', 'label']
SCORE --> 0.9941013424726143
LABELS --> ['title_similarity', 'author_similarity', 'devsite_similarity', 'label']
SCORE --> 0.9938347840886117
LABELS -->

In [80]:
X_train[["title_similarity", "label"]]

Unnamed: 0,title_similarity,label
549603,0.250000,0
22648,0.307692,0
539416,0.250000,0
287748,0.181818,0
492207,0.000000,0
...,...,...
438828,0.250000,0
495538,0.181818,0
133185,0.285714,0
405502,0.000000,0


In [11]:
%%time 

false_train_data_matched = get_matched_dataframe(false_train_data, label=0)
train_data_matched = get_matched_dataframe(matched_train_data, label=1)

  dist = 1.0 - uv / np.sqrt(uu * vv)
CPU times: user 13min 27s, sys: 17.8 s, total: 13min 45s
Wall time: 14min


In [1]:
false_train_data["label"] = 0
false_train_data

NameError: name 'false_train_data' is not defined

In [12]:
X_train = pd.concat([false_train_data_matched, train_data_matched])
X_train = X_train.sample(frac=1)

X_train.shape

(217380, 5)

In [13]:
%%time 

false_test_data_matched = get_matched_dataframe(false_test_data, label=0)
test_data_matched = get_matched_dataframe(matched_test_data, label=1)

CPU times: user 1min 6s, sys: 5.52 s, total: 1min 12s
Wall time: 1min 12s


In [43]:
X_test = pd.concat([false_test_data_matched, test_data_matched])
X_test = X_test.sample(frac=1)

X_test.shape

(55194, 5)

In [46]:
X_test.iloc[1401]

title_similarity          0.222222
author_similarity         0.333333
devsite_similarity        0.000000
description_similarity    0.076723
label                     0.000000
Name: 4702, dtype: float64

In [48]:
model = XGBClassifier()
model.fit(X_train.drop(columns=["label"]), X_train["label"])
    
y_pred = model.predict(X_test.drop(columns=["label"]))



In [49]:
np.where(y_pred == 1)[0][:20]

array([ 0,  4,  5,  9, 15, 26, 29, 30, 33, 35, 36, 42, 45, 48, 50, 55, 61,
       62, 65, 73])

In [2]:
match = matched_test_data.iloc[45]
match

NameError: name 'matched_test_data' is not defined

In [None]:
%%time 

crossed_all_data_matched = get_matched_dataframe(crossed_all_data, label=0)
crossed_all_data_matched

In [17]:
y_pred_crossed = model.predict(crossed_all_data_matched.drop(columns=["label"]))



In [57]:
np.where(y_pred_crossed == 1)[0].shape

(611,)

In [4]:
np.where(y_pred_crossed == 1) 

NameError: name 'np' is not defined

In [75]:
# 422583
# 883787
# 894341

# Check the score for each match
# Short string match should get low score
# Check the devsite histogram
crossed_all_data_matched[crossed_all_data_matched.index == 883787]

Unnamed: 0,title_similarity,author_similarity,devsite_similarity,description_similarity,label
422583,1.0,0.5,0.181818,0.091055,0


In [76]:
xasd = crossed_all_data[crossed_all_data.index == 883787]
xasd

Unnamed: 0,index,id_x,store_x,apple_maincategory_x,google_maincategory_x,title_x,author_x,devsite_x,description_x,id_y,store_y,apple_maincategory_y,google_maincategory_y,title_y,author_y,devsite_y,description_y
422583,422583,howto.become.famous.se,0,,Lifestyle,how,solution,easyloveyourself,how to become famous a quick guide at one tim...,1111808667,1,6017,,how,toan,fewfew,wanna draw your favorite characters the easy w...


In [69]:
crossed_all_data_matched['description_similarity'] = crossed_all_data_matched['description_similarity'].fillna(0)
crossed_all_data_matched['description_similarity'].isnull().any()

False

In [70]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification

pca = PCA(2)
X = crossed_all_data_matched.drop(columns=["label"])
X_transformed = pca.fit_transform(X)
eigenvalues = pca.explained_variance_
print(eigenvalues)

[0.02313763 0.0168818 ]


In [5]:
import matplotlib.pyplot as plt
plt.scatter(X_transformed[:,0],X_transformed[:,1])
plt.show()

NameError: name 'X_transformed' is not defined

In [73]:
import numpy as np
from sklearn.decomposition import PCA
from sklearn.datasets import make_classification

pca = PCA(2)
X = X_train.drop(columns=["label"])
X_transformed = pca.fit_transform(X)
eigenvalues = pca.explained_variance_
print(eigenvalues)

[0.44893883 0.03946634]


In [6]:
plt.scatter(X_transformed[:,0],X_transformed[:,1])
plt.show()

NameError: name 'X_transformed' is not defined