In [46]:
import pandas as pd
import regex as re
from collections import Counter

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

## 1. Retrieval of the pickled normalized dataframe

In [34]:
# Load the pickled, normalized dataframe from the analysis directory.
# NOTE: unpickling can execute arbitrary code; only load files you trust.
df_normalized = pd.read_pickle(
    "../analysis/cvs_dataframe_normalized.pkl"
)

## 2. Vectorization

We use scikit-learn's `CountVectorizer`.

In [35]:
# Word-level count vectorizer over unigrams.
# The custom token pattern keeps only tokens of at least two word characters
# that are not digits (so numbers and single characters are dropped).
# Every other argument is passed explicitly; they appear to match
# scikit-learn's defaults -- confirm against the installed version.
cv = CountVectorizer(
    input='content',
    encoding='utf-8',
    decode_error='strict',
    strip_accents=None,
    lowercase=True,
    preprocessor=None,
    tokenizer=None,
    stop_words=None,
    token_pattern='(?u)\\b[^\\d\\W]{2,}\\b',
    ngram_range=(1, 1),
    analyzer='word',
    max_df=1.0,
    min_df=1,
    max_features=None,
    vocabulary=None,
    binary=False,
)

In [36]:
# Learn the vocabulary from the `no_stopwords` column.
cv.fit(raw_documents=df_normalized["no_stopwords"])

In [37]:
# Encode every document as a row of token counts over the fitted vocabulary.
vt_cv = cv.transform(raw_documents=df_normalized["no_stopwords"])

In [38]:
# Shape of the document-term matrix: (number of documents, vocabulary size).
vt_cv.shape

(1468, 37541)

## 3. Cosine similarity

In [39]:
# Pairwise cosine similarity between all document vectors: entry (i, j) is the
# similarity between document i and document j. The resulting frame uses the
# default integer labels on both axes.
pairwise_similarity = cosine_similarity(vt_cv, vt_cv)
df_cos_sim = pd.DataFrame(pairwise_similarity, dtype='float')
df_cos_sim.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1458,1459,1460,1461,1462,1463,1464,1465,1466,1467
0,1.0,0.298074,0.335854,0.218218,0.144781,0.266147,0.303577,0.310827,0.566764,0.179118,...,0.598634,0.303778,0.170447,0.162174,0.218598,0.239884,0.140732,0.259555,0.305388,0.409926
1,0.298074,1.0,0.177055,0.181642,0.105528,0.160715,0.210404,0.174581,0.347288,0.104139,...,0.400575,0.248123,0.117146,0.143897,0.170492,0.179439,0.149619,0.181332,0.20623,0.223962
2,0.335854,0.177055,1.0,0.116041,0.190801,0.28949,0.283694,0.307405,0.398697,0.350608,...,0.512819,0.481551,0.092877,0.406555,0.342361,0.285292,0.198526,0.211235,0.404495,0.624232
3,0.218218,0.181642,0.116041,1.0,0.216349,0.180174,0.163666,0.106496,0.232936,0.199342,...,0.308515,0.158592,0.12536,0.106168,0.178392,0.133697,0.232887,0.113771,0.120199,0.181448
4,0.144781,0.105528,0.190801,0.216349,1.0,0.300049,0.260699,0.171459,0.169161,0.304203,...,0.210223,0.169014,0.105704,0.111087,0.409898,0.099738,0.252828,0.195572,0.251753,0.201822
5,0.266147,0.160715,0.28949,0.180174,0.300049,1.0,0.25859,0.217339,0.31283,0.288196,...,0.324966,0.252893,0.157438,0.266864,0.383036,0.199504,0.224077,0.220611,0.343652,0.255767
6,0.303577,0.210404,0.283694,0.163666,0.260699,0.25859,1.0,0.196631,0.337277,0.246131,...,0.34194,0.306517,0.118072,0.185706,0.293371,0.179004,0.212385,0.245241,0.288678,0.309087
7,0.310827,0.174581,0.307405,0.106496,0.171459,0.217339,0.196631,1.0,0.338769,0.191791,...,0.383836,0.274102,0.099999,0.275852,0.239486,0.328765,0.215225,0.188353,0.273617,0.283459
8,0.566764,0.347288,0.398697,0.232936,0.169161,0.31283,0.337277,0.338769,1.0,0.204992,...,0.664261,0.413338,0.172847,0.245617,0.250523,0.31065,0.166499,0.267786,0.360985,0.467658
9,0.179118,0.104139,0.350608,0.199342,0.304203,0.288196,0.246131,0.191791,0.204992,1.0,...,0.318288,0.302296,0.13535,0.253737,0.407128,0.180928,0.26342,0.161303,0.260783,0.287734


We filter out the values < 0.4, keeping only the document pairs whose cosine similarity is at least 0.4:

In [40]:
# Boolean mask (True where similarity >= 0.4), stacked into a Series indexed
# by (row_id, column_id) pairs.
df_cos_sim_s = df_cos_sim.ge(0.4).stack()


We then create a list of coordinates (document_a_id, document_b_id), removing self-pairs (a document compared with itself) and mirrored duplicates (document_b_id, document_a_id):

In [41]:
# Index entries of the stacked mask where it is True, i.e. every
# (row_id, column_id) pair that passed the similarity threshold.
coordinates = list(df_cos_sim_s[df_cos_sim_s].index)

# Keep only pairs with row_id < column_id: this drops the diagonal (a document
# paired with itself) and one of each pair of mirrored entries.
# The filter must iterate over `coordinates`; iterating over `coord_undupled`
# itself fails on a fresh kernel because the name is not defined yet.
coord_undupled = [pair for pair in coordinates if pair[0] < pair[1]]

We map this list to a list of tuples of statuses of the documents:

In [43]:
# The ids in `coord_undupled` are positions in the similarity matrix, which
# follows the row order of `df_normalized` rather than its index labels, so the
# statuses are looked up by position. Extracting the column once also avoids
# building a full row Series for every single lookup.
statuses = df_normalized["status"].tolist()
status_list = [(statuses[coord_a], statuses[coord_b]) for (coord_a, coord_b) in coord_undupled]

And count how many times each status tuple occurs:

In [47]:
# Tally how many similar pairs fall under each (status_a, status_b) combination.
count_status = Counter()
count_status.update(status_list)
print(count_status)

Counter({('bad', 'bad'): 64577, ('bad', 'good'): 6294, ('good', 'bad'): 6086, ('good', 'good'): 3703})


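The difference in favor of the ('bad', 'bad') tuples is striking. Fake CVs are similar to one another far more often than legitimate CVs are similar to fake ones or to each other. This points to a pattern in the vocabulary used in the fake documents. Note, however, that these are raw counts: they are not normalized by the number of possible pairs in each class, so part of the gap may simply reflect how many fake documents the corpus contains.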