# How many posts are identical or similar? #

Unfortunately, we can only work with a subset of the data due to resource limitations.

We use the first 30000 posts as an example.


In [1]:
import pandas as pd


In [None]:
# Read only the first 30000 rows; the full file is too large for the available resources.
df = pd.read_csv(
    'fbtrx_onlyEN.csv',
    nrows=30000,
)

## Easy enough: look at the postId ##

In [10]:
# Count the distinct postId values and report them as a share of all loaded posts.
n_posts = len(df)
n_unique_ids = len(df.postId.unique())
print("\n{} posts from a total of {} ({:.2f}%) are unique by postid".format(
    n_unique_ids, n_posts, n_unique_ids * 100 / n_posts))


28350 posts from a total of 30000 (94.50%) are unique by postid


## Does this match with the content itself? ##

In [11]:
# Count the distinct texts and report them as a share of all loaded posts.
n_posts = len(df)
n_unique_texts = len(df.concatenatedText.unique())
print("\n{} posts from a total of {} ({:.2f}%) are unique by content\n".format(
    n_unique_texts, n_posts, n_unique_texts * 100 / n_posts))


28087 posts from a total of 30000 (93.62%) are unique by content



## What about the little differences? ##

In [14]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import euclidean_distances
# Shared TF-IDF vectorizer (default settings) used by get_similarity_matrix below.
vectorizer = TfidfVectorizer()


def get_similarity_matrix(data):
    """Return the Euclidean distance from every text to every other text.

    The texts are read from ``data.concatenatedText`` and turned into TF-IDF
    vectors with the module-level ``vectorizer``, which is re-fitted on every
    call.  Despite the function name, the values are distances: smaller means
    more similar.

    The result holds one distance per pair of texts, so its size grows
    quadratically with the number of texts -- this is where the memory
    problem lies.
    """
    tfidf = vectorizer.fit_transform(data.concatenatedText)
    distances = euclidean_distances(tfidf)
    return pd.DataFrame(data=distances)


def transform_to_similar_indices(x, threshold):
    """Return the positions in ``x`` whose value is at most ``threshold``.

    ``x`` is an iterable of distances (e.g. one row of the distance matrix).
    The result is a list of 0-based positions in ascending order.
    """
    similar_positions = []
    for position, distance in enumerate(x):
        if distance <= threshold:
            similar_positions.append(position)
    return similar_positions


In [None]:
# Enrich the original data: for every post, store the indices of all posts
# whose distance to it is at most the chosen threshold of 0.5.
dist = get_similarity_matrix(df)
df['similars'] = dist.apply(transform_to_similar_indices, axis=1, args=(0.5,))
# Store the output so it does not have to be recalculated:
#df.to_csv('with_similarities3.csv', index=False)


### I prepared something to avoid recalculation ###

In [2]:
# reading already enriched dataset:

# need some converters to read that:
from ast import literal_eval
def converter(x):
    """Turn the text of a Python literal (e.g. ``"[1, 2]"``) back into the object it denotes."""
    parsed = literal_eval(x)
    return parsed
# Load the already enriched dataset, parsing the stored 'similars' text back into Python objects.
converters = dict(similars=converter)
df = pd.read_csv('with_similarities3.csv', converters=converters)

In [3]:
# Keep only the posts whose 'similars' list has more than 10 entries.

df.loc[df['similars'].map(len) > 10]


Unnamed: 0,ANGRY,HAHA,LIKE,LOVE,SAD,WOW,displaySource,fblinktype,id,images.count,...,permaLink,postId,publicationTime,source,sourceLink,timeline,user,concatenatedText,concatLanguage,similars
168,,,25.0,4.0,,,Jigsaw World Cruise,,06976ac0b2ce1a7fd4cf3a29a23e2839fa2d7d85,2,...,/ads/about,709117892630502,2017-10-09T14:37:02.000Z,Jigsaw World Cruise,https://apps.facebook.com/jigsawworldcruise/,pomelo-okra-milk,mushroom-curry-turnip,Let us take you on a luxurious trip around the...,en,"[168, 421, 1107, 1399, 2453, 7982, 8339, 8976,..."
219,,672.0,16000.0,881.0,,,Kialo,posts,084cb1b73b757f675a18fd420a5b7f7469a5ba58,1,...,/kialo/posts/1120121358118429,1120121358118429,2017-12-13T16:11:38.000Z,Kialo,https://www.facebook.com/kialo/,pistachio-pistachio-nachos,pear-dandelion-tamales,Sick of the internet shouting factory? Looking...,en,"[219, 1135, 1161, 1217, 1288, 1912, 2784, 3080..."
421,,,5.0,,,,Jigsaw World Cruise,,0ec10e1ab84f72c72b246e4c8e4912279519ef9b,2,...,/ads/about,709117929297165,2017-10-09T14:37:28.000Z,Jigsaw World Cruise,https://apps.facebook.com/jigsawworldcruise/,sandwich-hamburger-caramel,mushroom-curry-turnip,Let us take you on a luxurious trip around the...,en,"[168, 421, 1107, 1399, 2453, 7982, 8339, 8976,..."
898,,,,,,,Displate,photo,213d03d5371987405d189d3414e9627efd2d5907,1,...,/displate/photos/p.2209071655791513/2209071655...,2209085032456842,2018-09-24T14:58:35.000Z,Displate,https://www.facebook.com/displate/,waffle-turnip-shawarma,potato-macaroons-farfalle,We are the leading manufacturer of posters mad...,en,"[898, 1754, 5842, 8696, 10580, 10953, 14780, 1..."
1107,,,44.0,11.0,,,Jigsaw World Cruise,,298784eea97f17571f93c4d646289bb41e176fe4,2,...,,709117812630510,2017-10-09T14:37:02.000Z,Jigsaw World Cruise,https://apps.facebook.com/jigsawworldcruise/,asparagus-lime-manicotti,mushroom-curry-turnip,Let us take you on a luxurious trip around the...,en,"[168, 421, 1107, 1399, 2453, 7982, 8339, 8976,..."
1135,,430.0,9800.0,474.0,,,Kialo,,2a6e4065c5b3bd5d425fa135feef75b5eb135cdf,1,...,,1120121358118429,2018-05-14T07:47:20.000Z,Kialo,https://www.facebook.com/kialo/,pomelo-feta-asparagus,yams-avocado-berry,Sick of the internet shouting factory? Looking...,en,"[219, 1135, 1161, 1217, 1288, 1912, 2784, 3080..."
1161,,,,,,,Kialo,,2b5a6beea72efc0ccabcb0433b0c05559d65d6d6,2,...,/ads/about,1120121358118429,2017-12-13T16:11:38.000Z,Kialo,https://www.facebook.com/kialo/,taco-garbanzo-alfalfa,peach-date-thyme,Sick of the internet shouting factory? Kialo: ...,en,"[219, 1135, 1161, 1217, 1288, 1912, 2784, 3080..."
1217,,40.0,1000.0,62.0,,,Jacopo Lanza and Damiano Fraizzoli like Kialo.,,2d3edce837bc183afcbc9e2c9285348ea4d2e4ca,1,...,/ads/about,1120121358118429,2017-12-13T16:11:38.000Z,Jacopo Lanza,https://www.facebook.com/Jacopolanza,grapefruit-prune-waffle,radicchio-pistachio-pear,Sick of the internet shouting factory? Looking...,en,"[219, 1135, 1161, 1217, 1288, 1912, 2784, 3080..."
1288,,477.0,11000.0,585.0,,,Kialo,,2ff070b315708fcd193aafa5b664a9f5863024a7,1,...,/ads/about,1120121358118429,2017-12-13T16:11:38.000Z,Kialo,https://www.facebook.com/kialo/,blueberry-garbanzo-pomelo,kiwifruit-dandelion-biscotti,Sick of the internet shouting factory? Kialo: ...,en,"[219, 1135, 1161, 1217, 1288, 1912, 2784, 3080..."
1399,,,45.0,12.0,,,Jigsaw World Cruise,,34161f234c264a57138d3109794984290d2b83c0,2,...,/ads/about,709117812630510,2017-10-09T14:37:02.000Z,Jigsaw World Cruise,https://apps.facebook.com/jigsawworldcruise/,cinnamon-leek-basil,mushroom-curry-turnip,Let us take you on a luxurious trip around the...,en,"[168, 421, 1107, 1399, 2453, 7982, 8339, 8976,..."


In [4]:
# Let text columns be displayed up to 10000 characters wide so long posts are not cut off.
#pd.set_option("display.width",10000)
pd.options.display.max_colwidth = 10000


In [5]:
# Pick one post by position and show the id and text of every post listed as similar to it.
row = 3080
# Named `similar_rows` instead of `all`, so the built-in all() is not shadowed
# for the rest of the kernel session.
similar_rows = df.iloc[row]['similars']

# NOTE: `row` is looked up by position (.iloc) while the stored indices are
# used as labels (.loc); the two agree only while df keeps its default
# 0..n-1 index.
df.loc[similar_rows, ["postId", "concatenatedText"]]


Unnamed: 0,postId,concatenatedText
219,1120121358118429,"Sick of the internet shouting factory? Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate."
1135,1120121358118429,"Sick of the internet shouting factory? Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate. kialo.com"
1161,1120121358118429,"Sick of the internet shouting factory? Kialo: Thoughtful. Friendly. Debate. Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate."
1217,1120121358118429,"Sick of the internet shouting factory? Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate. kialo.com"
1288,1120121358118429,"Sick of the internet shouting factory? Kialo: Thoughtful. Friendly. Debate. Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate."
1912,1120121358118429,"Sick of the internet shouting factory? Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate. kialo.com"
2784,1120121358118429,"Sick of the internet shouting factory? Kialo: Thoughtful. Friendly. Debate. Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate."
3080,1120121358118429,"Sick of the internet shouting factory? Kialo: Thoughtful. Friendly. Debate. Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate."
3136,1120121358118429,"Sick of the internet shouting factory? Kialo: Thoughtful. Friendly. Debate. Looking for a more civil place to discuss the big issues? After five years of development, we welcome you to Kialo, a system designed for thoughtful debate."
3817,1120121358118429,Sick of the internet shouting factory? Kialo: Thoughtful. Friendly. Debate. Looking for a more civil place to discuss the big issues? After five years…
