In [2]:
import os
import csv
import json
import pandas as pd
from tqdm.notebook import tqdm

In [3]:
# Input locations: the tweet corpus (semicolon-separated CSV) and the folder
# holding one JSON file per user.
path_to_tweets = '../hierarchical_edge_bundling/tweets_marchandisation.csv'
path_to_users = r'D:\Alie\Documents\Projets\TwitterScripts\resources\following/'

# Load the corpus, then derive a copy ordered by average hash.
df = pd.read_csv(path_to_tweets, delimiter=';')
sorted_tweets = df.sort_values("average_hash")

In [5]:
def load_dict(path, reset=False):
    """Load a JSON file, or start from an empty dict.

    Args:
        path: Location of the JSON file to read.
        reset: When true, ignore any existing file and return an empty dict.

    Returns:
        The decoded JSON content, or ``{}`` when ``reset`` is true or ``path``
        does not exist.
    """
    if reset or not os.path.exists(path):
        return {}
    with open(path, 'r', encoding='utf-8') as handle:
        content = json.load(handle)
    return content

In [6]:
# Reload previously computed link dictionaries. The file names carry the
# ".json" extension used when the dictionaries are dumped to disk; without it
# os.path.exists() is false and load_dict silently returns {} for all three.
follow_links, share_links, rt_links = [
    load_dict(f"{name}.json")
    for name in ("follow_links", "share_links", "rt_links")
]


In [14]:
# Multiplier applied to each edge's raw weight, per link type.
weights = {
    "follow": 0.1,
    "retweet": 0.1,
    "share": 1
}

def dico_to_csv(follow=False, retweet=False, share=False):
    """Write the given link dictionaries to one semicolon-separated edge file.

    Each argument is either falsy (that link type is left out) or a dict whose
    keys are "user1-user2" strings and whose values are dicts with the keys
    'weight', 'average_hash', 'sha1' and 'category'.

    The output file is created in the current working directory. Its name
    records, for each link type, the weight multiplier used or 'null' when the
    type was left out.

    Raises:
        ValueError: if a key does not split into exactly two parts on '-'.
    """
    link_dicts = {'follow': follow, 'retweet': retweet, 'share': share}

    # Collect the rows in a list and join once, rather than growing a string.
    rows = ["Source;Target;LinkType;Weight;AverageHash;Sha1;Category"]
    for link_type, dic in link_dicts.items():
        if not dic:
            continue
        factor = weights[link_type]
        for key, value in dic.items():
            user_1, user_2 = key.split('-')
            # No space after the last separator, so Category is delimited like
            # every other column.
            rows.append(
                f"{user_1};{user_2};{link_type};{value['weight'] * factor};"
                f"{value['average_hash']};{value['sha1']};{value['category']}"
            )

    filename = (
        f"follow_{'null' if not follow else weights['follow']}"
        f"_share_{'null' if not share else weights['share']}"
        f"_retweet_{'null' if not retweet else weights['retweet']}.csv"
    )
    with open(filename, 'w', encoding='utf-8') as f:
        f.write("\n".join(rows) + "\n")

In [16]:
# Export the three link types together into a single edge file.
dico_to_csv(
    follow=follow_links,
    retweet=rt_links,
    share=share_links,
)

### Fonctions pour calculer les dictionnaires de liens. Une fois calculés, ces dictionnaires sont stockés sur disque.

In [8]:
def add_edge(dico, key, sha1=None, average_hash=None, category=None):
    """Count one more occurrence of the edge ``key`` in ``dico``.

    The first time ``key`` is seen, an entry is created holding the given
    ``sha1``, ``average_hash`` and ``category`` with a weight of 0. On later
    calls those three fields are left as first recorded. In every case the
    entry's weight is then increased by 1. ``dico`` is modified in place.
    """
    entry = dico.setdefault(
        key,
        {"weight": 0, "sha1": sha1, "average_hash": average_hash, "category": category},
    )
    entry["weight"] += 1

In [9]:
# NOTE(review): add_edge increments weights in the dictionaries it is given, so
# running this cell again on already-populated rt_links / share_links /
# follow_links adds to the existing counts.

# Users whose follow file has already been processed (set: O(1) membership test).
users_parsed = set()

# Every user name present in the corpus, computed once instead of once per
# entry of every follow file.
corpus_users = set(df['from_user_name'])

# Average hash of the first row in sorted order. `.iloc[0]` is positional;
# `["average_hash"][0]` would look up index label 0, which after sort_values
# is not necessarily the first row.
current_ahash = sorted_tweets["average_hash"].iloc[0] if len(sorted_tweets) else None

# [user_name, sha1, text] entries for the tweets already seen with the current
# average hash; the row being processed is compared against each of them.
current_users = []

# Wrapping the iterator lets the progress bar advance on skipped rows as well.
for _, row in tqdm(sorted_tweets.iterrows(), total=sorted_tweets.shape[0]):
    user_name = row['from_user_name']
    # An "error-<user>.json" file marks a user whose follow list could not be
    # retrieved: such users are ignored entirely.
    if os.path.exists(path_to_users + "error-" + user_name + '.json'):
        continue
    ahash = row['average_hash']
    sha1 = row['sha1']
    text = row['real_text']
    category = row['category']

    # --- share / retweet links ---
    if ahash == current_ahash:
        # Compare with every earlier tweet that has the same average hash.
        for current_user, current_sha1, current_text in current_users:
            # Only pairs of distinct users produce a link.
            if current_user != user_name:
                key = "-".join(sorted([current_user, user_name]))
                # Same sha1 AND at least one of the two texts does not start
                # with 'RT' (i.e. is the original tweet): retweet link.
                if current_sha1 == sha1 and (
                    not current_text.startswith('RT') or not text.startswith('RT')
                ):
                    add_edge(rt_links, key, sha1, ahash, category)
                # Otherwise: share link.
                else:
                    add_edge(share_links, key, average_hash=ahash, category=category)
    else:
        # New average hash: start a fresh comparison list.
        current_users = []
        current_ahash = ahash
    # In every case the current tweet joins the list for its average hash.
    current_users.append([user_name, sha1, text])

    # --- follow links ---
    if user_name not in users_parsed:
        with open(path_to_users + user_name + '.json', 'r', encoding='utf-8') as f:
            # Keep only the listed accounts that also appear in the corpus.
            linked_users = {
                user['username'] for user in json.load(f)
                if user['username'] in corpus_users
            }
        # One follow edge between the current user and each of those accounts.
        for linked_user in linked_users:
            add_edge(follow_links, "-".join(sorted([user_name, linked_user])))
        users_parsed.add(user_name)


  0%|          | 0/1986 [00:00<?, ?it/s]

toto
toto


In [11]:
# Persist each link dictionary to its own JSON file.
for json_name, links in (
    ('share_links.json', share_links),
    ('rt_links.json', rt_links),
    ('follow_links.json', follow_links),
):
    with open(json_name, 'w', encoding='utf-8') as out:
        json.dump(links, out)

In [5]:
# NOTE(review): `edges` is not assigned at notebook level in any cell visible
# here (it only exists as a local variable inside dico_to_csv), so unless it is
# defined elsewhere this cell fails with NameError on a fresh kernel. It looks
# like a leftover from an earlier version of the export — confirm, then remove
# it or replace it with a dico_to_csv(...) call.

# with open(f"follows_only.csv", 'w', encoding='utf-8') as f:
#     f.write(edges)

with open(f"follows_and_retweets.csv", 'w', encoding='utf-8') as f:
    f.write(edges)

# with open(f"share_only.csv", 'w', encoding='utf-8') as f:
#     f.write(edges)



In [34]:
# Write the .csv
# NOTE(review): `edges`, `follow_weight`, `hash_weight` and `retweet_weight`
# are not assigned at notebook level in any cell visible here, so unless they
# are defined elsewhere this cell fails with NameError on a fresh kernel.
# Presumably superseded by dico_to_csv, which builds a similar file name from
# `weights` — confirm before removing.

with open(f"fol_{follow_weight}_ahash_{hash_weight}_retweet_{retweet_weight}.csv", 'w', encoding='utf-8') as f:
    f.write(edges)

In [22]:
# Check that a user file was collected for every user of the corpus, and record
# the user name -> file id association in `mapping`.

invalids = []
with open(r'D:\Alie\Documents\Projets\TwitterScripts\resources\mapping_names_to_ids.json', 'r') as f:
    mapping = json.load(f)

# Read each file of the directory once and index it by the username it holds,
# instead of re-reading the whole directory for every user of the corpus.
ids_by_username = {}
for user_file in os.listdir(path_to_users):
    with open(os.path.join(path_to_users, user_file), 'r', encoding='utf-8') as f:
        user = json.load(f)
    # Files whose content is a list are skipped; only the others are read for
    # a 'username'.
    if isinstance(user, list):
        continue
    # setdefault keeps the first file found for a username, in listing order.
    ids_by_username.setdefault(user['username'], user_file.split('.json')[0])

# `sorted_tweets` is the corpus DataFrame; the bare name `sorted` is the
# Python builtin and has no `from_user_name` attribute.
for user_name in sorted_tweets.from_user_name.unique():
    user_id = ids_by_username.get(user_name)
    if not user_id:
        print(f"User {user_name} not found")
        invalids.append(user_name)
    else:
        mapping[user_name] = user_id
    

User Djokkor not found
User LN_Hec not found
User mfgiraud not found
User GennoEmma not found
User kasteel_mc not found
User hugetransmess not found
User VotezPoisson not found
User JaiRaison not found
User SeguinHugues not found
User Brunnin19872015 not found
User Baptiste_6_2 not found
User mg12gm not found
User Civitas_ not found
User Chezladuparc not found
User M_LE_GAULOIS not found
User lefigarolaune not found
User TissotOlivier not found
User JenRavin not found
User xarratt not found
User OnarriveMarie not found
User Foulisou not found
User Choukker not found
User PGannat not found
User wvonpikkendorf not found
User abbealdric not found
User jeanclaudepeyra not found
User Fillon2017Rueil not found
User Valeurs not found
User lollimeur not found
User Frugheon_D not found
User ma_france4 not found
User LouiseM71493518 not found
User dede17_86 not found
User hamalus not found
User AberlinJacklyne not found
User LeGoulu not found
User LubacGenevieve not found
User leborgn90181021 no

In [25]:
# Save the user name -> id mapping as JSON.
mapping_file = r'D:\Alie\Documents\Projets\TwitterScripts\resources\mapping_names_to_ids.json'
with open(mapping_file, 'w') as out:
    json.dump(mapping, out)