## Temporal Data Skeleton Analysis

Objective: create a Gephi graph to visualize the occurrences of every image in the corpus over the duration of the controversy.
Examples of such graphs can be found here: https://ateliercartographie.wordpress.com/2016/08/17/temporal-data-skeletons/

In [40]:
import os
import pandas as pd
from datetime import date, timedelta


### Load images that appear more than 3 times in the corpus

In [5]:
# Folder holding the input CSV; the generated nodes.csv / edges.csv for the
# sha1 graph are written here as well.
# The original location stays the default, but PMA_RESOURCES_DIR can override
# it so the notebook runs on another machine without editing this cell.
path = os.environ.get(
    'PMA_RESOURCES_DIR',
    r'C:\Users\Orion\Documents\Projets\CERES\ScrappingTwitter\resources',
)
# Columns used further down: sha1, created_at, from_user_name
df = pd.read_csv(os.path.join(path, 'pma_images_multiple_occurences.csv'), header=0, sep=',')
df

Unnamed: 0,sha1,created_at,from_user_name
0,ffff9b90e8dce13857e3bef7389e50ed5eb648a7,2019-09-26 13:49:23,josephinemd34
1,ffff9b90e8dce13857e3bef7389e50ed5eb648a7,2019-09-27 06:31:17,ClPeqt
2,ffff9b90e8dce13857e3bef7389e50ed5eb648a7,2019-09-26 17:46:36,JulietteLauzza
3,ffff9b90e8dce13857e3bef7389e50ed5eb648a7,2019-09-26 13:45:58,MaKarDoue
4,ffff9b90e8dce13857e3bef7389e50ed5eb648a7,2019-09-27 07:27:31,06aime
...,...,...,...
261007,0008255f72057dbf97e229199e6595e2deebecfd,2019-10-06 10:36:34,O_de_Clisson
261008,0008255f72057dbf97e229199e6595e2deebecfd,2019-10-06 10:30:59,ManifPourTous89
261009,0008255f72057dbf97e229199e6595e2deebecfd,2019-10-06 11:22:15,vaubandaniel
261010,0008255f72057dbf97e229199e6595e2deebecfd,2019-10-06 10:32:13,albdmt


### Order values by date

In [10]:
# Reduce every timestamp to its calendar day (the time of day is dropped),
# then put the rows in chronological order.
df = (
    df
    .assign(created_at=pd.to_datetime(df["created_at"]).dt.date)
    .sort_values(by="created_at")
)

Unnamed: 0,sha1,created_at,from_user_name
14865,ed732ab07f2cbb0ec7a030bf2e5aaa3e2c0089c5,2019-08-28,AlaouiMoualaoui
97304,9ec90ff173cacf392f8f03f33fe3f2cf7061d618,2019-08-28,Arnaud_38000
48779,cf44e461bcfb2d6fb50b8454f6c7881377727443,2019-08-28,AllianceVita34
124032,838c857d07ee0dce852ca22c2555e2fdeeb3c8b2,2019-08-28,JoycenoaNoa
197820,3d007650564cee0febd894fc958409803b689a2c,2019-08-28,BeaulieuBap
...,...,...,...
51219,cc0417ee078d6b661b7db5e4f8fd44969bea6c2d,2021-08-21,RDelinfo
117080,8a7232c3f9b1649d947b5fc565d11ed0705b12b5,2021-08-23,rotreuil
66252,bb5ba917fe22b4e6e6a2458c00106007f9c94705,2021-08-26,le_stoique
144527,7009158eae27715fdd1f20e0db156cfbca589635,2021-08-27,Fests_infobot


### Build the skeleton from df

In [None]:

# Print every calendar day covered by the corpus, one per line.
# df is sorted by created_at, so its first and last rows bound the period.
delta = timedelta(days=1)
start_date, end_date = df['created_at'].iloc[0], df['created_at'].iloc[-1]
while not start_date > end_date:
    print(f"{start_date:%Y-%m-%d}")
    start_date = start_date + delta

In [31]:
# Per-image row counts for the first day of the (sorted) corpus
res = (
    df[df['created_at'].eq(df['created_at'].iloc[0])]
    .groupby('sha1', as_index=False)
    .count()
)

In [45]:
# Every occurrence of one specific image, to inspect how it spreads over time
df[df['sha1'].eq('3e78381d49834fac28ff3e0d541341227b887592')]

Unnamed: 0,sha1,created_at,from_user_name
192653,3e78381d49834fac28ff3e0d541341227b887592,2020-04-28,____Sha_____
194085,3e78381d49834fac28ff3e0d541341227b887592,2020-04-28,viiesurnous
193269,3e78381d49834fac28ff3e0d541341227b887592,2020-04-28,celia_rfd
192128,3e78381d49834fac28ff3e0d541341227b887592,2020-04-28,LapointeBoumed1
193268,3e78381d49834fac28ff3e0d541341227b887592,2020-04-28,MrDridrix
...,...,...,...
194506,3e78381d49834fac28ff3e0d541341227b887592,2020-05-30,ibrahim_cr678
193718,3e78381d49834fac28ff3e0d541341227b887592,2020-06-07,andromed_a_
193843,3e78381d49834fac28ff3e0d541341227b887592,2020-06-12,slytherinzone
194277,3e78381d49834fac28ff3e0d541341227b887592,2020-09-16,audic_212


In [49]:
def build_skeletton(df, field='sha1'):
    """Build the Gephi node and edge tables of a temporal data skeleton.

    Every calendar day between the earliest and the latest ``created_at`` gets
    a ``Date`` node, consecutive days are chained by ``Skeletton`` edges, and
    every distinct value of ``field`` gets a ``Node`` node linked to each day
    on which it occurs, weighted by its number of occurrences that day.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``created_at`` column (anything ``pd.to_datetime``
        accepts) and the ``field`` column. The frame is not modified.
    field : str, default 'sha1'
        Column whose values become the non-date nodes.

    Returns
    -------
    tuple of (str, str)
        ``(nodes, edges)`` as semicolon-separated text, header line included,
        without a trailing newline. Only the two header lines are returned
        when ``df`` holds no row with a usable date.
    """
    nodes = ["Id;Label;Type"]
    edges = ["Source;Target;Weight;Label"]
    default_weight = 100

    if df.empty:
        return nodes[0], edges[0]

    # Work on a private two-column frame so the caller's dataframe keeps its
    # original created_at values; normalize() drops the time of day.
    occurrences = pd.DataFrame({
        "day": pd.to_datetime(df["created_at"]).dt.normalize().reset_index(drop=True),
        "value": df[field].reset_index(drop=True),
    }).dropna(subset=["day"])
    if occurrences.empty:
        return nodes[0], edges[0]

    # Count everything in a single pass instead of re-filtering the frame for
    # each day. groupby sorts its keys, so values come out ordered within a day.
    counts = occurrences.groupby(["day", "value"]).size()
    values_by_day = {}
    for (day, value), count in counts.items():
        values_by_day.setdefault(day, []).append((value, count))

    nodes_dic = {}  # field value -> node id, so a value is declared only once
    prev_id = None
    all_days = pd.date_range(occurrences["day"].min(), occurrences["day"].max(), freq="D")
    for index_days, day in enumerate(all_days):
        id_current = f"d{index_days}"
        nodes.append(f"{id_current};{day.strftime('%Y-%m-%d')};Date")
        # chain the current day to the previous one (none for the first day)
        if prev_id is not None:
            edges.append(f"{prev_id};{id_current};{default_weight};Skeletton")
        for field_value, count in values_by_day.get(day, ()):
            id_field = nodes_dic.get(field_value)
            if id_field is None:
                id_field = f"s{len(nodes_dic)}"
                nodes.append(f"{id_field};{field_value};Node")
                nodes_dic[field_value] = id_field
            # edge weight = number of occurrences of the value on that day
            edges.append(f"{id_current};{id_field};{count};Edge")
        prev_id = id_current
    return "\n".join(nodes), "\n".join(edges)

In [44]:
# Export the sha1-based skeleton as the two CSV tables to import into Gephi
nodes, edges = build_skeletton(df)
for csv_name, csv_text in (('nodes.csv', nodes), ('edges.csv', edges)):
    with open(os.path.join(path, csv_name), 'w') as f:
        f.write(csv_text)

### Do the same with average_hash instead of sha1

- compute all groups (use groups as 8)
- take only groups that are interesting
- get stats from the database (bdd) for these groups
- make a df
- build edges and nodes


In [50]:
import psycopg2
# Directory with one sub-folder per image group; each sub-folder name is used
# as the group Label further down.
groups_folders = r"/home/tyra/Documents/CERES/PMA/hashs/8_clean"
# The password is deliberately not written in the notebook: libpq picks it up
# from the PGPASSWORD environment variable or from the ~/.pgpass password file.
conn = psycopg2.connect(host="localhost", dbname="postgres", user="postgres")

In [24]:
# Parameterised query: for one image (its sha1 is bound to the %s placeholder
# at execution time) return the creation date and author name of every tweet
# in pma_uniques that is linked to that image through pma_media.tweet_id.
req = """
select tweets.created_at, tweets.from_user_name
from pma_uniques as tweets
join pma_media as media
on media.tweet_id = tweets.id
where media.sha1 = %s
"""

In [53]:
# For every image file of every group folder, fetch the tweets that used the
# image and tag them with the group name (the folder name).
frames = []
for folder in os.listdir(groups_folders):
    folder_path = os.path.join(groups_folders, folder)
    # Skip plain files: nodes.csv / edges.csv are written into this same
    # directory further down, and listing them as if they were folders would
    # raise NotADirectoryError when this cell is run again.
    if not os.path.isdir(folder_path):
        continue
    print(folder)
    for file in os.listdir(folder_path):
        # assumes file names look like "<prefix>-<sha1>.<ext>" — TODO confirm
        sha1 = file.split('.')[0].split('-')[1]
        df_temp = pd.read_sql(req, conn, params=(sha1, ))
        df_temp['Label'] = folder
        frames.append(df_temp)

# Concatenate once at the end: DataFrame.append was removed in pandas 2.0 and
# re-copied the whole accumulated frame on every iteration.
if frames:
    res = pd.concat(frames, ignore_index=True)
else:
    res = pd.DataFrame(columns=['created_at', 'from_user_name', 'Label'])

ovule
postits_pma
vrac_taubira
emoji_pouce
photo_cure
scentifique_cellules
charlie_hebdo
bd_macron_amendements
loi_protection_enfants
manif_600000
elysee_bananiere
sylviane_agacinski
tabmours_rouges
manif_pro_pma7
conge_pat
photos_tracts_vita
cassation_gpa
cafe_ludo
pere_stormtrooper
bd_gpa_charb
speech_inconnu
stop_intox_em
amendement_melanchon
citation_edouard_philippe
projet_loi_bioéthique
sondage_pma2
tugdual_derville
taubira_traore
tract_chimere
enfants_baillonés
bd_jesus_marie_joseph
famille_noire
buzyn
tract_pro_pma
tract_anti_bioethique
tract_ventre_manif
loi_bioethique
place_sante
bd_psy
pma_oui_oui
verdier_jouclas
papa_maman_cure
deux_femmes_bebe
papa_maman_drapeau_lgbt
tract_an_bioethique
tract_un_pourcent
taubira_belkacem_royal
tract_vita_pere
jean_louis_touraine
tract_71_pourcent
tag_anti_pma
show_lumiere
votants_loi
claude_dharcourt
voter_pour_macron
powerpoint_anti_mpt
tract_gilet_jaune_anti_pma
bd_facture
tract_anti_bioethique6
manif_pro_pma3
pma_oui_non_screen
anti_tra

In [54]:
# Export the group-based skeleton as the two CSV tables to import into Gephi.
nodes, edges = build_skeletton(res, 'Label')
# Labels are folder names and some contain accented characters
# (e.g. "projet_loi_bioéthique"), so the encoding is pinned to UTF-8 instead
# of relying on the platform default.
with open(os.path.join(groups_folders, 'nodes.csv'), 'w', encoding='utf-8') as f:
    f.write(nodes)
with open(os.path.join(groups_folders, 'edges.csv'), 'w', encoding='utf-8') as f:
    f.write(edges)