## Date plot for sha1 and ahash

Objective: create plots showing the number of image mentions per week for a specific sha1 or ahash.

In [None]:
import concurrent
import pandas
import os
import json
from tqdm import tqdm
import psycopg2
from datetime import timedelta
import matplotlib.pyplot as plt
import matplotlib.dates as mdates

In [None]:
# Sample hashes used when trying the helpers below interactively.
# Open question from the author: should the plot show the number of images
# for one ahash, or the number of sha1 values?
ahash = "0202c3c7e7fbfd19"
sha1 = "003006be00fa0b45687c9413ba52fea04b7dac9b"

# Single PostgreSQL connection shared by every query in this notebook.
# Each setting can be overridden with the matching libpq environment variable;
# the fallbacks are the values that were previously hard-coded.
# NOTE(review): the password fallback is still stored in the notebook; prefer
# setting PGPASSWORD (or a .pgpass file) and removing it from the source.
conn = psycopg2.connect(
    host=os.environ.get("PGHOST", "localhost"),
    dbname=os.environ.get("PGDATABASE", "postgres"),
    user=os.environ.get("PGUSER", "postgres"),
    password=os.environ.get("PGPASSWORD", "edwin007"),
)

In [None]:
def get_ahash(ahash):
    """Return the weekly tweet counts for the media sharing one average hash.

    Joins ``pma_media`` with ``import.pma_full`` on the tweet id, keeps the
    rows whose ``average_hash_8`` equals *ahash* and groups them by the week
    of the tweet's ``created_at``.

    Returns the fetched rows as a list of ``(week, count)`` tuples sorted by
    week; the list is empty when no row matches.
    """
    # "order by" matters: callers use the first and last rows as the date
    # bounds of the plot, and SQL gives no ordering guarantee without it.
    req = """select date_trunc('week', tweets.created_at) as weekly, count(*)
    from pma_media join import.pma_full as tweets
    on pma_media.tweet_id = tweets.id
    where pma_media.average_hash_8 = %s
    group by weekly
    order by weekly"""
    # The context manager closes the cursor once the rows have been fetched.
    with conn.cursor() as cursor:
        cursor.execute(req, (ahash,))
        return cursor.fetchall()

In [None]:
def get_sha1(sha1):
    """Return the weekly tweet counts for the media with one sha1.

    Joins ``pma_media`` with ``import.pma_full`` on the tweet id, keeps the
    rows whose ``sha1`` equals *sha1* and groups them by the week of the
    tweet's ``created_at``.

    Returns the fetched rows as a list of ``(week, count)`` tuples sorted by
    week; the list is empty when no row matches.
    """
    # "order by" matters: callers use the first and last rows as the date
    # bounds of the plot, and SQL gives no ordering guarantee without it.
    req = """select date_trunc('week', tweets.created_at) as weekly, count(*)
    from pma_media join import.pma_full as tweets
    on pma_media.tweet_id = tweets.id
    where pma_media.sha1 = %s
    group by weekly
    order by weekly"""
    # The context manager closes the cursor once the rows have been fetched.
    with conn.cursor() as cursor:
        cursor.execute(req, (sha1,))
        return cursor.fetchall()

In [None]:
def get_weeks_between(start, end):
    """Return the dates from *start* onward in 7-day steps until *end* is reached.

    *start* is always the first element. A new date is appended while the
    latest one is still before *end*, so the last element is the first date
    greater than or equal to *end* (exactly *end* when the span is a whole
    number of weeks). When *end* is not after *start* the result is
    ``[start]``.
    """
    week = timedelta(7)
    all_dates = [start]
    while all_dates[-1] < end:
        all_dates.append(all_dates[-1] + week)
    return all_dates

In [None]:
def map_values(dates, res):
    """Return one count per entry of *dates*, using 0 for dates missing from *res*.

    *res* is a sequence of rows whose first item is a date and whose second
    item is a count. The result always has the same length as *dates*, so it
    can be plotted against it directly; if *res* holds several rows for the
    same date their counts are added together.

    The dates must be hashable, since they are used as dictionary keys.
    """
    # Index the rows once instead of rescanning res for every date.
    counts = {}
    for row in res:
        counts[row[0]] = counts.get(row[0], 0) + row[1]
    return [counts.get(d, 0) for d in dates]

In [None]:
def make_plot(dates, values, hash_, folder):
    """Draw a bar chart of *values* over *dates* and save it as ``<hash_>.png`` in *folder*.

    Every bar is annotated with its value. The current figure is cleared
    after saving so that the next call does not draw over this chart.
    """
    axes = plt.subplot(111)
    axes.bar(dates, values, width=5)
    axes.xaxis_date()
    plt.gcf().autofmt_xdate()

    # Write each count as text, 20 above the bar's own value.
    for when, count in zip(dates, values):
        axes.text(x=when, y=count + 20, s=str(count), fontdict={"fontsize": 10})

    plt.title(f"Number of images with hash {hash_} per week")
    target = os.path.join(folder, f'{hash_}.png')
    plt.savefig(target)
    plt.clf()

In [None]:
def plot_ahash(ahash, folder=""):
    """Plot the weekly counts for *ahash* and save the chart into *folder*.

    Returns ``False`` without plotting when the query yields no rows, and
    ``True`` once the chart has been written.
    """
    res = get_ahash(ahash)
    if not res:
        return False
    # Derive the bounds with min/max so the plotted range does not depend on
    # the order in which the rows come back from the database.
    weeks = [row[0] for row in res]
    dates = get_weeks_between(min(weeks), max(weeks))
    values = map_values(dates, res)
    make_plot(dates, values, ahash, folder)
    return True

In [None]:
def plot_sha1(sha1, folder=""):
    """Plot the weekly counts for *sha1* and save the chart into *folder*.

    Returns ``False`` without plotting when the query yields no rows, and
    ``True`` once the chart has been written (same contract as
    ``plot_ahash``).
    """
    res = get_sha1(sha1)
    if not res:
        return False
    # Derive the bounds with min/max so the plotted range does not depend on
    # the order in which the rows come back from the database.
    weeks = [row[0] for row in res]
    dates = get_weeks_between(min(weeks), max(weeks))
    values = map_values(dates, res)
    make_plot(dates, values, sha1, folder)
    return True

In [None]:
# import matplotlib.dates as mdates
# plt.gca().xaxis.set_major_formatter(mdates.DateFormatter('%m/%d/%Y'))
# plt.gca().xaxis.set_major_locator(mdates.DayLocator())

## Extract tweets and stats for one sha1

In [None]:
def get_stats_sha1(sha1, folder='', name=""):
    """Export the ``media_stats`` row of *sha1* as a one-row HTML table.

    The table is written to ``<folder>/<name>.html``; the sha1 is used as the
    file name when *name* is empty. The ``sha1``, ``extension`` and
    ``average_hash_8`` columns are dropped and three columns are renamed
    before export.

    Returns the exported DataFrame, or ``None`` (after printing a message)
    when ``media_stats`` has no row for *sha1*.
    """
    req = "select * from media_stats where sha1=%s"
    # Read everything needed from the cursor inside the block: it is closed
    # as soon as the block exits.
    with conn.cursor() as cursor:
        cursor.execute(req, (sha1,))
        res = cursor.fetchone()
        columns = [des[0] for des in cursor.description]
    if not res:
        print("failed for sha1 " + sha1)
        return None
    df = pandas.DataFrame([res], columns=columns)
    df = df.drop(columns=['sha1', 'extension', 'average_hash_8'])
    df = df.rename(columns={
        'oldest_tweet_id': 'oldest_tweet',
        'oldest_creation_date': 'oldest_date',
        'occurences_sha1': 'nb_sha1_corpus',
    })
    print(folder)
    df.to_html(os.path.join(folder, f'{name or sha1}.html'), index=False)
    return df

In [None]:
def get_tweets_sha1(sha1, folder="", name=""):
    """Write the tweets that carry the media *sha1* to a text file.

    Fetches ``real_text`` and ``from_user_name`` for every tweet joined to
    the media, groups the user names by tweet text and writes one section per
    distinct text to ``<folder>/<name>.txt`` (the sha1 is used as the file
    name when *name* is empty). Nothing is written when no tweet matches.
    """
    print(folder)
    req = "select real_text, from_user_name from import.pma_full as tweets join pma_media on tweets.id = pma_media.tweet_id where pma_media.sha1=%s"
    with conn.cursor() as cursor:
        cursor.execute(req, (sha1,))
        res = cursor.fetchall()
    if not res:
        return

    # Group the user names by tweet text; dict order keeps the texts in the
    # order they were first seen.
    tweets = {}
    for text, user in res:
        tweets.setdefault(text, []).append(user)

    # NOTE(review): a NULL real_text or from_user_name still raises TypeError
    # below, as before — confirm whether such rows can occur.
    sections = []
    for text, users in tweets.items():
        sections.append(
            "Texte: \n\n"
            + text + " \n\n"
            + "Utilisateurs: \n\n"
            + ' '.join(users) + '\n'
            + "========================================================\n"
        )

    # Tweet text is arbitrary Unicode: do not rely on the locale's default
    # encoding when writing it out.
    with open(os.path.join(folder, f'{name or sha1}.txt'), 'w', encoding='utf-8') as f:
        f.write("".join(sections))

## Generate Files for all sha1

In [None]:
# Folder whose files are processed by the generation loop of this section.
# `precision` is joined onto `root` — presumably an optional sub-folder name;
# it is empty here, so `path` points at `root` itself.
root = r'/home/tyra/Documents/pma_clean'
precision = ''
path = os.path.join(root, precision)
# Bare expression: displays the resolved path as the cell output.
path

In [None]:
# Alternative driver kept for reference (one sub-folder per ahash):
# for ahash in [el for el in os.listdir(path) if el != 'similar']:
#     ahash_path = os.path.join(path, ahash)
#     skip = not plot_ahash(ahash, ahash_path)
#     if skip:
#         continue
#     print(ahash)

# get_stats_sha1 writes "<name>.html" into `path`, the very folder being
# listed, so after a first run every name shows up once per extension.
# De-duplicate the names (keeping listing order) so each one is processed
# a single time.
names = dict.fromkeys(f.split('.')[0] for f in os.listdir(path))
for name in names:
    # The sha1 is the part of the file name after the last '-'.
    sha1 = name.split('-')[-1]
    # get_tweets_sha1(sha1, ahash_path, name)
    get_stats_sha1(sha1, path, name)
    # plot_sha1(sha1, ahash_path)

## Select all images by date

In [None]:
def export_by_date(start, end, root='/home/tyra/Documents/CERES/PMA/images', output=None):
    """Copy the images first seen in ``[start, end)`` into a dedicated folder.

    Selects the sha1 of every ``media_stats`` row whose
    ``oldest_creation_date`` is at or after *start* and strictly before
    *end*, then copies each file of *root* whose name starts with one of
    those sha1 values.

    Parameters:
        start: lower bound of the window (included).
        end: upper bound of the window (excluded).
        root: folder that holds the image files.
        output: destination folder, created if needed; defaults to
            ``/home/tyra/Documents/CERES/PMA/<start>_<end>``.
    """
    import shutil

    if output is None:
        output = f'/home/tyra/Documents/CERES/PMA/{start}_{end}'

    # ">=" makes the lower bound inclusive, matching the half-open window
    # described where this function is called.
    req = "select sha1 from media_stats where oldest_creation_date >= %s and oldest_creation_date < %s"
    with conn.cursor() as cursor:
        cursor.execute(req, (start, end))
        # Empty values are skipped: an empty prefix would match every file.
        prefixes = tuple(row[0] for row in cursor.fetchall() if row[0])

    os.makedirs(output, exist_ok=True)
    # str.startswith accepts a tuple of prefixes, so one pass over the folder
    # replaces the sha1 x image nested loop. An empty tuple matches nothing.
    for image in os.listdir(root):
        if image.startswith(prefixes):
            shutil.copy(os.path.join(root, image), output)

In [None]:
# Export windows used so far. Each pair overwrites the previous one, so only
# the last pair is the one passed to export_by_date below.
start = '2019-10-06' # lower bound: this date is meant to be included
end = '2019-10-07' # upper bound: this date is not included

start = '2020-10-10' # lower bound: this date is meant to be included
end = '2020-10-11' # upper bound: this date is not included

start = '2021-01-31' # lower bound: this date is meant to be included
end = '2021-02-01' # upper bound: this date is not included

start = '2021-06-08' # lower bound: this date is meant to be included
end = '2021-06-09' # upper bound: this date is not included

export_by_date(start, end)

## Detect images that should not be in corpus

In [None]:
# Select the media attached to tweets whose real_text is set and whose leading
# characters differ from those of text, then dump their sha1 values to JSON.
req = """
select distinct media.sha1 , media.tweet_id, uniques.text, uniques.real_text
from pma_uniques as uniques join pma_media as media on
uniques.id = media.tweet_id
where 
	uniques.real_text is not null 
	and substring(uniques.real_text, 0, 10) <> substring(uniques.text, 0, 10)
	and media.sha1 is not null;
"""

# The query has no placeholders, so it is executed without parameters:
# passing (start, end) makes psycopg2 fail while formatting the query, and
# needlessly tied this cell to the date-export cell.
with conn.cursor() as cursor:
    cursor.execute(req)
    # Only the sha1 (first selected column) is kept; `res` is reused by the
    # next cell.
    res = [el[0] for el in cursor.fetchall()]
with open('/home/tyra/Documents/CERES/PMA/useless.json', 'w') as f:
    json.dump(res, f)

In [None]:
import shutil

# Copy every image whose file name starts with one of the sha1 values held in
# `res` (filled by the previous cell) into the "failed" folder.
root = '/home/tyra/Documents/CERES/PMA/images'
output = '/home/tyra/Documents/CERES/PMA/failed'
os.makedirs(output, exist_ok=True)
images = os.listdir(root)

# `res` can contain the same sha1 several times; de-duplicate it and let
# str.startswith test all prefixes at once, so each image is examined and
# copied at most once. An empty tuple matches nothing.
prefixes = tuple(set(res))
for image in images:
    if image.startswith(prefixes):
        shutil.copy(os.path.join(root, image), output)