In [11]:
using CausalityTools
include("../Utils/entropy.jl")
using Random
using BenchmarkTools

# Time three causality measures on the same pair of seeded random integer series and
# print each result. `TE` is expected to come from the included entropy.jl.
rng = MersenneTwister(145)

a = rand(rng, 0:10, 100)
b = rand(rng, 0:10, 100)

est = Kraskov(k=1)

# Globals are interpolated with `$` so that BenchmarkTools measures the call itself and
# not the dynamic lookup of untyped global variables.
print("Current method :")
foo = @btime TE(Int.($a .> 0), Int.($b .> 0))
println("The result is $foo.\n")


print("True transfer entropy :")
# The estimators below are fed floating-point copies of the same series.
a = float.(a)
b = float.(b)
foo2 = @btime transferentropy($a, $b, $est)
println("The result is $foo2.\n")

print("CCM :")
foo3 = @btime crossmap($a, $b, 2, 1)
println("The result is $foo3.\n")

In [None]:
import PyPlot as plt

# Cross-map two seeded random series on growing prefixes and plot the score against the
# prefix length, once in each direction.
rng = MersenneTwister(145)

# `a` is drawn before `b`; the order matters for reproducibility with the fixed seed.
a = float.(rand(rng, 0:10, 100))
b = float.(rand(rng, 0:10, 100))

# Prefix lengths: steps of 5 up to 50, then steps of 10 up to 100.
Ls = vcat(10:5:50, 60:10:100)

test = map(L -> crossmap(a[1:L], b[1:L], 2, 1), Ls)
test2 = map(L -> crossmap(b[1:L], a[1:L], 2, 1), Ls)

plt.figure()
for (scores, fmt, name) in ((test, "b-", "a to b"), (test2, "r-", "b to a"))
    plt.plot(Ls, scores, fmt, label=name)
end
plt.legend()
show(plt.gcf())

In [None]:
using BenchmarkTools
using CausalityTools
using StatsBase: minimum, maximum, mean, std

"""
    standardize(x)

Center `x` along its first dimension and divide by the standard deviation taken along
that dimension. Wherever the standard deviation is not strictly positive the divisor is
1, so a constant column is only centered (and becomes all zeros).
"""
function standardize(x)
    center = mean(x, dims=1)
    spread = std(x, dims=1)
    divisor = ifelse.(spread .> 0, spread, 1.0)
    return (x .- center) ./ divisor
end

# Count how often the JDD test flags a pair of independently drawn, standardized random
# integer series over 10000 trials.
B = 10
d = 3
τ = 1
alpha = 0.001

cuttoff = 0.5
cuttoff2 = 0.01

# Length of each simulated series.
Nts = 13*24*2

# Return 1 when the right-tailed p-value of the JDD one-sample t-test is below `alpha`,
# and 0 otherwise.
function func(x, y)
    jdd_test = jdd(OneSampleTTest, x, y, B=B, D=d, τ=τ, μ0=0.0)
    return pvalue(jdd_test, tail=:right) < alpha ? 1 : 0
end

tot = 0

for _ in 1:10000
    x = standardize(rand(0:10, Nts))
    y = standardize(rand(0:10, Nts))

    # Unstandardized alternative kept for reference:
    # x = rand(0:10, Nts)
    # y = rand(0:10, Nts)

    tot += func(x, y)
end

println("jdd : $tot")

In [None]:
using PyPlot: @L_str, latexstring
using Printf

# Build one LaTeX annotation "<mantissa>\cdot 10^{<exponent>}" per entry of `mean_value`.
annot = Matrix{String}(undef, size(mean_value))
for idx in eachindex(annot)
    mantissa, exponent = split(@sprintf("%.1e", mean_value[idx]), 'e')
    # Round-tripping through Int drops the sign padding and leading zeros of the printed
    # exponent (e.g. "-03" becomes "-3").
    annot[idx] = latexstring(mantissa, "\\cdot 10^{", string(parse(Int, exponent)), "}")
end

In [None]:
import Seaborn as sns

# Heatmap, on a logarithmic colour scale, of the mean of every cell of the stored TE
# results, annotated with the pre-formatted strings in `annot`.
result = load_data("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds/new_test2_TE.jld2")
labels = ["None", "Q(0.5)", "Q(0.75)", "Q(0.9)", "max", "2max", "4max"]
thresholds = 0:0.1:0.6

mean_value = Matrix{Float64}(undef, size(result))
map!(mean, mean_value, result)

# When some means are exactly zero, place vmin at half the smallest non-zero mean;
# otherwise use the plain minimum.
vmin = if any(iszero, mean_value)
    minimum(filter(!iszero, mean_value)) * 0.5
else
    minimum(mean_value)
end

plt.figure(figsize=1.2 .* [6.4, 4.8])
sns.heatmap(
    mean_value,
    annot=annot,
    cmap="rocket_r",
    fmt="",
    norm=plt.matplotlib.colors.LogNorm(vmin=vmin, clip=true),
)
plt.xlabel("Threshold")
plt.ylabel("Limit value")
# Keep the tick positions chosen by seaborn and only replace their text.
xloc, xlabels = plt.xticks()
plt.xticks(xloc, thresholds)
yloc, ylabels = plt.yticks()
plt.yticks(yloc, labels, rotation="horizontal")
plt.gcf();

In [None]:
import Seaborn as sns

# Same heatmap of per-cell means of the stored TE results, this time letting seaborn
# print the numbers itself with the ".2g" format.
result = load_data("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds/new_test2_TE.jld2")
labels = ["None", "Q(0.5)", "Q(0.75)", "Q(0.9)", "max", "2max", "4max"]
thresholds = 0:0.1:0.6

mean_value = Float64[mean(cell) for cell in result]

# Lower colour bound: half the smallest non-zero mean if any mean is zero, else the minimum.
if any(iszero, mean_value)
    vmin = minimum(mean_value[.!iszero.(mean_value)]) / 2
else
    vmin = minimum(mean_value)
end

plt.figure(figsize=[6.4, 4.8] .* 1.2)
sns.heatmap(
    mean_value,
    annot=true,
    cmap="rocket_r",
    fmt=".2g",
    norm=plt.matplotlib.colors.LogNorm(vmin=vmin, clip=true),
)
plt.xlabel("Threshold")
plt.ylabel("Limit value")
# Reuse the default tick positions, swapping in the threshold / limit names.
xloc, xlabels = plt.xticks()
plt.xticks(xloc, thresholds)
yloc, ylabels = plt.yticks()
plt.yticks(yloc, labels, rotation="horizontal")
plt.gcf();

In [None]:
# Load the dataset identified by `Skripal` into `df`.
df = load_dataset(Skripal)

In [None]:
using Dates
# Parse a timestamp string into a `DateTime`, ignoring everything from the first '.'
# onwards (e.g. fractional seconds). The format object is built once and captured by the
# closure instead of being resolved from a string on every call.
to_datetime = let fmt = dateformat"yyyy-mm-ddTHH:MM:SS"
    x -> DateTime(first(split(x, '.')), fmt)
end
# Replace the `created_at` column by the result of applying `to_datetime` to each entry.
df."created_at" = to_datetime.(df."created_at")

In [None]:
# Sort `df` in place by descending `follower_count`.
sort!(df, :follower_count, rev=true)

In [None]:
# Smallest value in the `created_at` column.
minimum(df.created_at)

In [None]:
# Largest value in the `created_at` column.
maximum(df.created_at)

In [None]:
# Predicate: is `x` strictly before 2018-03-18?
decide = x -> x < Date(2018, 03, 18)

# Three probe timestamps: one second after, one second before, and exactly at midnight
# of 2018-03-18. They are defined here but `decide` is not called on them in this cell.
a = DateTime(2018, 03, 18, 00, 00, 01)
b = DateTime(2018, 03, 17, 23, 59, 59)
c = DateTime(2018, 03, 18, 00, 00, 00)

In [None]:
using StatsBase
include("../Engine/Engine.jl")
using .Engine
import PyPlot as plt
import Seaborn as sns

In [None]:
# Path of the stored JDD results used by the heatmap below.
filename = "/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds/N_1000_JDD.jld2"

# Load the stored results into `result2`.
result2 = load_data(filename)

In [None]:
# Heatmap, on a logarithmic colour scale, of the mean of every cell of `result2`, saved
# as a PDF next to the result file.
thresholds2 = [1, 1e-1, 5e-2, 1e-2, 5e-3, 1e-3, 5e-4, 1e-4]
labels2 = ["None", "Q(0.5)", "Q(0.25)", "Q(0.1)", "min", "min/2", "min/4", "min/6"]

mean_value2 = Matrix{Float64}(undef, size(result2))
for (idx, cell) in pairs(result2)
    mean_value2[idx] = mean(cell)
end

# Set vmin a little lower than the smallest non-zero mean, so that with clip=true the
# zeros appear on a colour below that minimum.
vmin = if any(iszero, mean_value2)
    minimum(filter(!iszero, mean_value2)) / 2
else
    minimum(mean_value2)
end

plt.figure(figsize=1.2 .* [6.4, 4.8])
sns.heatmap(
    mean_value2,
    annot=true,
    cmap="rocket_r",
    norm=plt.matplotlib.colors.LogNorm(vmin=vmin, clip=true),
)
plt.xlabel("p-value")
plt.ylabel("Limit value")
# Keep the default tick positions and only replace their text.
xloc, xlabels = plt.xticks()
plt.xticks(xloc, thresholds2)
yloc, ylabels = plt.yticks()
plt.yticks(yloc, labels2, rotation="horizontal")
plt.savefig("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds/N_1000_JDD.pdf", bbox_inches="tight")
plt.gcf();

In [None]:
# Anonymous function returning twice the maximum of its argument, then its printed form.
limit = x -> maximum(x)*2
# limit = RandomShuffle()
show(limit)

In [None]:
# Build the same kind of closure from source text by parsing and evaluating it.
foo = eval(Meta.parse("x -> maximum(x)*2"))
foo2 = eval(Meta.parse("x -> maximum(x)"))

In [None]:
# NOTE(review): Base does not appear to define `parse(::Type{Function}, ::String)`, so
# this cell is expected to throw a MethodError; the `eval(Meta.parse(...))` form above is
# the working alternative — confirm.
parse(Function, "x->maximum(x)*2")

In [None]:
using DataFrames
include("../Engine/Engine.jl")
using .Engine

# Full paths of the entries of the processed COP26 folder whose path contains ".json".
datafolder = "/Users/cyrilvallez/Desktop/Thesis/Data/Twitter/COP26_processed"
datafiles = filter(contains(".json"), readdir(datafolder, join=true))
# frames = [Helpers.load_json(file) for file in datafiles]
# data = vcat(frames...)

In [None]:
# Load every listed file with `Helpers.load_json`, one result per file.
frames = [Helpers.load_json(file) for file in datafiles]
# data = vcat(frames...)

In [None]:
# Influence-graph generator built on `SimpleTE` with a threshold of 0.01.
igg = InfluenceGraphGenerator(SimpleTE, threshold=0.01)

In [None]:
"""
    standardize(x)

Subtract the mean and divide by the standard deviation, both taken along dimension 1.
A column whose standard deviation is not strictly positive (e.g. a vector containing
only 0s) is divided by 1 instead, so it is centered without a division by zero.
"""
function standardize(x)
    σ = std(x, dims=1)
    # Fall back to a unit divisor wherever the deviation is not strictly positive.
    divisor = map(s -> s > 0 ? s : 1.0, σ)
    return (x .- mean(x, dims=1)) ./ divisor
end

In [None]:
using StatsBase
# x = sample([0,1], AnalyticWeights([0.9, 0.1]), 200)
# x = sample([0,1], AnalyticWeights([0.5, 0.5]), 200)
# Synthetic pair: `y` is a fixed linear mix of `x` at the same index and at its
# neighbouring indices (only the one existing neighbour at either end).
x = rand(200)
y = similar(x)
y[begin] = 0.5*x[1] + 2*x[2]
y[end] = 0.5*x[end] + 2*x[end-1]
# Interior points use both neighbours; written as one fused broadcast over shifted views.
@views y[2:end-1] .= 0.5 .* x[2:end-1] .+ 2 .* x[1:end-2] .+ 2 .* x[3:end]
# y .+= 0.1 .* rand(200)

In [None]:
# Scatter plot of `y` against `x` in a new figure.
plt.figure()
plt.scatter(x, y);

In [None]:
using DataFrames
include("../Engine/Engine.jl")
using .Engine
using StatsBase: mean, minimum, maximum, quantile, std

In [None]:
# Empirical distribution of the JDD-based causal function on pairs of independently
# drawn, standardized uniform series, followed by its median.
igg = InfluenceGraphGenerator(JointDistanceDistribution, surrogate=nothing)

N = 10000
# `map` lets Julia infer a concrete element type for the results, instead of pushing
# into an untyped `[]` (a `Vector{Any}`).
distribution = map(1:N) do _
    x = standardize(rand(200))
    y = standardize(rand(200))
    return igg.causal_function(x, y)
end

quantile(distribution, 0.5)

In [None]:
# Evaluate the causal function of a random series against itself.
x = rand(200)
igg.causal_function(x, x)

In [None]:
# 5% quantile of the previously collected `distribution`.
quantile(distribution, 0.05)

In [None]:
# Time the SimpleTE-based causal function on two random binary series of length 200.
igg = InfluenceGraphGenerator(SimpleTE, threshold=-1)

x = rand(0:1, 200)
y = rand(0:1, 200)

# Interpolate the function and its arguments so that the benchmark does not include the
# cost of looking up untyped globals.
@btime $(igg.causal_function)($x, $y)

In [None]:
# Scratch arithmetic: 2000 divided by 520 squared.
2000/520^2

In [None]:
# Pair up two equally long vectors element-wise and build a dictionary from the pairs.
a = ["a", "b", "c"]
b = [1, 2, 3]

Dict(a .=> b)

In [None]:
# Scratch arithmetic; the two divisions by 60 presumably convert seconds to hours — confirm.
1.851*1e-3*(12000^2*0.007)/60/60

In [None]:
# Sum the SimpleTE-based causal function over N synthetic pairs in which `y` is built
# from `x` and its neighbours.
igg = InfluenceGraphGenerator(SimpleTE, threshold=0.06)

N = 100
# Loop-invariant sampling weights: 0.5 for the value 0 and 0.5/10 for each of 1..10.
x_weights = AnalyticWeights(vcat(0.5, fill(0.5/10, 10)))

# `map` yields a concretely typed vector instead of pushing into an untyped `[]`.
distribution = map(1:N) do _
    # Alternatives tried for x:
    # x = sample([0,1], AnalyticWeights([0.9, 0.1]), 200)
    # x = rand(0:10, 200)
    x = sample(0:10, x_weights, 200)
    # y = sample([0,1], AnalyticWeights([0.5, 0.5]), 200)
    y = zeros(size(x))
    y[1] = 0.5*x[1] + 2*x[2]
    y[end] = 0.5*x[end] + 2*x[end-1]
    # Separate index name so the interior loop does not shadow anything.
    for j in 2:length(y)-1
        y[j] = 0.5*x[j] + 2*x[j-1] + 2*x[j+1]
    end

    return igg.causal_function(x, y)
end

sum(distribution)

In [None]:
using PyPlot: @L_str

# Plot, for the TE results, the mean of the "positives" (y axis, labelled TPR) against
# the mean of the "negatives" (x axis, labelled FPR), one curve per entry of `labels`.
negatives = load_data("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds/N_1000_TE.jld2")
positives = load_data("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds_2/N_10_TE.jld2")

labels = ["None", "Q(0.5)", "Q(0.75)", "Q(0.9)", L"\max", L"2\cdot \max", L"4\cdot \max"]

mean_positives = Matrix{Float64}(undef, size(positives))
for i in eachindex(positives)
    mean_positives[i] = mean(positives[i])
end

mean_negatives = Matrix{Float64}(undef, size(negatives))
for i in eachindex(negatives)
    mean_negatives[i] = mean(negatives[i])
end

plt.figure()
# The loop runs over rows and labels each curve with `labels[i]`, so the curve must be
# row `i` (all thresholds for one limit value), not column `i`.
# NOTE(review): assumes rows index the limit values, as in the heatmaps of these result
# files — confirm against the code that writes them.
for i in axes(mean_positives, 1)
    plt.plot(mean_negatives[i, :], mean_positives[i, :], label=labels[i])
end
# Diagonal reference line.
plt.plot(0:0.001:1, 0:0.001:1, label=L"y=x", linestyle="--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid()
plt.xscale("log")
plt.gcf();

In [None]:
# Plot, for the JDD results, the mean of the "positives" (y axis, labelled TPR) against
# the mean of the "negatives" (x axis, labelled FPR), one curve per entry of `labels`.
negatives = load_data("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds/N_1000_JDD.jld2")
positives = load_data("/Users/cyrilvallez/Desktop/Thesis/Results/Find_thresholds_2/N_10_JDD.jld2")

labels = ["None", "Q(0.5)", "Q(0.25)", "Q(0.1)", "min", "min/2", "min/4", "min/6"]

mean_positives = Matrix{Float64}(undef, size(positives))
for i in eachindex(positives)
    mean_positives[i] = mean(positives[i])
end

mean_negatives = Matrix{Float64}(undef, size(negatives))
for i in eachindex(negatives)
    mean_negatives[i] = mean(negatives[i])
end

plt.figure()
# The loop runs over rows and labels each curve with `labels[i]`, so the curve must be
# row `i` (all p-value thresholds for one limit value), not column `i`. The heatmap of
# this same negatives file puts the limit values on the rows.
# NOTE(review): confirm the positives file uses the same layout.
for i in axes(mean_positives, 1)
    plt.plot(mean_negatives[i, :], mean_positives[i, :], label=labels[i])
end
# Diagonal reference line.
plt.plot(0:0.001:1, 0:0.001:1, label=L"y=x", linestyle="--")
plt.xlabel("FPR")
plt.ylabel("TPR")
plt.legend()
plt.grid()
plt.xscale("log")
plt.gcf();

In [None]:
# Load one processed COP26 file.
df = Helpers.load_json("/Users/cyrilvallez/Desktop/Thesis/Data/Twitter/COP26_processed/2021-10-18T00-00_to_2021-10-22T00-00.json")

In [None]:
# Display `df` without hiding any column.
show(df, allcols=true)

In [None]:
# Distinct values of the `category` column.
unique(df.category)

In [None]:
# Rows whose `category` entry equals the one-element vector ["retweeted"]; `Ref` keeps
# that vector from being broadcast over.
foo = df[df.category .== Ref(["retweeted"]), :]

In [None]:
# How many of those rows have a text starting with "RT".
sum(startswith.(foo.text, Ref("RT")))

In [None]:
# The remaining rows, whose text does not start with "RT".
test = foo[.!startswith.(foo.text, Ref("RT")), :]

In [None]:
# Spot check of one text.
test.text[18]

In [None]:
# Spot check of one original text.
test.original_text[2]

In [None]:
# Element-wise division of two 3x3 integer matrices.
a = [1 1 1; 2 2 2; 3 3 3]
b = [2 2 2; 2 2 2; 2 2 2]

a ./ b

In [None]:
# Two 2x2 matrices whose entries are themselves 2x2 matrices, filled by hand.
a = Matrix{Matrix}(undef, 2, 2)
b = Matrix{Matrix}(undef, 2, 2)
a[1, 1] = [1 2; 3 4]
a[1, 2] = [0 0; 1 2]
a[2, 1] = [1 1; 1 1]
a[2, 2] = [0 0; 1 0]

b[1, 1] = [0 1; 2 5]
b[1, 2] = [0 0; 0 0]
b[2, 1] = [-1 0; 2 3]
b[2, 2] = [1 3; 1 0]

In [None]:
# Same nested structure, filled with random integers in 1:4.
c = Matrix{Matrix}(undef, 2, 2)
d = Matrix{Matrix}(undef, 2, 2)

c[1, 1] = rand(1:4, 2, 2)
c[1, 2] = rand(1:4, 2, 2)
c[2, 1] = rand(1:4, 2, 2)
c[2, 2] = rand(1:4, 2, 2)

d[1, 1] = rand(1:4, 2, 2)
d[1, 2] = rand(1:4, 2, 2)
d[2, 1] = rand(1:4, 2, 2)
d[2, 2] = rand(1:4, 2, 2)

In [None]:
# Two more random nested matrices.
e = Matrix{Matrix}(undef, 2, 2)
f = Matrix{Matrix}(undef, 2, 2)

e[1, 1] = rand(1:4, 2, 2)
e[1, 2] = rand(1:4, 2, 2)
e[2, 1] = rand(1:4, 2, 2)
e[2, 2] = rand(1:4, 2, 2)

f[1, 1] = rand(1:4, 2, 2)
f[1, 2] = rand(1:4, 2, 2)
f[2, 1] = rand(1:4, 2, 2)
f[2, 2] = rand(1:4, 2, 2)

In [None]:
# Two lists of three nested matrices each.
foo1 = [a, b, c]
foo2 = [d, e, f]

In [None]:
# List of the two lists.
foo = [foo1, foo2]

In [None]:
# Summing the outer list adds `foo1` and `foo2` position by position.
total = sum(foo)

In [None]:
# Broadcast addition of two nested matrices, for comparison with the entries of `total`.
c .+ f

In [None]:
# Attempt to overwrite small entries of every element of `total`.
# NOTE(review): this loop looks unfinished — `f` is assigned but never called, and the
# value it would write (-2) differs from the -1 assigned below.
for i = 1:length(total)
    f = x -> ifelse.(x .<= 2, -2, x)
    # NOTE(review): the mask is compared with 2 a second time; presumably only the inner
    # comparison was intended — confirm.
    indices = (total[i] .<= 2) .<= 2
    total[i][indices] = -1
    # broadcast()
end

In [None]:
# Reference behaviour on a plain integer matrix: entries <= 2 become -2.
x = [0 0 2; 1 1 4; 3 0 5]
ifelse.(x .<= 2, -2, x)

In [None]:
# Inspect the first element of `total`.
total[1]

In [None]:
# Broadcast `map!` over the entries of `total[1]`, rewriting each entry in place so that
# values < 2 become -2.
map!.(x -> ifelse(x < 2, -2, x), total[1], total[1])

In [None]:
# Inspect the first element of `total` again.
total[1]

In [None]:
# Inspect the first element of `total` again.
total[1]

In [None]:
include("../Engine/Engine.jl")
using .Engine

using DataFrames, StatsBase
using BenchmarkTools
import PyPlot as plt
import Seaborn as sns

In [None]:
# Rows of `df3` belonging to the user "GretaThunberg".
df3[df3.username .== "GretaThunberg", :]

In [None]:
# Load the COP26 dataset, then pass it through `cop_26_dates` and `trust_score`.
df = load_dataset(COP26)
df = cop_26_dates(df)
df = trust_score(df)

In [None]:
# NOTE(review): unfinished cell — the loop has no body and no closing `end`.
for partition in partitions
    

In [None]:
# Split `df` into three frames by the value of the `partition` column.
df1 = df[df.partition .== "Before COP26", :]
df2 = df[df.partition .== "During COP26", :]
df3 = df[df.partition .== "After COP26", :]

In [None]:
# Build the graph and scores for the "During COP26" frame.
weights, u, v, nodes = PreProcessing.compute_IP_graph(df2)
I, P, residuals = PreProcessing.compute_IP_scores(u, v)

# Reorder nodes and both score vectors by descending `I`.
sorting = sortperm(I, rev=true)
I = I[sorting]
nodes = nodes[sorting]
P = P[sorting]

In [None]:
using BenchmarkTools

# Time the "is either series all zeros" check on two random binary series.
a = rand(0:1, 200)
b = rand(0:1, 200)

# Interpolate the globals so the benchmark measures only the check itself.
@btime iszero($a) || iszero($b)

In [1]:
include("../Engine/Engine.jl")
using ..Engine

In [5]:
using BenchmarkTools
using CausalityTools

# Time the JDD-based causal function of the engine on two random series of length 600.
igg = InfluenceGraphGenerator(Engine.JointDistanceDistribution, surrogate=nothing)

# Direct call to the CausalityTools test, kept for comparison (not benchmarked here).
f = (x,y) -> pvalue(jdd(OneSampleTTest, x, y, B=10, D=5, τ=1, μ0=0.0), tail=:right) < 0.001 ? 1 : 0
# Wrapper around the engine's causal function.
f2 = (x,y) -> igg.causal_function(x,y)

x = rand(600)
y = rand(600)

# Interpolate the function and its arguments so untyped global lookups are not timed.
@btime $f2($x, $y)

  11.092 ms (79 allocations: 27.56 MiB)


0

In [None]:
# Check whether `a` starts with "AGG", one or more digits and a colon, and print the verdict.
regex = r"^AGG[0-9]+:"
a = "AGG1678934: joeoepzocnjocz    dveonoze^12"
print(occursin(regex, a) ? "cool" : "prout")

In [None]:
# Add a `tweet_count` column to each frame: per `username`, the number of rows whose
# `effective_category` is "tweet".
df1 = transform(groupby(df1, "username"), "effective_category" => (x -> sum(x .== "tweet")) => "tweet_count")
df2 = transform(groupby(df2, "username"), "effective_category" => (x -> sum(x .== "tweet")) => "tweet_count")
df3 = transform(groupby(df3, "username"), "effective_category" => (x -> sum(x .== "tweet")) => "tweet_count")

In [None]:
# Keep only rows whose `tweet_count` is at least 3.
df1 = df1[df1.tweet_count .>= 3, :]
df2 = df2[df2.tweet_count .>= 3, :]
df3 = df3[df3.tweet_count .>= 3, :]

In [None]:
# Number of distinct users in `df2`.
length(unique(df2.username))

In [None]:
plot_action_frequency(df2)

In [None]:
# Number of distinct users in `df2`.
length(unique(df2.username))

In [None]:
# Rows per user in `tweeters`; keep the users with at least 3 and use them as nodes.
tweet_count = combine(groupby(tweeters, "username"), "created_at" => length => "count")
tweet_count = tweet_count[tweet_count.count .>= 3, :]
nodes = tweet_count.username

In [None]:
# Membership check for one specific user.
"GretaThunberg" in nodes

In [None]:
using Dates

# Prepare the COP26 dataset, compute the I and P scores on its graph, and find which of
# the most influential tweeters have at least one "U" action.
df = load_dataset(COP26)

# Drop rows without a domain (`.!` is the logical negation; `.~` is the bitwise one).
df = df[.!ismissing.(df."domain"), :]
if eltype(df."created_at") == String
    # Timestamps are still text: cut at the first '.' and parse the remainder.
    to_datetime = x -> DateTime(split(x, '.')[1], "yyyy-mm-ddTHH:MM:SS")
    df."created_at" = to_datetime.(df."created_at")
end

df = PreProcessing.cop_26_dates(df)
df = PreProcessing.trust_score(df)
# Number of rows per user, attached to every row.
df = transform(groupby(df, "username"), "created_at" => length => "count")

tweeters = df[df.effective_category .== "tweet", :]
retweeters = df[df.effective_category .== "retweet", :]

bad_df = df[df.action .== "U", :]
good_df = df[df.action .== "T", :]

weights, u, v, nodes = PreProcessing.IP_graph(df, min_tweets=3)

I, P, residuals = PreProcessing.IP_scores(u, v)

# Sort in the order of most influence
sorting = sortperm(I, rev=true)
nodes = nodes[sorting]
I = I[sorting]
P = P[sorting]

# Extract dataframes of 500 most influentials according to I score.
# `first(nodes, 500)` returns at most 500 entries, so a graph with fewer nodes no longer
# raises a BoundsError; the Set makes each membership test constant-time.
isin = (x,y) -> x in y
top_nodes = Set(first(nodes, 500))
influentials = tweeters[isin.(tweeters.username, Ref(top_nodes)), :]
foo = combine(groupby(influentials, "username"), "action" => (x -> sum(x .== "U")) => "U_count")

# Users among them with at least one "U" action, and their positions in `nodes`.
bad_users = foo.username[foo.U_count .> 0]
bad_users_rank = [findall(user .== nodes) for user in bad_users];

In [None]:
# First 20 entries of `nodes_bad`.
nodes_bad[1:20]

In [None]:
# Snapshot assignments, currently commented out; `I_normal` and `nodes_normal` must
# already exist for the line below to run.
# I_normal = I
# P_normal = P
# nodes_normal = nodes

# Nodes with a strictly positive `I_normal` score.
test_normal = nodes_normal[I_normal .> 0];

In [None]:
# Snapshot assignments, currently commented out; `I_bad` and `nodes_bad` must already
# exist for the line below to run.
# I_bad = I
# P_bad = P
# nodes_bad = nodes

# Nodes with a strictly positive `I_bad` score.
test_bad = nodes_bad[I_bad .> 0];

In [None]:
# Vector of two vectors, used to try out views and element access.
a = [[1, 2], [3, 4]]

In [None]:
# View onto the single element `a[1]`.
foo = @view a[1]

In [None]:
# Plain indexing of the second inner vector.
foo2 = a[2]

In [None]:
foo[1]

In [None]:
# Mutate the first inner vector in place, then display `a`.
a[1][1] = 2
a

In [None]:
# Two-argument membership test, kept as a named closure.
isin = (x,y) -> x in y
# sum(isin.(nodes_bad[1:500], Ref(nodes_normal[1:500])))

# Number of entries of `test_bad` that also occur in `test_normal`.
sum(in.(test_bad, Ref(test_normal)))

In [None]:
# Number of strictly positive entries of `I_normal`.
sum(I_normal .> 0)

In [None]:
# Scratch arithmetic.
(12200 - 500) / 100

In [None]:
# Reorder nodes and both score vectors by descending `P`.
sorting = sortperm(P, rev=true)
nodes = nodes[sorting]
I = I[sorting]
P = P[sorting]

In [None]:
nodes

In [None]:
# Boolean masks of the zero entries of each score vector...
I0 = I .== 0
P0 = P .== 0

# ...and of the non-zero entries.
I1 = I .!= 0
P1 = P .!= 0;

# sort(I0) == sort(P1)

In [None]:
# Number of nodes with a non-zero I and a zero P.
sum(I1 .&& P0)

In [None]:
# Fraction of nodes for which at least one of I and P is non-zero.
sum(I1 .|| P1) / length(I0)

In [None]:
I

In [None]:
nodes

In [None]:
# Rows of `tweeters` with action "U", with the per-user row count attached as `Ucount`.
test = tweeters[tweeters.action .== "U", :]
test = transform(groupby(test, "username"), "created_at" => length => "Ucount")

# Number of distinct users with at least 2 such rows.
length(unique(test[test.Ucount .>= 2, "username"]))

In [None]:
nodes[123]

In [None]:
length(residuals)

In [None]:
# Plot `residuals` against their index on a logarithmic y axis.
res_ = residuals
plt.figure()
plt.plot(1:length(res_), res_)
plt.yscale("log")


In [None]:
# Users with at least one row whose `rt_from` is missing.
initial_actors = unique(test[ismissing.(test.rt_from), "username"])

# Rows that do have an `rt_from` value.
rts = test[.!ismissing.(test.rt_from), :]
# mapping = countmap(rts)
# transform(groupby(df, "username"), "created_at" => length => "tweet_count")

# For every initial actor, the usernames of the rows in `rts` whose `rt_from` is that
# actor. A comprehension yields a concretely typed vector instead of an untyped `[]`,
# and drops the index from `enumerate`, which was never used.
counter = [rts.username[findall(rts.rt_from .== actor)] for actor in initial_actors]

In [None]:
# Size of each collected list, then reorder the sizes, actors and lists by descending size.
N = [length(i) for i in counter]
sorting = sortperm(N, rev=true)
N = N[sorting]
initial_actors = initial_actors[sorting]
counter = counter[sorting]

In [None]:
# List belonging to the top-ranked actor.
counter[1]

In [None]:
# The 30 top-ranked actors.
initial_actors[1:30]

In [None]:
# Recompute the actors in their original, unsorted order.
initial_actors = unique(test[ismissing.(test.rt_from), "username"]);

In [None]:
using BenchmarkTools

# Time the lookup-and-append step for the first initial actor.
counter = []

# Globals are interpolated so the benchmark does not pay for untyped global lookups.
# NOTE: the benchmarked expression pushes into `counter` on every evaluation, so
# `counter` holds many copies of the same result once the benchmark has finished.
@btime begin
    indices = findall($rts.rt_from .== $initial_actors[1])
    push!($counter, $rts.username[indices])
end

In [None]:
# foo = test[test.category .== Ref(["tweeted"]), :]
# foo = transform(groupby(foo, "username"), "created_at" => length => "tweet_count")
# foo = foo[foo.tweet_count .>= 1, :]
# length(unique(foo.username))


# Users with at least one row whose `rt_from` is missing. The stray leading dot of
# `.ismissing.(...)` was a syntax error; the mask is now written as in the other cells
# that compute `initial_actors`.
initial_actors = unique(test[ismissing.(test.rt_from), "username"])

In [None]:
# Number of rows of `test` that have an `rt_from` value.
sum(.!ismissing.(test.rt_from))

In [None]:
# Rows whose text starts with "RT @" but whose category is not ["retweeted"].
foo = test[startswith.(test.text, Ref("RT @")) .&& test.category .!= Ref(["retweeted"]), :]

In [None]:
# Display those rows without hiding any column.
show(foo, allcols=true)

In [None]:
find_user(test.text[1])

In [None]:
# Number of distinct users in `test`.
length(unique(test.username))

In [None]:
# Number of rows whose category equals ["quoted"].
sum(test.category .== Ref(["quoted"]))

In [None]:
followers[500]

In [None]:
# Number of zero entries in `followers`.
sum(followers .== 0)

In [None]:
# Scratch arithmetic.
261259/50

## DECREASE BINS LOGARITHMICALLY

In [None]:
# Scratch arithmetic; the two divisions by 60 presumably convert seconds to hours — confirm.
25929^2/520^2*10/60/60

In [None]:
# Number of entries of `followers` strictly between 0 and 50.
sum(0 .< followers .< 50)

In [None]:
# Number of zero entries in `followers`.
sum(followers .== 0)

In [None]:
# Number of entries of `followers` above 500000.
sum(followers .> 500000)

In [None]:
# Sanity check: indexing 1..10 with its descending sort permutation yields 10..1.
test = [1:10;]
sorting = sortperm(test; rev=true)
test[sorting]