# News sources study Skripal (Section 4.5.1 of the thesis report)

In [2]:
include("../Engine/Engine.jl")
using .Engine

using StatsBase, DataFrames, CSV
using JSON, Dates
using DataStructures
import PyPlot as plt
import Seaborn as sns

In [None]:
# Load the Skripal dataset and pass it through `skripal_dates`, then `trust_score`.
df = Skripal |> load_dataset |> skripal_dates |> trust_score

# Only the first value returned by `all_users()` is needed: a function applied to the dataframe.
f, _ = all_users()
df = df |> f;

In [None]:
# Rows of `data` belonging to partition `label`, further restricted to action "U".
function partition_rows(data, label)
    in_partition = data[data.partition .== label, :]
    return in_partition[in_partition.action .== "U", :]
end

# Rows of `data` whose username is one of `usernames`.
select_users(data, usernames) = data[data.username .∈ Ref(usernames), :]

df_before = partition_rows(df, "Before campaign")
df_during = partition_rows(df, "During campaign")
df_after = partition_rows(df, "After campaign")

# 10 most influential users before Skripal according to TE
users_TE_before = [
    "peterpobjecky", "DontDenyThe", "JJorbyn", "wherepond", "shabbirh",
    "BeeAHoney_", "JuliaPolan", "PakamamaniRenew", "TheUrbanNewz", "londonfredd",
]
df_TE_before = select_users(df_before, users_TE_before)

# 10 most influential users during Skripal according to JDD
users_JDD_during = [
    "RT_com", "newsroll", "JJorbyn", "ferozwala", "paris_2015",
    "tonybrooklyn5", "QueensIceZ", "RLSRUSSIANNEWS", "dwilliam9940", "lisa_alba",
]
df_JDD_during = select_users(df_during, users_JDD_during)
# 10 most influential users during Skripal according to TE
users_TE_during = [
    "TheRealYoG", "ferozwala", "BuggerLePanda", "ProfessorsBlogg", "Arfatweet",
    "NecktopP", "OldRightie", "RTUKnews", "ali9l9", "zerohedge",
]
df_TE_during = select_users(df_during, users_TE_during)

# 10 most influential users after Skripal according to JDD
users_JDD_after = [
    "JudeJack", "HillestadNils", "SQUADDICTS", "starandsixpence", "newsbloktwit",
    "flyer4life", "jarfizo1", "DJSiri", "Mr_Nick_Nasty", "TacticalFM",
]
df_JDD_after = select_users(df_after, users_JDD_after)
# 10 most influential users after Skripal according to TE
users_TE_after = [
    "Pline999", "Revoche", "SnakeTera", "infidelchloe", "charlievictor16",
    "Char_lotte777", "LordGamblore", "iccjock06", "StephaniePetri1", "TheUrbanNewz",
]
df_TE_after = select_users(df_after, users_TE_after)

In [None]:
# Partition under study. To analyse another partition, change ONLY the right-hand sides of
# these three assignments (e.g. df_during -> df_after, df_JDD_during -> df_JDD_after,
# df_TE_during -> df_TE_after); every figure printed below is derived from them.
# NOTE: the partitioning cell defines no JDD ranking for the "before" partition.
df_all = df_during
df_JDD = df_JDD_during
df_TE = df_TE_during

# Each entry of the `domain` column is itself a collection of domains, hence the flattening.
# The count map of all users is called `count_all` (not `count`) so that `Base.count`
# stays usable in the rest of the session.
domains = collect(Iterators.flatten(df_all.domain))
count_all = countmap(domains)

domains_JDD = collect(Iterators.flatten(df_JDD.domain))
count_JDD = countmap(domains_JDD)

domains_TE = collect(Iterators.flatten(df_TE.domain))
count_TE = countmap(domains_TE)

# Sanity check: every one of the 10 hand-picked users must appear in the selected partition.
expected_users = 10
for (label, data) in (("JDD", df_JDD), ("TE", df_TE))
    n_users = length(unique(data.username))
    if n_users != expected_users
        @warn "issue $label: expected $expected_users distinct users, found $n_users"
    end
end

# Number of occurrences of `domain` per distinct user of `data` (0 when the domain never occurs).
occurrences_per_user(counts, domain, data) =
    get(counts, domain, 0) / length(unique(data.username))

user_groups = (("All", count_all, df_all), ("JDD", count_JDD, df_JDD), ("TE", count_TE, df_TE))
outlets = (("rt", "rt.com"), ("sputnik", "sputniknews.com"))

for (i, (title, domain)) in enumerate(outlets)
    i > 1 && println("")  # blank line between two outlets
    println("$title :")
    for (label, counts, data) in user_groups
        println("$label : $(occurrences_per_user(counts, domain, data))")
    end
end

# News sources study COP26 (Section 4.5.2 of the thesis report)

In [None]:
include("../Engine/Engine.jl")
using .Engine

using StatsBase, DataFrames, CSV
using JSON, Dates
using DataStructures
import PyPlot as plt
import Seaborn as sns

In [None]:
# Load the COP26 dataset and pass it through `cop_26_dates`, then `trust_score`.
df = COP26 |> load_dataset |> cop_26_dates |> trust_score

# Only the first value returned by `all_users()` is needed: a function applied to the dataframe.
f, _ = all_users()
df = df |> f;

In [None]:
# Tweets of the "During COP26" partition, then only those whose action is "U".
during_cop26 = df[df.partition .== "During COP26", :]
df_during = during_cop26[during_cop26.action .== "U", :]


# 5 most influential users during COP26 according to JDD
users_JDD_during = [
    "globaltimesnews",
    "MSNBC",
    "PepperInVegas",
    "CGTNOfficial",
    "Chris_1791",
]
df_JDD_during = df_during[df_during.username .∈ Ref(users_JDD_during), :]

# 5 most influential users during COP26 according to TE
users_TE_during = [
    "delmartian4",
    "AdoreUSAalways",
    "JJDJ1187",
    "TheRebeluniter",
    "TimMelino",
]
df_TE_during = df_during[df_during.username .∈ Ref(users_TE_during), :];

In [None]:
# Folder holding the processed COP26 tweets, and the full paths of its entries whose path
# contains ".json".
datafolder = PROJECT_FOLDER * "/Data/Twitter/COP26_processed"
files = [file for file in readdir(datafolder, join=true) if occursin(".json", file)]


# Store `tweet["urls"]` in `urls` at every row of `df` that describes the same tweet, i.e. has
# the same timestamp, username, domain(s) and sentiment. ALL rows sharing the timestamp are
# inspected: several tweets can share the same second, and stopping at the first of them
# would leave the others without URLs. `isequal` is used so that `missing` fields (JSON
# `null`) compare as equal to each other instead of making the condition non-boolean.
function assign_full_urls!(urls, df, tweet, created_at)
    for row in findall(isequal(created_at), df.created_at)
        if isequal(tweet["username"], df[row, "username"]) &&
           isequal(tweet["domain"], df[row, "domain"]) &&
           isequal(tweet["sentiment"], df[row, "sentiment"])
            urls[row] = tweet["urls"]
        end
    end
    return urls
end


"""
    remap(files, df1, df2)

Remap tweets to the full URLs contained in them (the dataframes only contain the domains).

Every line of every file in `files` is parsed as one JSON object (JSON `null` becomes
`missing`). A parsed tweet is matched against the rows of `df1` and `df2` on its
`created_at` (fractional seconds dropped), `username`, `domain` and `sentiment` fields;
the `urls` field of the tweet is stored for each matching row. When several parsed tweets
match the same row, the last one read wins.

Both dataframes are modified in place: a `urls` column is added (or replaced). Rows for
which no tweet was found get an empty vector.

# Returns
The tuple `(df1, df2)`.
"""
function remap(files, df1, df2)

    # Built once instead of re-parsing the format string for every line.
    timestamp_format = dateformat"yyyy-mm-ddTHH:MM:SS"
    # Everything after the first '.' (the fractional seconds) is dropped before parsing.
    to_datetime = x -> DateTime(first(split(x, '.')), timestamp_format)

    df1_full_urls = [Any[] for _ in 1:nrow(df1)]
    df2_full_urls = [Any[] for _ in 1:nrow(df2)]

    for file in files
        for line in eachline(file)
            # A blank line is not a JSON document; skip it rather than fail on it.
            isempty(strip(line)) && continue

            tweet = JSON.parse(line, null=missing)
            created_at = to_datetime(tweet["created_at"])

            assign_full_urls!(df1_full_urls, df1, tweet, created_at)
            assign_full_urls!(df2_full_urls, df2, tweet, created_at)
        end
    end

    df1.urls = df1_full_urls
    df2.urls = df2_full_urls

    return df1, df2

end


df_JDD_during, df_TE_during = remap(files, df_JDD_during, df_TE_during);

In [None]:
# Table of news outlets; only its `domain` column is used by the two functions below.
news = CSV.read(PreProcessing.FULL_NEWSGUARD_TABLE, DataFrame, header=1)

"""
    effective_domain_and_urls(df, news_outlet::DataFrame)

Find the domains and URLs that were used to derive the graphs (the first match found for
tweets having more than one URL).

For every row of `df`, the domains in `df.domain` are scanned in order and the first one
that appears in `news_outlet.domain` is kept, together with the URL found at the same
position in `df.urls`. The search stops at that first match.

`df` is modified in place: the columns `effective_domain` and `effective_url` are added
(or replaced). Rows without any matching domain keep the placeholder `"0"` in both columns.

# Returns
The same `df`.
"""
function effective_domain_and_urls(df, news_outlet::DataFrame)

    # Hoisted out of the loop: membership tests against a Set instead of a table scan.
    known_domains = Set(news_outlet.domain)

    matched_domains = fill("0", nrow(df))
    matched_urls = fill("0", nrow(df))

    for (i, domains) in enumerate(df.domain)
        j = findfirst(in(known_domains), domains)
        isnothing(j) && continue
        matched_domains[i] = domains[j]
        # URLs and domains of a tweet are assumed to be stored in the same order.
        matched_urls[i] = df.urls[i][j]
    end

    df.effective_domain = matched_domains
    df.effective_url = matched_urls

    return df

end


"""
    effective_domain(df, news_outlet::DataFrame)

Find the domains that were used to derive the graphs (the first match found for tweets
having more than one URL).

For every row of `df`, the domains in `df.domain` are scanned in order and the first one
that appears in `news_outlet.domain` is kept. The search stops at that first match.

`df` is modified in place: the column `effective_domain` is added (or replaced). Rows
without any matching domain keep the placeholder `"0"`.

# Returns
The same `df`.
"""
function effective_domain(df, news_outlet::DataFrame)

    # Hoisted out of the loop: membership tests against a Set instead of a table scan.
    known_domains = Set(news_outlet.domain)

    matched_domains = fill("0", nrow(df))

    for (i, domains) in enumerate(df.domain)
        j = findfirst(in(known_domains), domains)
        isnothing(j) && continue
        matched_domains[i] = domains[j]
    end

    df.effective_domain = matched_domains

    return df

end

df_during = effective_domain(df_during, news)
df_JDD_during = effective_domain_and_urls(df_JDD_during, news)
df_TE_during = effective_domain_and_urls(df_TE_during, news);

In [None]:
# Effective news-source domain of every tweet of the JDD and TE user groups.
domains_JDD = df_JDD_during[!, :effective_domain]
domains_TE = df_TE_during[!, :effective_domain]

# Number of tweets per effective domain, for each group.
countmap_JDD, countmap_TE = countmap(domains_JDD), countmap(domains_TE)


# Effective URL of every tweet of the JDD and TE user groups.
urls_JDD = df_JDD_during[!, :effective_url]
urls_TE = df_TE_during[!, :effective_url]

In [None]:
# Bar colour of each news-source domain.
colormap = OrderedDict{String,String}()
for (domain, color) in (
    ("msnbc.com", "tab:green"),
    ("thegatewaypundit.com", "tab:orange"),
    ("zerohedge.com", "tab:purple"),
    ("foxnews.com", "tab:green"),
    ("breitbart.com", "tab:orange"),
    ("globaltimes.cn", "tab:red"),
    ("cgtn.com", "tab:red"),
    ("beckernews.com", "tab:green"),
)
    colormap[domain] = color
end

# Legend label of each colour; the insertion order is the order of the legend entries.
politicmap = OrderedDict{String,String}()
politicmap["tab:red"] = "China affiliated"
politicmap["tab:orange"] = "US far right"
politicmap["tab:purple"] = "far right"
politicmap["tab:green"] = "other"
politicmap

In [None]:
# Colour of each news source, in the iteration order of `countmap_JDD`.
colors_JDD = map(domain -> colormap[domain], collect(keys(countmap_JDD)))

# One legend patch per `politicmap` category whose colour is used by at least one bar.
legend_elements = [
    plt.matplotlib.patches.Patch(facecolor=color, label=label)
    for (color, label) in politicmap if color in colors_JDD
]

# Order sources, counts and colours by increasing number of tweets.
vals = collect(values(countmap_JDD))
indices = sortperm(vals)
names = collect(keys(countmap_JDD))[indices]
vals = vals[indices]
colors_JDD = colors_JDD[indices]

positions = 1:length(countmap_JDD)

plt.figure()
plt.barh(positions, vals, color=colors_JDD)
plt.yticks(positions, names)
plt.xlabel("Number of tweets")
plt.ylabel("News sources")
plt.legend(handles=legend_elements, loc="best")
# Keep the x-axis limits of this figure: the TE figure reuses them.
lims = plt.xlim()
# plt.savefig(PROJECT_FOLDER * "/Figures/news_source_study/JDD_COP_26_during.pdf", bbox_inches="tight");

In [None]:
# Colour of each news source, in the iteration order of `countmap_TE`.
colors_TE = map(domain -> colormap[domain], collect(keys(countmap_TE)))

# One legend patch per `politicmap` category whose colour is used by at least one bar.
legend_elements = [
    plt.matplotlib.patches.Patch(facecolor=color, label=label)
    for (color, label) in politicmap if color in colors_TE
]

# Order sources, counts and colours by increasing number of tweets.
vals = collect(values(countmap_TE))
indices = sortperm(vals)
names = collect(keys(countmap_TE))[indices]
vals = vals[indices]
colors_TE = colors_TE[indices]

positions = 1:length(countmap_TE)

plt.figure(figsize=(6.4, 4.8*4/7))
plt.barh(positions, vals, color=colors_TE)
plt.yticks(positions, names)
plt.xlabel("Number of tweets")
plt.ylabel("News sources")
plt.legend(handles=legend_elements, loc="best")
# Same x-axis limits as the JDD figure (`lims` is set by the JDD plotting cell).
plt.xlim(lims)
# plt.savefig(PROJECT_FOLDER * "/Figures/news_source_study/TE_COP_26_during.pdf", bbox_inches="tight");

In [None]:
# Distinct effective URLs shared by the TE user group.
unique_urls = unique(urls_TE)

# Print those that contain "thegatewaypundit".
print(filter(url -> occursin("thegatewaypundit", url), unique_urls))