In [1]:
import re
import warnings

import cv2
import gensim
import nltk
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.subplots as sp
import pytesseract
from bs4 import BeautifulSoup
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from scipy.stats import chi2_contingency, kruskal
from tqdm import tqdm
from wordcloud import WordCloud

warnings.filterwarnings("ignore")

In [2]:
import os

# Point the IMAGEIO_FFMPEG_EXE environment variable at a specific ffmpeg binary.
# NOTE(review): this is a machine-specific absolute Homebrew path; presumably it is
# read by the video tooling imported later to locate ffmpeg — confirm, and make it
# configurable before running on another machine.
os.environ["IMAGEIO_FFMPEG_EXE"] = "/opt/homebrew/Cellar/ffmpeg/6.0_1/bin/ffmpeg"

In [3]:
# Download the NLTK resources used by this notebook: the stop-word lists and the
# "punkt" data (presumably required by word_tokenize — confirm for the installed NLTK).
nltk.download("stopwords")
nltk.download("punkt")

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/dmytro.miedviediev/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/dmytro.miedviediev/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [4]:
# Russian stop words from the NLTK corpus, stored as a set.
stop_words = set(stopwords.words("russian"))

In [97]:
# Load the media-extended training data.
df_train = pd.read_csv("../datasets/train_extended_media.csv")
# NOTE(review): df_test is read from "train_extended.csv" (a *train* file), yet it is
# later written out as "test_extended_media.csv" — confirm this is the intended input.
df_test = pd.read_csv("../datasets/train_extended.csv")
# df_unsupervised = pd.read_csv("../datasets/unsupervised_data.csv")

In [84]:
def strip_html(text):
    """Return the text content of ``text`` as extracted by BeautifulSoup's html.parser."""
    return BeautifulSoup(text, "html.parser").get_text()


def remove_between_square_brackets(text):
    """Remove every ``[...]`` group (brackets included) from ``text``.

    A group starts at ``[`` and ends at the first following ``]``; an opening
    bracket with no closing bracket is left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with all bracketed groups deleted.
    """
    # Raw string: "\[" and "\]" are invalid escape sequences in a normal string
    # literal and trigger a warning on current Python versions.
    return re.sub(r"\[[^]]*\]", "", text)


def remove_between_round_brackets(text):
    """Remove every ``(...)`` group (parentheses included) from ``text``.

    A group starts at ``(`` and ends at the first following ``)``; an opening
    parenthesis with no closing one is left untouched.

    Args:
        text: Input string.

    Returns:
        ``text`` with all parenthesised groups deleted.
    """
    # Raw string: "\(" and "\)" are invalid escape sequences in a normal string
    # literal and trigger a warning on current Python versions.
    return re.sub(r"\([^)]*\)", "", text)


def remove_urls(text):
    """Delete every run that starts with ``http`` and continues to the next whitespace."""
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", text)


# Russian stop words consulted by remove_stop_words. Kept as a set (as in the
# earlier definition in this notebook) so the per-token `in` check is a hash
# lookup instead of a scan over a list.
stop_words = set(stopwords.words("russian"))


def remove_stop_words(text, join_back=True):
    """Lower-case and tokenize ``text``, keeping alphabetic tokens that are not stop words.

    Args:
        text: Input string.
        join_back: When true, return the kept tokens joined by single spaces;
            otherwise return them as a list.

    Returns:
        A space-joined string or a list of tokens, depending on ``join_back``.
    """
    kept = [
        word
        for word in word_tokenize(text.lower())
        if word.isalpha() and word not in stop_words
    ]
    return " ".join(kept) if join_back else kept


def preprocess(text, join_back=True):
    """Run the full text-cleaning pipeline on ``text``.

    The steps run in this order: strip HTML, drop ``[...]`` groups, drop
    ``(...)`` groups, drop URLs, then tokenize and remove stop words.

    Args:
        text: Raw input string.
        join_back: Forwarded to ``remove_stop_words``; selects a joined string
            (true) or a token list (false) as the result.

    Returns:
        Whatever ``remove_stop_words`` returns for the cleaned text.
    """
    cleaning_steps = (
        strip_html,
        remove_between_square_brackets,
        remove_between_round_brackets,
        remove_urls,
    )
    cleaned = text
    for step in cleaning_steps:
        cleaned = step(cleaned)
    return remove_stop_words(cleaned, join_back)

In [92]:
# Drop the first column of df_test and take an explicit copy, so the column
# assignments below write to an independent frame rather than to a slice of
# df_test (which pandas flags as SettingWithCopy).
df_proc_train = df_test.iloc[:, 1:].copy()
# Convert date columns to datetime format
df_proc_train["Date"] = pd.to_datetime(df_proc_train["Date"])
df_proc_train["EditDate"] = pd.to_datetime(df_proc_train["EditDate"])
# Seconds between posting and editing; rows where the difference is missing get 0.
df_proc_train["TimeToEdit"] = (
    (df_proc_train["EditDate"] - df_proc_train["Date"]).dt.total_seconds().fillna(0)
)
# df_proc_train["Processed_Content"] = df_proc_train["Content"].apply(preprocess)

In [90]:
# Write the processed frame without its row index. This is the same path the
# notebook reads df_train from, so running this cell overwrites that input file.
df_proc_train.to_csv("../datasets/train_extended_media.csv", index=False)

In [94]:
# Display df_test as the cell output.
df_test

Unnamed: 0.1,Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level,Sensitive Topic,num_sensitive,Sentiment,Sentiment_Score,Emotion,Emotion_Score,Dangerous,Dangerous_Score,Media
0,0,boris_rozhin,1101806611,91626,2023-07-08 16:11:34,2023-07-08 16:11:47,Работа наших бойцов к югу от Артемовска. Работ...,2,"weapons,politics",2,neutral,0.948454,neutral,0.745498,False,0.913780,mp4
1,1,sashakots,1109403194,40853,2023-07-08 16:44:44,2023-07-08 16:44:58,"Анкара нарушила договорённости, отпустив глава...",1,"politics,racism",2,neutral,0.498936,gratitude,0.868692,False,0.948626,
2,2,swodki,1144180066,280668,2023-07-09 02:00:23,2023-07-09 02:05:53,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,none,0,neutral,0.788881,gratitude,0.769272,False,0.949768,jpg
3,3,boris_rozhin,1101806611,91573,2023-07-08 02:07:05,2023-07-08 02:07:19,МТ-ЛБ с 32-зарядной авиационной пусковой устан...,1,weapons,1,neutral,0.973560,neutral,0.906539,False,0.940683,mp4
4,4,swodki,1144180066,280695,2023-07-09 07:01:49,2023-07-09 07:05:08,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,none,0,neutral,0.788881,gratitude,0.769272,False,0.949768,jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582,582,swodki,1144180066,280784,2023-07-09 11:05:22,2023-07-09 11:05:33,"Вы начнёте читать Галковского, и вас обескураж...",1,politics,1,neutral,0.573739,neutral,0.238320,False,0.949642,
583,583,boris_rozhin,1101806611,91598,2023-07-08 10:01:34,2023-07-08 10:01:41,"После серии неудач на Запорожском направлении,...",1,politics,1,neutral,0.800057,neutral,0.452732,False,0.950803,
584,584,boris_rozhin,1101806611,91571,2023-07-08 00:06:05,2023-07-08 00:06:18,Работают как часы\n\nСлаженный механизм работы...,1,"weapons,politics",2,neutral,0.605742,neutral,0.308340,False,0.952170,mp4
585,585,boris_rozhin,1101806611,91614,2023-07-08 14:46:44,2023-07-08 14:46:54,Кадры 18+\nРезня под Глуховым: Боевики ТрО в С...,3,"terrorism,politics",2,negative,0.508161,neutral,0.711296,False,0.947267,mp4


## EDA

In [17]:
# Count missing values per column.
df_train.isnull().sum()

ChannelName          0
ChannelId            0
MessageId            0
Date                 0
EditDate            14
Content              0
Suspicious_Level     0
dtype: int64

In [18]:
# Number of rows per Suspicious_Level value.
df_train["Suspicious_Level"].value_counts()

Suspicious_Level
1    383
2    139
3     65
Name: count, dtype: int64

In [19]:
# Number of rows per channel.
df_train["ChannelName"].value_counts()

ChannelName
swodki             445
boris_rozhin        81
sashakots           15
wargonzo            15
buntariy            10
vladlentatarsky      9
aleksandr_skif       5
epoddubny            4
mod_russia           2
minpravda            1
Name: count, dtype: int64

In [20]:
# Cross-tabulate channels (rows) against suspicion levels (columns).
contingency_table = pd.crosstab(
    index=df_train["ChannelName"], columns=df_train["Suspicious_Level"]
)

print("Contingency Table:", contingency_table, sep="\n")

# Chi-square test of independence on the table; only the statistic and the
# p-value (first two returned values) are reported.
chi2, p, _, _ = chi2_contingency(observed=contingency_table)

print("\nChi-square statistic:", chi2)
print("P-value:", p)

Contingency Table:
Suspicious_Level    1    2   3
ChannelName                   
aleksandr_skif      4    1   0
boris_rozhin       36   30  15
buntariy            7    3   0
epoddubny           3    1   0
minpravda           0    0   1
mod_russia          2    0   0
sashakots          12    2   1
swodki            296  102  47
vladlentatarsky     9    0   0
wargonzo           14    0   1

Chi-square statistic: 39.37697221169696
P-value: 0.0025353617956403213


In [8]:
# Calculate the time difference
# Explicit copy: the columns added below go to an independent frame rather than
# to a slice of df_train (which pandas flags as SettingWithCopy).
df_tes_train = df_train[["Date", "EditDate", "Content", "Suspicious_Level"]].copy()
df_tes_train["Date"] = pd.to_datetime(df_tes_train["Date"])
df_tes_train["EditDate"] = pd.to_datetime(df_tes_train["EditDate"])
# Seconds between posting and editing; rows where the difference is missing get 0.
df_tes_train["TimeToEdit"] = (
    (df_tes_train["EditDate"] - df_tes_train["Date"]).dt.total_seconds().fillna(0)
)

# Perform the Kruskal-Wallis H-test on TimeToEdit across suspicion levels 1, 2 and 3
statistic, p_value = kruskal(
    *(
        df_tes_train.loc[df_tes_train["Suspicious_Level"] == level, "TimeToEdit"]
        for level in (1, 2, 3)
    )
)

# Print the result
print("Kruskal-Wallis statistic:", statistic)
print("P-value:", p_value)

Kruskal-Wallis statistic: 0.9623151267323573
P-value: 0.6180675252305834


In [78]:
# Build the text frame in one expression: `.assign` returns a new DataFrame, so
# Processed_Content is never written onto a slice of df_train.
df_content_train = df_train[["Content", "Suspicious_Level"]].assign(
    Processed_Content=lambda frame: frame["Content"].apply(preprocess)
)

In [98]:
import os

# NOTE(review): machine-specific absolute path — make configurable before sharing.
root_dir = (
    "/Users/dmytro.miedviediev/Projects/ai-house-disinformation-detection/datasets"
)
media_dir = "../datasets/media/media"
dirs2check = ["train", "test"]
# split name -> {numeric file stem -> ["<channel dir>/<file name>", ...]}
# The key is the file stem alone, so files with the same numeric name in
# different channel directories land in the same list.
file_pair = {"train": {}, "test": {}}

for d2check in dirs2check:
    split_dir = os.path.join(root_dir, media_dir, d2check)
    for ch_id_dir in os.listdir(split_dir):
        channel_dir = os.path.join(split_dir, ch_id_dir)
        if not os.path.isdir(channel_dir):
            # Stray files next to the channel folders (e.g. .DS_Store) cannot be listed.
            continue
        for fname in os.listdir(channel_dir):
            try:
                key = int(fname.split(".")[0])
            except ValueError:
                # Not a "<number>.<ext>" media file (hidden or otherwise unrelated file).
                continue
            # Append in place instead of rebuilding the list for every file.
            file_pair[d2check].setdefault(key, []).append(
                os.path.join(ch_id_dir, fname)
            )

# Rows of df_train whose MessageId has at least one media file in the train split.
posts_have_media_in_train = list(file_pair["train"].keys())
df_train_with_media_df = df_train[df_train["MessageId"].isin(posts_have_media_in_train)]
df_train_with_media_df

Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level,Sensitive Topic,num_sensitive,Sentiment,Sentiment_Score,Emotion,Emotion_Score,Dangerous,Dangerous_Score,Media,TimeToEdit
0,boris_rozhin,1101806611,91626,2023-07-08 16:11:34,2023-07-08 16:11:47,Работа наших бойцов к югу от Артемовска. Работ...,2,"weapons,politics",2,neutral,0.948454,neutral,0.745498,False,0.913780,mp4,13.0
2,swodki,1144180066,280668,2023-07-09 02:00:23,2023-07-09 02:05:53,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,none,0,neutral,0.788881,gratitude,0.769272,False,0.949768,jpg,330.0
3,boris_rozhin,1101806611,91573,2023-07-08 02:07:05,2023-07-08 02:07:19,МТ-ЛБ с 32-зарядной авиационной пусковой устан...,1,weapons,1,neutral,0.973560,neutral,0.906539,False,0.940683,mp4,14.0
4,swodki,1144180066,280695,2023-07-09 07:01:49,2023-07-09 07:05:08,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,none,0,neutral,0.788881,gratitude,0.769272,False,0.949768,jpg,199.0
7,swodki,1144180066,280492,2023-07-08 13:42:32,2023-07-08 13:42:35,"Те самые азовцы, которые были обменяны при под...",1,terrorism,1,neutral,0.496241,neutral,0.646476,False,0.950963,mp4,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
580,swodki,1144180066,280341,2023-07-08 05:31:54,2023-07-08 05:32:04,"""Я защищаю свою жену, ее семью""\n\nБразильский...",1,"politics,racism",2,neutral,0.765827,neutral,0.503804,False,0.955039,mp4,10.0
581,swodki,1144180066,280578,2023-07-08 18:58:59,2023-07-08 18:59:07,"Последствия обстрела Киевского района Донецка,...",1,"weapons,politics",2,neutral,0.892267,neutral,0.587621,False,0.936738,jpg,8.0
584,boris_rozhin,1101806611,91571,2023-07-08 00:06:05,2023-07-08 00:06:18,Работают как часы\n\nСлаженный механизм работы...,1,"weapons,politics",2,neutral,0.605742,neutral,0.308340,False,0.952170,mp4,13.0
585,boris_rozhin,1101806611,91614,2023-07-08 14:46:44,2023-07-08 14:46:54,Кадры 18+\nРезня под Глуховым: Боевики ТрО в С...,3,"terrorism,politics",2,negative,0.508161,neutral,0.711296,False,0.947267,mp4,10.0


In [50]:
# Extension (without the dot) of the first media file recorded for each message,
# or None when the message has no media or the file has no extension.
# os.path.splitext only looks at the final component's last dot, so it does not
# fail on extension-less names or dotted directories, and producing None directly
# avoids relying on how `Series.replace("", None)` treats an explicit None.
df_train["Media"] = df_train["MessageId"].apply(
    lambda _id: (
        os.path.splitext(file_pair["train"][_id][0])[1].lstrip(".") or None
        if _id in file_pair["train"]
        else None
    )
)
# NOTE(review): written with the row index, unlike the other write to this same
# path elsewhere in the notebook (index=False) — confirm which layout is wanted.
df_train.to_csv("../datasets/train_extended_media.csv")

In [99]:
# Extension (without the dot) of the first media file recorded for each message,
# or None when the message has no media or the file has no extension.
# os.path.splitext only looks at the final component's last dot, so it does not
# fail on extension-less names or dotted directories, and producing None directly
# avoids relying on how `Series.replace("", None)` treats an explicit None.
df_test["Media"] = df_test["MessageId"].apply(
    lambda _id: (
        os.path.splitext(file_pair["test"][_id][0])[1].lstrip(".") or None
        if _id in file_pair["test"]
        else None
    )
)
# NOTE(review): written with the row index — confirm that is the wanted layout.
df_test.to_csv("../datasets/test_extended_media.csv")

In [18]:
from faster_whisper import WhisperModel
from moviepy.editor import VideoFileClip

model = WhisperModel("small")
# message id -> text recovered from the first media file of that message
posts_with_media = {}
# Scratch file reused for every extracted audio track.
output_audio_path = os.path.join(root_dir, "output_audio.wav")

for k, v in tqdm(file_pair["train"].items()):
    # Only the first file recorded for a message is processed.
    path2file = os.path.join(root_dir, f"media/media/train/{v[0]}")
    if ".jpg" in v[0]:
        # OCR the image with the Russian language model.
        posts_with_media[k] = pytesseract.image_to_string(path2file, lang="rus")
    if ".mp4" in v[0]:
        video_clip = VideoFileClip(path2file)
        try:
            # A clip without an audio track has `audio` set to None; the original
            # code called a non-existent `.get` and used an undefined `audio_clip`.
            audio_clip = video_clip.audio
            if audio_clip is None:
                continue
            audio_clip.write_audiofile(output_audio_path)
            audio_clip.close()
        finally:
            # Release the clip's readers on every path, including the `continue`.
            video_clip.close()
        segments, info = model.transcribe(output_audio_path)
        posts_with_media[k] = "".join(segment.text for segment in segments)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 436/436 [00:32<00:00, 13.50it/s]


In [54]:
# Display the collected media texts as the cell output.
posts_with_media

NameError: name 'posts_with_mediaa' is not defined

In [None]:
# Attach the text recovered from each post's media; message ids that are absent
# from posts_with_media get None.
df_train["ContentMedia"] = df_train["MessageId"].apply(posts_with_media.get)

df_train

In [None]:
# Flag every row whose Content appears again later, then keep only the unflagged
# rows (i.e. the last occurrence of each distinct Content).
is_repeated_content = df_train.duplicated(subset=["Content"], keep="last")
df_train = df_train[~is_repeated_content]

In [None]:
# Write df_train (row index included) to a CSV in the current working directory.
df_train.to_csv("train_with_media.csv")

In [56]:
# Cross-tabulate media type (rows) against suspicion levels (columns).
contingency_table = pd.crosstab(
    index=df_train["Media"], columns=df_train["Suspicious_Level"]
)

print("Contingency Table:", contingency_table, sep="\n")

# Chi-square test of independence on the table; only the statistic and the
# p-value (first two returned values) are reported.
chi2, p, _, _ = chi2_contingency(observed=contingency_table)

print("\nChi-square statistic:", chi2)
print("P-value:", p)

Contingency Table:
Suspicious_Level    1   2   3
Media                        
jpg               143  32  20
mp4               123  91  27

Chi-square statistic: 26.286515521328855
P-value: 1.9586447838917636e-06


In [57]:
# Display df_train as the cell output.
df_train

Unnamed: 0,ChannelName,ChannelId,MessageId,Date,EditDate,Content,Suspicious_Level,Sensitive Topic,num_sensitive,Sentiment,Sentiment_Score,Emotion,Emotion_Score,Dangerous,Dangerous_Score,Media
0,boris_rozhin,1101806611,91626,2023-07-08 16:11:34,2023-07-08 16:11:47,Работа наших бойцов к югу от Артемовска. Работ...,2,"weapons,politics",2,neutral,0.948454,neutral,0.745498,False,0.913780,mp4
1,sashakots,1109403194,40853,2023-07-08 16:44:44,2023-07-08 16:44:58,"Анкара нарушила договорённости, отпустив глава...",1,"politics,racism",2,neutral,0.498936,gratitude,0.868692,False,0.948626,
2,swodki,1144180066,280668,2023-07-09 02:00:23,2023-07-09 02:05:53,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,none,0,neutral,0.788881,gratitude,0.769272,False,0.949768,jpg
3,boris_rozhin,1101806611,91573,2023-07-08 02:07:05,2023-07-08 02:07:19,МТ-ЛБ с 32-зарядной авиационной пусковой устан...,1,weapons,1,neutral,0.973560,neutral,0.906539,False,0.940683,mp4
4,swodki,1144180066,280695,2023-07-09 07:01:49,2023-07-09 07:05:08,ЭТО ЕДИНСТВЕННЫЙ СПОСОБ ПОМОЧЬ НАМ! \n\nПополн...,1,none,0,neutral,0.788881,gratitude,0.769272,False,0.949768,jpg
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
582,swodki,1144180066,280784,2023-07-09 11:05:22,2023-07-09 11:05:33,"Вы начнёте читать Галковского, и вас обескураж...",1,politics,1,neutral,0.573739,neutral,0.238320,False,0.949642,
583,boris_rozhin,1101806611,91598,2023-07-08 10:01:34,2023-07-08 10:01:41,"После серии неудач на Запорожском направлении,...",1,politics,1,neutral,0.800057,neutral,0.452732,False,0.950803,
584,boris_rozhin,1101806611,91571,2023-07-08 00:06:05,2023-07-08 00:06:18,Работают как часы\n\nСлаженный механизм работы...,1,"weapons,politics",2,neutral,0.605742,neutral,0.308340,False,0.952170,mp4
585,boris_rozhin,1101806611,91614,2023-07-08 14:46:44,2023-07-08 14:46:54,Кадры 18+\nРезня под Глуховым: Боевики ТрО в С...,3,"terrorism,politics",2,negative,0.508161,neutral,0.711296,False,0.947267,mp4
