In [None]:
!apt install tesseract-ocr tesseract-ocr-rus -y
!pip install faster_whisper
!pip install moviepy

In [None]:
import os
import pandas as pd

import re
import time
from typing import Any

import cv2
import pytesseract
import numpy as np
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
from plotly.subplots import make_subplots
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from moviepy.editor import VideoFileClip

import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
root_dir = '/kaggle/input/disinformation-detection'

In [None]:
STOP_WORDS = ['youtube', 'telegram', 'телеграм', 'twitter', 'url', 'image', 'emoji']

In [None]:
def get_n_grams(df, n_grams, top_n=10):
    """Count n-grams over ``df["Content"]`` and return the most frequent ones.

    Args:
        df: DataFrame with a text column named "Content".
        n_grams: n-gram size; used as both bounds of ``ngram_range``.
        top_n: maximum number of rows to return.

    Returns:
        DataFrame with columns "words" and "counts" (int), sorted by
        descending "counts" and truncated to ``top_n`` rows.
    """
    texts = df["Content"].tolist()
    # max_df must be the float 1.0 (a proportion: keep everything). The integer 1
    # is an absolute document count and would discard every n-gram that occurs
    # in more than one document, i.e. exactly the frequent ones.
    vec = CountVectorizer(
        ngram_range=(n_grams, n_grams),
        min_df=1,
        max_df=1.0,
        stop_words=STOP_WORDS,
    ).fit(texts)
    # Column sums of the document-term matrix = total occurrences per n-gram.
    totals = vec.transform(texts).sum(axis=0)
    words_freq = [(word, totals[0, idx]) for word, idx in vec.vocabulary_.items()]
    df_words = (
        pd.DataFrame.from_records(words_freq, columns=["words", "counts"])
        .sort_values(by="counts", ascending=False)
        .iloc[:top_n, :]
        .reset_index(drop=True)
    )
    df_words["counts"] = df_words["counts"].astype(int)
    return df_words



def get_n_grams_by_suspicious_level(train, target_cls, n_grams, top_n=10):
    """Count n-grams separately for each value of ``Suspicious_Level``.

    Args:
        train: DataFrame with columns "Content" and "Suspicious_Level".
        target_cls: iterable of ``Suspicious_Level`` values to process.
        n_grams: n-gram size; used as both bounds of ``ngram_range``.
        top_n: maximum number of rows kept per level.

    Returns:
        DataFrame with columns "Suspicious_Level", "words" and "counts" (int);
        for every level the rows are sorted by descending "counts".
    """
    columns = ["Suspicious_Level", "words", "counts"]
    per_level = []
    for dt in tqdm(target_cls):
        texts = train.loc[train['Suspicious_Level'] == dt, "Content"].tolist()
        # max_df must be the float 1.0 (a proportion: keep everything). The
        # integer 1 is an absolute document count and would discard every
        # n-gram that occurs in more than one document.
        vec = CountVectorizer(
            ngram_range=(n_grams, n_grams),
            min_df=1,
            max_df=1.0,
            stop_words=STOP_WORDS,
        ).fit(texts)
        # Column sums of the document-term matrix = total occurrences per n-gram.
        totals = vec.transform(texts).sum(axis=0)
        words_freq = [(word, totals[0, idx]) for word, idx in vec.vocabulary_.items()]
        cvec_df = (
            pd.DataFrame.from_records(words_freq, columns=["words", "counts"])
            .sort_values(by="counts", ascending=False)
            .iloc[:top_n, :]
        )
        cvec_df.insert(0, "Suspicious_Level", dt)
        per_level.append(cvec_df)

    # Concatenate once instead of growing a frame inside the loop; pd.concat
    # rejects an empty list, so an empty `target_cls` yields an empty frame.
    if per_level:
        df_words = pd.concat(per_level, ignore_index=True)
    else:
        df_words = pd.DataFrame(columns=columns)
    df_words["counts"] = df_words["counts"].astype(int)
    return df_words

In [None]:
def plot_n_grams(df, ngram_name, top_n=10):
    """Build a bar chart of the `top_n` rows of `df` with the highest counts.

    Args:
        df: DataFrame with columns "words" and "counts".
        ngram_name: label inserted into the figure title (e.g. "1", "2").
        top_n: number of highest-count rows to plot.

    Returns:
        The plotly figure created by ``px.bar``.
    """
    # After an ascending sort the most frequent rows are at the end, so take the
    # tail; `head` would select the *least* frequent rows whenever `df` holds
    # more than `top_n` rows. Row order stays ascending, as before.
    top_rows = df.sort_values(by="counts").tail(top_n)
    fig = px.bar(
        top_rows,
        x="counts",
        y="words",
        color="counts",
        title=f"Most used {ngram_name}-grams",
    )

    fig.update_coloraxes(showscale=False)
    fig.update_layout(
        xaxis=dict(autorange="reversed"),
        yaxis=dict(categoryorder='total ascending', tickangle=-90),
    )

    return fig


def plot_n_grams_by_suspicious_level(df, ngram_name, cls=1, top_n=10):
    """Build a bar chart of the most frequent n-grams of one suspicious level.

    Args:
        df: DataFrame with columns "Suspicious_Level", "words" and "counts".
        ngram_name: label inserted into the figure title (e.g. "1", "2").
        cls: value of "Suspicious_Level" whose rows are plotted.
        top_n: number of highest-count rows to plot.

    Returns:
        The plotly figure created by ``px.bar``.
    """
    level_rows = df[df["Suspicious_Level"] == cls]
    # After an ascending sort the most frequent rows are at the end, so take the
    # tail; `head` would select the *least* frequent rows whenever the level
    # holds more than `top_n` rows. Row order stays ascending, as before.
    top_rows = level_rows.sort_values(by="counts").tail(top_n)
    fig = px.bar(
        top_rows,
        x="counts",
        y="words",
        color="counts",
        title=f"Most used {ngram_name}-grams (Suspicious_Level = {cls})",
    )

    fig.update_coloraxes(showscale=False)
    fig.update_layout(
        xaxis=dict(autorange="reversed"),
        yaxis=dict(categoryorder='total ascending', tickangle=-90),
    )

    return fig

# Main Dataset

In [None]:
train_df = pd.read_csv(os.path.join(root_dir, "train.csv"))
test_df = pd.read_csv(os.path.join(root_dir, "test.csv"))

In [None]:
train_df

In [None]:
test_df

In [None]:
train_df.info()

In [None]:
test_df.info()

In [None]:
train_df['Date'] = pd.to_datetime(train_df['Date'])
test_df['Date'] = pd.to_datetime(test_df['Date'])

In [None]:
train_df.isnull().sum(axis=0)

In [None]:
test_df.isnull().sum(axis=0)

In [None]:
train_df["Date"].dt.floor('d').value_counts()

In [None]:
test_df["Date"].dt.floor('d').value_counts()

In [None]:
train_df['Suspicious_Level'].plot(kind="hist")

In [None]:
train_df.groupby(['Suspicious_Level'])['Suspicious_Level'].count()

### NGrams

In [None]:
# Top-10 uni-, bi- and tri-grams for every suspicious level found in the training set.
grams_1_df, grams_2_df, grams_3_df = (
    get_n_grams_by_suspicious_level(
        train_df, train_df["Suspicious_Level"].unique(), n_grams=size, top_n=10
    )
    for size in (1, 2, 3)
)

In [None]:
# Grid layout: one row per suspicious level, one column per n-gram size.
levels = [1, 2, 3]
grams_by_size = {"1": grams_1_df, "2": grams_2_df, "3": grams_3_df}

# Ordered by n-gram size first, then by level (matches the placement loop below).
figures = [
    plot_n_grams_by_suspicious_level(grams_df, ngram_name, cls=cls)
    for ngram_name, grams_df in grams_by_size.items()
    for cls in levels
]

# make_subplots consumes titles row by row (left to right, then top to bottom),
# so they are generated level-first to match the panel each trace lands in.
subplot_titles = [
    f"Most used {ngram_name}-grams (Suspicious_Level = {cls})"
    for cls in levels
    for ngram_name in grams_by_size
]

fig = make_subplots(rows=len(levels), cols=len(grams_by_size),
                    subplot_titles=subplot_titles,
                    shared_xaxes=True,
                   )

for i, figure in enumerate(figures):
    # Consecutive figures share an n-gram size (column) and differ by level (row).
    row = i % len(levels) + 1
    col = i // len(levels) + 1
    for trace in figure["data"]:
        # add_trace replaces the deprecated append_trace.
        fig.add_trace(trace, row=row, col=col)


fig.update_yaxes(tickangle=45)
fig.layout.update(title="N-Grams", height=1000, width=2000, showlegend=False)

fig.show()

**Who is "Ярик"?**

In [None]:
# Print the positional index of every training post whose text contains the name.
for position, post_text in enumerate(train_df["Content"].tolist()):
    if 'Ярик' not in post_text:
        continue
    print(position)

In [None]:
train_df["Content"].tolist()[472]

**Okay, it's an attempt at storytelling. This is a fluctuation caused by the small size of the subset; as the amount of data grows, the relative frequency of such cases will decrease.**

## mantis_rus_dataset

In [None]:
mantis_dir = 'mantis_rus_dataset/mantis_rus_dataset'
dirs2check = ["dev", "train", "test"]
file_pair = {}

# Group the files of each split by "<split>/<file name up to the first dot>".
for d2check in dirs2check:
    split_path = os.path.join(root_dir, mantis_dir, d2check)
    for fname in os.listdir(split_path):
        stem = fname.split(".")[0]
        name = os.path.join(d2check, stem)
        file_pair.setdefault(name, []).append(fname)

In [None]:
texts = []
manips = []
# Only the keys ("<split>/<stem>") are needed; each one is expected to have a
# matching "<stem>.txt" and "<stem>.labels.tsv" file.
for k in file_pair:
    # Explicit encoding so reading does not depend on the platform's locale.
    # NOTE(review): assumes the dataset files are UTF-8 encoded — confirm.
    with open(os.path.join(root_dir, mantis_dir, k + ".txt"), "r", encoding="utf-8") as f:
        texts.append(" ".join(f.readlines()))

    # One list entry per line of the labels file.
    with open(os.path.join(root_dir, mantis_dir, k + ".labels.tsv"), "r", encoding="utf-8") as f:
        manips.append(f.readlines())

In [None]:
manip_df = pd.DataFrame({"Content": texts, "manipulations": manips}, columns=["Content", "manipulations"])

In [None]:
manip_df["manipulations_nums"] = manip_df["manipulations"].apply(lambda x: len(x))

In [None]:
manip_df

In [None]:
manip_df.describe()

In [None]:
manip_df.plot(kind="hist")

In [None]:
grams_1_manip_df = get_n_grams(manip_df, n_grams=1, top_n=10)
grams_2_manip_df = get_n_grams(manip_df, n_grams=2, top_n=10)
grams_3_manip_df = get_n_grams(manip_df, n_grams=3, top_n=10)

In [None]:
grams_1_manip_df

In [None]:
# One figure (and one subplot title) per n-gram size. Listing each figure
# several times would stack identical traces on top of each other in a panel.
figures = [
    plot_n_grams(grams_1_manip_df, "1"),
    plot_n_grams(grams_2_manip_df, "2"),
    plot_n_grams(grams_3_manip_df, "3"),
]

subplot_titles = (
    "Most used 1-grams",
    "Most used 2-grams",
    "Most used 3-grams",
)

fig = make_subplots(rows=1, cols=len(figures),
                    subplot_titles=subplot_titles,
                    shared_xaxes=True,
                   )

for col, figure in enumerate(figures, start=1):
    for trace in figure["data"]:
        # add_trace replaces the deprecated append_trace.
        fig.add_trace(trace, row=1, col=col)


fig.update_yaxes(tickangle=45)
fig.layout.update(title="N-Grams", height=500, width=2000, showlegend=False)

fig.show()

## emnlp_trans_rus_dataset

In [None]:
# NOTE(review): `mantis_dir`, `dirs2check` and `file_pair` are rebound here for the
# emnlp_trans_rus dataset; despite its name, `mantis_dir` now points to that dataset.
mantis_dir = 'emnlp_trans_rus_dataset/emnlp_trans_rus_dataset'
dirs2check = ["dev", "train", "test"]
file_pair = {}

# Group the files of each split by "<split>/<file name up to the first dot>".
for d2check in dirs2check:
    for fname in os.listdir(os.path.join(root_dir, mantis_dir, d2check)):
        name = os.path.join(d2check, fname.split(".")[0])
        # Append this file name to the list collected for `name` so far.
        file_pair[name] = file_pair.get(name, []) + [fname]

In [None]:
texts = []
manips = []
# Only the keys ("<split>/<stem>") are needed; each one is expected to have a
# matching "<stem>.txt" and "<stem>.labels.tsv" file.
for k in file_pair:
    # Explicit encoding so reading does not depend on the platform's locale.
    # NOTE(review): assumes the dataset files are UTF-8 encoded — confirm.
    with open(os.path.join(root_dir, mantis_dir, k + ".txt"), "r", encoding="utf-8") as f:
        texts.append(" ".join(f.readlines()))

    # One list entry per line of the labels file.
    with open(os.path.join(root_dir, mantis_dir, k + ".labels.tsv"), "r", encoding="utf-8") as f:
        manips.append(f.readlines())

In [None]:
emnlp_trans_rus_df = pd.DataFrame({"Content": texts, "manipulations": manips}, columns=["Content", "manipulations"])

In [None]:
emnlp_trans_rus_df["manipulations_nums"] = emnlp_trans_rus_df["manipulations"].apply(lambda x: len(x))

In [None]:
emnlp_trans_rus_df

In [None]:
emnlp_trans_rus_df.describe()

In [None]:
emnlp_trans_rus_df.plot(kind="hist")

In [None]:
emnlp_trans_rus_df.sort_values('manipulations_nums', ascending=False)

**362 manipulations in a single post look suspicious. Let's see what this text is.**

In [None]:
# Select the post with the most manipulations by value rather than by a fixed
# position: row order follows os.listdir, whose ordering is arbitrary.
emnlp_trans_rus_df.loc[emnlp_trans_rus_df['manipulations_nums'].idxmax(), 'Content']

**This text is quite long, presumably much longer than the average post, I leave it up to you to check the distribution of post lengths in the entire dataset.**

## unsupervised_data

In [None]:
unsupervised_df = pd.read_csv(os.path.join(root_dir, 'unsupervised_data.csv'))
unsupervised_df['Date'] = pd.to_datetime(unsupervised_df['Date'])

In [None]:
unsupervised_df

In [None]:
# unsupervised_df["Date"].dt.floor('d').value_counts()

## Media

Let's return to the main dataset

In [None]:
train_df

In [None]:
test_df

Remember that media files are stored as `{train or test}/{ChannelId}/{MessageId}.{file extension}`.



In [None]:
media_dir = 'media/media'
dirs2check = ['train', 'test']
file_pair = {'train': {}, 'test': {}}

# For every split, map the integer file stem to the list of file names found
# for it across all channel directories.
# NOTE(review): the key is derived from the file name only, so files with the
# same numeric stem in different channel directories end up under one key.
for d2check in dirs2check:
    split_path = os.path.join(root_dir, media_dir, d2check)
    for ch_id_dir in os.listdir(split_path):
        channel_path = os.path.join(split_path, ch_id_dir)
        for fname in os.listdir(channel_path):
            key = int(fname.partition('.')[0])
            file_pair[d2check].setdefault(key, []).append(fname)

In [None]:
posts_have_media_in_train = list(file_pair['train'].keys())

In [None]:
df_train_with_media_df = train_df[train_df['MessageId'].isin(posts_have_media_in_train)]
df_train_with_media_df

In [None]:
df_train_with_media_df['Suspicious_Level'].plot(kind="hist")

In [None]:
posts_have_media_in_test = list(file_pair['test'].keys())
len(posts_have_media_in_test)

In [None]:
# Map each training post that has at least one JPEG attachment to the first
# such file. All files of a post are inspected, because a post may have several
# files and the listing order is arbitrary; the extension is matched as a
# case-insensitive suffix rather than as a substring.
posts_with_imgs = {}

for k, v in file_pair['train'].items():
    jpg_files = [fname for fname in v if fname.lower().endswith('.jpg')]
    if jpg_files:
        posts_with_imgs[k] = jpg_files[0]

### Let's check a couple of images

In [None]:
df_train_with_media_df[df_train_with_media_df['MessageId'] == 280442]

In [None]:
path2file = os.path.join(root_dir, 'media/media/train/1144180066/280442.jpg') 

In [None]:
img = cv2.imread(path2file, cv2.IMREAD_COLOR)
# cv2.imread returns None instead of raising when the file cannot be read;
# fail with a clear message rather than an opaque error inside cvtColor.
if img is None:
    raise FileNotFoundError(f"Could not read image: {path2file}")
# OpenCV loads images as BGR, while matplotlib expects RGB.
img = cv2.cvtColor(img, cv2.COLOR_BGR2RGB)
plt.imshow(img)
plt.show()

In [None]:
text = pytesseract.image_to_string(path2file, lang='rus')
print(text)

### Let's extract text from audio

In [None]:
path2audio_file = os.path.join(root_dir, 'media/media/train/1144180066/280336.mp4')

In [None]:
video_path = path2audio_file
output_audio_path = 'output_audio.wav'

# Extract the audio track of the video into a WAV file.
video_clip = VideoFileClip(video_path)
audio_clip = None
try:
    audio_clip = video_clip.audio
    # `audio` is None when the video has no audio track.
    if audio_clip is None:
        raise ValueError(f"No audio track found in {video_path}")
    audio_clip.write_audiofile(output_audio_path)
finally:
    # Release the clips even if writing the audio file fails.
    if audio_clip is not None:
        audio_clip.close()
    video_clip.close()

In [None]:
from faster_whisper import WhisperModel

model = WhisperModel("small")

# Transcribe the file written by the audio-extraction cell. Using
# `output_audio_path` keeps both cells in sync instead of relying on the
# working directory being /kaggle/working.
segments, info = model.transcribe(output_audio_path)
for segment in segments:
    print(f'[{segment.start:.2f} -> {segment.end:.2f}] {segment.text}')