In [47]:
import os
import pandas as pd
import numpy as np


In [48]:
# Locations of the dataset on disk; every split lives in its own sub-directory
# of root_dir/mantis_dir.
root_dir = '../datasets/'
mantis_dir = 'mantis_rus_dataset/mantis_rus_dataset'
dirs2check = ["dev", "train", "test"]

# Maps "<split>/<file stem>" -> list of file names in that split sharing the stem
# (the stem is everything before the first "." in the file name).
file_pair = {}

for d2check in dirs2check:
    # os.listdir returns entries in arbitrary order; sorting keeps the row order
    # of everything built from file_pair reproducible across machines and runs.
    for fname in sorted(os.listdir(os.path.join(root_dir, mantis_dir, d2check))):
        stem = fname.split(".")[0]
        if not stem:
            # Dot-entries (name starts with ".") have an empty stem and would
            # produce a bogus "<split>/" key, so they are skipped.
            continue
        name = os.path.join(d2check, stem)
        file_pair.setdefault(name, []).append(fname)

In [49]:
# Read every sample listed in file_pair: texts[i] holds the content of
# "<key>.txt" and manips[i] holds the raw lines of "<key>.labels.tsv".
texts = []
manips = []
for sample_key in file_pair:
    sample_base = os.path.join(root_dir, mantis_dir, sample_key)

    # The encoding is stated explicitly so the Cyrillic text is decoded the same
    # way regardless of the platform's default locale.
    # NOTE(review): assumes the corpus is stored as UTF-8 - confirm.
    with open(sample_base + ".txt", "r", encoding="utf-8") as f:
        # Lines keep their trailing newline and are additionally joined by a space.
        texts.append(" ".join(f.readlines()))

    with open(sample_base + ".labels.tsv", "r", encoding="utf-8") as f:
        # One list entry per label line; an empty file yields an empty list.
        manips.append(f.readlines())

In [50]:
# Sanity check: show the first loaded text followed by its raw label lines.
print(texts[0], manips[0], sep="\n")

Объявленные в отношении ВТБ санкции позволяют банку и контрагентам в США постепенно завершить операции, заявил член правления ВТБ.
['00534_05653\tAppeal_to_Authority\t104\t130\n']


In [51]:
# One row per sample: the text and the list of its raw label lines.
# Column order follows the insertion order of the dict keys.
manip_df = pd.DataFrame({"Content": texts, "manipulations": manips})

In [52]:
# Number of label lines recorded for each sample.
manip_df["manipulations_nums"] = manip_df["manipulations"].map(len)

In [53]:
# Preview the first five rows.
manip_df.head(n=5)

Unnamed: 0,Content,manipulations,manipulations_nums
0,Объявленные в отношении ВТБ санкции позволяют ...,[00534_05653\tAppeal_to_Authority\t104\t130\n],1
1,Советник главы офиса президента Украины Михаил...,"[00890_05009\tLoaded_Language\t136\t161\n, 008...",2
2,Советник главы Офиса президента Арестович заяв...,"[00339_07749\tAppeal_to_Authority\t166\t181\n,...",2
3,"Стало известно, почему был нанесен удар по кие...",[08765_07686\tCausal_Oversimplification\t16\t8...,1
4,"А все потому, что шаббат заканчивается в суббо...",[],0


In [54]:
# Summary statistics of the only numeric column (label count per sample).
manip_df[["manipulations_nums"]].describe()

Unnamed: 0,manipulations_nums
count,1255.0
mean,2.115538
std,4.073708
min,0.0
25%,0.0
50%,1.0
75%,2.0
max,58.0


In [55]:
# Partition manip_df into three frames by label count.
# NOTE(review): the variable names suggest cut points at 20/40/60, but the
# bounds actually applied are [0, 10], (10, 15] and (15, 60].
manip_counts = manip_df['manipulations_nums']
df_0_to_20 = manip_df.loc[manip_counts.ge(0) & manip_counts.le(10)]
df_21_to_40 = manip_df.loc[manip_counts.gt(10) & manip_counts.le(15)]
df_41_to_60 = manip_df.loc[manip_counts.gt(15) & manip_counts.le(60)]


In [56]:
def average_manipulations(dataframe):
    """Print the mean of the 'manipulations_nums' column and the row count.

    Args:
        dataframe: a DataFrame that has a 'manipulations_nums' column.

    Returns:
        None. The summary is written to standard output.
    """
    row_count = len(dataframe)
    mean_count = dataframe['manipulations_nums'].mean()
    print(f"Average manipulations_nums: {mean_count}. Count of rows: {row_count}")

In [57]:
# Report the full frame first, then each of the three count-based subsets.
for subset in (manip_df, df_0_to_20, df_21_to_40, df_41_to_60):
    average_manipulations(subset)

Average manipulations_nums: 2.1155378486055776. Count of rows: 1255
Average manipulations_nums: 1.5474031327287716. Count of rows: 1213
Average manipulations_nums: 12.818181818181818. Count of rows: 22
Average manipulations_nums: 24.8. Count of rows: 20


In [58]:
# Bucket every row by its label count:
#   [0, 10]  -> level 1
#   (10, 15] -> level 2
#   (15, 60] -> level 3
# The middle bucket previously tested "> 15 and <= 15", which can never be true,
# so level 2 was never assigned and counts 11..15 fell through to the default.
conditions = [
    (manip_df['manipulations_nums'] >= 0) & (manip_df['manipulations_nums'] <= 10),
    (manip_df['manipulations_nums'] > 10) & (manip_df['manipulations_nums'] <= 15),
    (manip_df['manipulations_nums'] > 15) & (manip_df['manipulations_nums'] <= 60)
]

# Define the values to be set for each condition
values = [1, 2, 3]

# Use numpy's select function to apply the conditions; rows matching none of
# them (count above 60) receive np.select's default value of 0.
manip_df['Suspicious_Level'] = np.select(conditions, values)

In [60]:
import random
from datetime import datetime, timedelta

# Attach synthetic channel/message metadata to EVERY row of manip_df.
# Generating a fixed 8 rows left NaN/NaT in all rows past the eighth once the
# frames were put side by side, so each column is sized to the frame here.
n_rows = len(manip_df)
channel_names = ['Channel_A', 'Channel_B', 'Channel_C']

# Local, seeded generator: re-running the cell reproduces the same ids and the
# global `random` state is left untouched.
rng = random.Random(42)

# NOTE(review): ChannelId is drawn independently per row, so rows sharing a
# ChannelName get different ids - confirm whether consumers expect a stable
# id per channel.
new_columns_data = {
    'ChannelName': [channel_names[i % len(channel_names)] for i in range(n_rows)],
    'ChannelId': [str(rng.randint(10**9, 10**10 - 1)) for _ in range(n_rows)],  # Random 10-digit number
    'MessageId': [str(rng.randint(10000, 99999)) for _ in range(n_rows)],  # Random 5-digit number
    'Date': [datetime(2023, 1, 1) + timedelta(days=i) for i in range(n_rows)],
    'EditDate': [datetime(2023, 1, 2) + timedelta(days=i+1) for i in range(n_rows)]  # Always two days after Date
}

# Share manip_df's index so the join below lines rows up one-to-one.
new_columns_df = pd.DataFrame(new_columns_data, index=manip_df.index)

# Drop any metadata columns left over from a previous run of this cell before
# joining, so re-running does not create duplicate columns.
manip_df = manip_df.drop(columns=list(new_columns_data), errors='ignore').join(new_columns_df)

# Format 'Date' and 'EditDate' columns
manip_df['Date'] = manip_df['Date'].dt.strftime('%Y-%m-%d %H:%M:%S')
manip_df['EditDate'] = manip_df['EditDate'].dt.strftime('%Y-%m-%d %H:%M:%S')

In [64]:
# The raw label lines and their count are not part of the exported table.
manip_df = manip_df.drop(columns=['manipulations', 'manipulations_nums'])

In [65]:
# Preview the first five rows of the table that will be exported.
manip_df.head(n=5)

Unnamed: 0,Content,Suspicious_Level,ChannelName,ChannelId,MessageId,Date,EditDate
0,Объявленные в отношении ВТБ санкции позволяют ...,1,Channel_A,2905871876,15985,2023-01-01 00:00:00,2023-01-03 00:00:00
1,Советник главы офиса президента Украины Михаил...,1,Channel_B,4698619027,80166,2023-01-02 00:00:00,2023-01-04 00:00:00
2,Советник главы Офиса президента Арестович заяв...,1,Channel_C,8982396021,45117,2023-01-03 00:00:00,2023-01-05 00:00:00
3,"Стало известно, почему был нанесен удар по кие...",1,Channel_A,2431123677,16923,2023-01-04 00:00:00,2023-01-06 00:00:00
4,"А все потому, что шаббат заканчивается в суббо...",1,Channel_B,3536374004,53849,2023-01-05 00:00:00,2023-01-07 00:00:00


In [66]:
# Write the final table next to the source datasets, without the index column.
output_csv_path = os.path.join(root_dir, 'mantis_rus_data.csv')
manip_df.to_csv(output_csv_path, index=False)