# Dataset

In [1]:
import math

import torch
import pandas as pd
import numpy as np

In [2]:
from pathlib import Path

# Directories used by this notebook (all relative to the working directory).
DATA_PATH = Path('data/')
DATA_CACHE = Path('data/cache_dir/')
DATA_PATH_SAVE_MODELS = Path('data/models/')
DATA_SYNTHETIC = Path('synthetic/')

# Create each directory together with any missing parents; an already
# existing directory is not an error.
for _directory in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS, DATA_SYNTHETIC):
    _directory.mkdir(parents=True, exist_ok=True)

# Let pandas display up to 500 characters per cell so long texts stay readable.
pd.set_option('display.max_colwidth', 500)

## Facebook Data

In [3]:
# Read the Facebook export; skiprows=1 drops the first spreadsheet row
# before the header row is parsed.
facebook_df = pd.read_excel(io=DATA_PATH / 'facebook_data.xlsx', skiprows=1)
facebook_df.head(n=1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Message,Link,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),,,,,,,,,189.33


In [4]:
# Print a summary of the frame: column names, non-null counts and dtypes.
facebook_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6860 entries, 0 to 6859
Data columns (total 40 columns):
 #   Column                                                                                                              Non-Null Count  Dtype  
---  ------                                                                                                              --------------  -----  
 0   Group Name                                                                                                          6860 non-null   object 
 1   User Name                                                                                                           0 non-null      float64
 2   Facebook Id                                                                                                         6860 non-null   int64  
 3   Page Category                                                                                                       6860 non-null   object 
 4   Page Admin Top Cou

In [5]:
# List every column label of the loaded frame.
facebook_df.columns

Index(['Group Name', 'User Name', 'Facebook Id', 'Page Category',
       'Page Admin Top Country', 'Page Description', 'Page Created',
       'Likes at Posting', 'Followers at Posting', 'Post Created',
       'Post Created Date', 'Post Created Time', 'Type', 'Total Interactions',
       'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry',
       'Care', 'Video Share Status', 'Is Video Owner?', 'Post Views',
       'Total Views', 'Total Views For All Crossposts', 'Video Length', 'URL',
       'Message', 'Link', 'Final Link', 'Image Text', 'Link Text',
       'Description', 'Sponsor Id', 'Sponsor Name', 'Sponsor Category',
       'Overperforming Score (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )'],
      dtype='object')

In [6]:
# The working "text" column starts out as the raw post message.
facebook_df["text"] = facebook_df["Message"]

# Link Text or Description may be missing: treat NaN as an empty string,
# join the two parts with a single space, then trim surrounding whitespace.
link_title = facebook_df["Link Text"].fillna('')
link_description = facebook_df["Description"].fillna('')
facebook_df["link_text"] = (link_title + " " + link_description).str.strip()

# Helper that blanks out texts consisting of at most one word.
def remove_single_word(text):
    """Return ``text`` unchanged when it has two or more words, else ``""``.

    Words are counted with ``str.split()``, i.e. separated by any run of
    whitespace, so empty and whitespace-only strings also yield ``""``.
    """
    word_count = len(text.split())
    return text if word_count > 1 else ""

def _blank_short_and_repeated(texts):
    """Blank out uninformative entries of a text column.

    Missing values become ``""``, entries with at most one word are blanked
    via ``remove_single_word``, and every repeat of an earlier value is
    replaced by ``""`` so that only the first occurrence of each text is kept.

    Parameters
    ----------
    texts : pandas.Series
        Text column, possibly containing NaN.

    Returns
    -------
    pandas.Series
        Series with the same index and length as ``texts`` and no NaN values.
    """
    cleaned = texts.fillna("").apply(remove_single_word)
    # mask() keeps the original index, so repeats are blanked in place.
    # The earlier replace("", None).drop_duplicates().fillna("") chain shortened
    # the Series before assignment; on realignment the dropped rows came back
    # as NaN (the trailing fillna never saw them), and the meaning of an
    # explicit value=None in replace() differs between pandas versions.
    return cleaned.mask(cleaned.duplicated(keep="first"), "")


# Apply the same cleaning to both the text and link_text columns.
facebook_df["text"] = _blank_short_and_repeated(facebook_df["text"])
facebook_df["link_text"] = _blank_short_and_repeated(facebook_df["link_text"])

In [7]:
# Show one example row. The fixed random_state keeps the displayed row the
# same across "Restart & Run All" runs; change the seed to inspect other rows.
facebook_df.sample(n=1, random_state=42)


Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
1655,Covid19 Real Stories by Frontline and Affected People,,3938079882870550,none,,,,,,2020-04-06 23:11:52 EDT,...,,,Mom & Dad's House,Know anyone thinking of getting out of a large facility and moving into a home like environment? Give us a call.,,,,4.08,Can anyone help our lovely lady here? She’s putting high equity down. 😂,Mom & Dad's House Know anyone thinking of getting out of a large facility and moving into a home like environment? Give us a call.


In [8]:
# Save the frame, including the derived text / link_text columns, to an
# Excel file; the row index is not written.
facebook_df.to_excel(excel_writer=DATA_PATH / 'facebook_data_to_model.xlsx', index=False)

In [9]:
# Number of distinct groups present in the data.
unique_groups_count = facebook_df['Group Name'].nunique()

# Rows per group as a two-column frame, largest group first.
group_message_counts = (
    facebook_df['Group Name']
    .value_counts()
    .rename_axis('Group Name')
    .reset_index(name='Message Count')
)
group_message_counts


Unnamed: 0,Group Name,Message Count
0,COVID19: Real Talk from Health Care Workers around the Globe,3674
1,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),1419
2,Covid19 Real Stories by Frontline and Affected People,548
3,Key Workers Support UK,177
4,UK against Covid-19,145
5,Protect Essential Workers - Global Coronavirus Action,95
6,Grantham Healthcare Workers - Bags for Scrubs/Uniform,91
7,Covid Action UK,84
8,Covid 19 - Newmill Community Support,78
9,"NHS, Key Workers And The World Appreciation Page",72


In [10]:
# Save the per-group counts to an Excel file; the row index is not written.
group_message_counts.to_excel(excel_writer=DATA_PATH / 'group_message_counts.xlsx', index=False)