# Dataset

In [33]:
import math

import torch
import pandas as pd
import numpy as np

In [34]:
from pathlib import Path

# Root folder for the input spreadsheets and the files derived from them.
DATA_PATH = Path('data/')
# Cache and saved-model folders live directly under the data root.
DATA_CACHE = DATA_PATH / 'cache_dir'
DATA_PATH_SAVE_MODELS = DATA_PATH / 'models'
# Synthetic data is kept in its own top-level folder, outside the data root.
DATA_SYNTHETIC = Path('synthetic/')

# Create every working folder up front; folders that already exist are left as-is.
for _folder in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS, DATA_SYNTHETIC):
    _folder.mkdir(parents=True, exist_ok=True)

# Let text cells display up to 500 characters before pandas truncates them.
pd.set_option('display.max_colwidth', 500)

## Facebook Data

In [35]:
# Load the Facebook spreadsheet. skiprows=1 discards the file's first row, so
# the row after it is the one parsed as the column header.
facebook_df = pd.read_excel(io=DATA_PATH / 'facebook_data.xlsx', skiprows=1)
# Preview the first record to check that the columns were parsed as expected.
facebook_df.head(n=1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Message,Link,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),,,,,,,,,189.33


In [36]:
# List every column name in the loaded frame (the .head() preview elides the middle ones).
facebook_df.columns

Index(['Group Name', 'User Name', 'Facebook Id', 'Page Category',
       'Page Admin Top Country', 'Page Description', 'Page Created',
       'Likes at Posting', 'Followers at Posting', 'Post Created',
       'Post Created Date', 'Post Created Time', 'Type', 'Total Interactions',
       'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry',
       'Care', 'Video Share Status', 'Is Video Owner?', 'Post Views',
       'Total Views', 'Total Views For All Crossposts', 'Video Length', 'URL',
       'Message', 'Link', 'Final Link', 'Image Text', 'Link Text',
       'Description', 'Sponsor Id', 'Sponsor Name', 'Sponsor Category',
       'Overperforming Score (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )'],
      dtype='object')

In [37]:
# The post message is the starting point for the "text" column.
facebook_df["text"] = facebook_df["Message"]

# "Link Text" or "Description" may be empty: replace missing values with ''
# before joining, then trim the separator left over when either side is blank.
_link_title = facebook_df["Link Text"].fillna('')
_link_description = facebook_df["Description"].fillna('')
facebook_df["link_text"] = (_link_title + " " + _link_description).str.strip()

# Helper that blanks out text consisting of a single word (or no words at all).
def remove_single_word(text: str) -> str:
    """Return an empty string for text that contains at most one word.

    Args:
        text: The string to check. Words are counted by splitting on
            whitespace.

    Returns:
        ``""`` when ``text`` has zero or one whitespace-separated word
        (this includes empty and whitespace-only strings); otherwise
        ``text`` itself, unchanged.
    """
    word_count = len(text.split())
    return text if word_count > 1 else ""

# Run the single-word check on both text columns; missing values are turned
# into '' first so the helper always receives a string.
for _column in ("text", "link_text"):
    facebook_df[_column] = facebook_df[_column].fillna('').apply(remove_single_word)


In [38]:
# Spot-check one random row, including the derived "text" and "link_text" columns.
# A fixed random_state makes the displayed row identical on every
# Restart & Run All instead of changing with each execution.
facebook_df.sample(1, random_state=42)


Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
4097,Health Care Assistants HCA / Nurses / Support Workers - JOBS (UK),,629521900975276,none,,,,20783.0,,2022-06-12 16:51:29 EDT,...,,,UK SPONSORS - JobsAbroad,UK Sponsor List for Care Homes Suitable for Residential care home managers Caregivers Registered nurses Others involved within the care home industry You are buying… A list of employers approved by the UK Home Office to give job offers in the care home sector Contains 129 records with sponsor name...,,,,1.58,"Looking for a caregivers job in the UK? If yes, you need a UK sponsor approved by the UK home office. You can buy this list for 69 US dollars. A spreadsheet with 129 records. Each record contains the employer name with an url link which either goes to their careers/job page or their contact page. You then apply yourself to the contacts you like and save on agency fees. Buy this list of UK sponsors for carehomes at [https://jobsabroad.lk/uk-sponsors/](https://jobsabroad.lk/uk-sponsors/?fbclid...",UK SPONSORS - JobsAbroad UK Sponsor List for Care Homes Suitable for Residential care home managers Caregivers Registered nurses Others involved within the care home industry You are buying… A list of employers approved by the UK Home Office to give job offers in the care home sector Contains 129 records with sponsor name...


In [39]:
# Save the frame, including the derived "text" and "link_text" columns, next to
# the source spreadsheet; index=False leaves the row index out of the file.
facebook_df.to_excel(excel_writer=DATA_PATH / 'facebook_data_to_model.xlsx', index=False)