# Dataset

In [16]:
import math

import torch
import pandas as pd
import numpy as np

In [17]:
from pathlib import Path

# Working directories used by the notebook (all relative to the current working directory).
DATA_PATH = Path('data/')
DATA_CACHE = Path('data/cache_dir/')
DATA_PATH_SAVE_MODELS = Path('data/models/')
DATA_SYNTHETIC = Path('synthetic/')

# Create every directory up front; directories that already exist are left untouched.
for directory in (DATA_PATH, DATA_CACHE, DATA_PATH_SAVE_MODELS, DATA_SYNTHETIC):
    directory.mkdir(parents=True, exist_ok=True)

# Let pandas render up to 500 characters per cell before truncating the display.
pd.set_option('display.max_colwidth', 500)

## Facebook Data

In [18]:
# Load the Facebook export; skiprows=1 discards the first row of the sheet,
# so the header is taken from the second row.
facebook_source = DATA_PATH / 'facebook_data.xlsx'
facebook_df = pd.read_excel(facebook_source, skiprows=1)

# Preview the first record.
facebook_df.head(1)

Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Message,Link,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )
0,COVID19: Real Talk from Health Care Workers around the Globe,,1073058046385811,none,,,,66994.0,,2020-04-01 22:42:32 EDT,...,We extubated 2 covid patients today and they are doing awesome! Should be on a tele floor tomorrow! (TX),,,,,,,,,189.33


In [19]:
# Show the column labels of the loaded frame.
facebook_df.columns

Index(['Group Name', 'User Name', 'Facebook Id', 'Page Category',
       'Page Admin Top Country', 'Page Description', 'Page Created',
       'Likes at Posting', 'Followers at Posting', 'Post Created',
       'Post Created Date', 'Post Created Time', 'Type', 'Total Interactions',
       'Likes', 'Comments', 'Shares', 'Love', 'Wow', 'Haha', 'Sad', 'Angry',
       'Care', 'Video Share Status', 'Is Video Owner?', 'Post Views',
       'Total Views', 'Total Views For All Crossposts', 'Video Length', 'URL',
       'Message', 'Link', 'Final Link', 'Image Text', 'Link Text',
       'Description', 'Sponsor Id', 'Sponsor Name', 'Sponsor Category',
       'Overperforming Score (weighted  —  Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x )'],
      dtype='object')

In [20]:
# The post body is taken from the "Message" column as-is.
facebook_df["text"] = facebook_df["Message"]

# "Link Text" or "Description" may be missing: treat a missing value as an
# empty string, join the two with a single space and trim the outer whitespace.
link_title = facebook_df["Link Text"].fillna('')
link_description = facebook_df["Description"].fillna('')
facebook_df["link_text"] = (link_title + " " + link_description).str.strip()

# Drops texts that contain at most one word.
def remove_single_word(text):
    """Return ``text`` unchanged when it has two or more words, otherwise ``""``.

    Words are counted by splitting on whitespace, so an empty or
    whitespace-only string counts as zero words and is blanked as well.
    """
    return text if len(text.split()) > 1 else ""

# Blank out entries of "text" and "link_text" that hold at most one word
# (missing values are treated as empty strings first).
facebook_df["text"] = facebook_df["text"].fillna('').apply(remove_single_word)
facebook_df["link_text"] = facebook_df["link_text"].fillna('').apply(remove_single_word)

# Keep only the first occurrence of every distinct text; later repeats are
# replaced by "". `duplicated()` + `where()` is used instead of
# `drop_duplicates()` because the latter returns a shorter Series: assigning it
# back aligns on the index and leaves NaN in every dropped row, so the column
# would end up with a mix of "" and NaN. Working on the full-length Series
# keeps every row a string.
for column in ("text", "link_text"):
    values = facebook_df[column]
    facebook_df[column] = values.where(~values.duplicated(), "")

In [21]:
# Display one randomly chosen row as a spot check; no random_state is passed,
# so the row may differ between runs.
facebook_df.sample(1)


Unnamed: 0,Group Name,User Name,Facebook Id,Page Category,Page Admin Top Country,Page Description,Page Created,Likes at Posting,Followers at Posting,Post Created,...,Final Link,Image Text,Link Text,Description,Sponsor Id,Sponsor Name,Sponsor Category,Overperforming Score (weighted — Likes 1x Shares 1x Comments 1x Love 1x Wow 1x Haha 1x Sad 1x Angry 1x Care 1x ),text,link_text
6217,UK against Covid-19,,670592507037280,none,,,,,,2020-03-21 06:55:52 EDT,...,,,"Corner shop gives parents free care packages of baby food, milk, and wipes",'The whole community needs to come together for this.,,,,0.74,"https://www.google.com/amp/s/metro.co.uk/2020/03/20/coronavirus-uk-corner-shop-gives-parents-free-packs-baby-food-milk-wipes-12430798/amp/?fbclid=IwAR2853BYx7OAOHMGEff6yEmr19vr_yR1YBuBG6kgQ2TiqLD8Jo4eaXfPBRA Times of need bring out peoples true colours, it separates the greedy and selfish to the generous and caring!!! Big respect to everyone looking out for others!!! ❤️","Corner shop gives parents free care packages of baby food, milk, and wipes 'The whole community needs to come together for this."


In [22]:
# Write the frame (including the derived "text" and "link_text" columns) to a
# new workbook under DATA_PATH; index=False leaves the row index out of the file.
facebook_df.to_excel(DATA_PATH / 'facebook_data_to_model.xlsx', index=False)