In [1]:
import pandas as pd

data = pd.read_csv('post2ctr_dataset.csv')

In [34]:
import torch

print(torch.__version__)
torch.cuda.is_available() 

2.0.0


True

In [None]:
import torch
from PIL import Image
from base64 import b64decode
from io import BytesIO
from transformers import ViTFeatureExtractor, ViTModel

# Инициализация feature extractor и модели ViT
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

# Определение устройства (GPU или CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Перемещаем модель на GPU
vit_model = vit_model.to(device)

In [None]:
def preprocess_image(image_base64):
    # Декодируем изображение
    img = Image.open(BytesIO(b64decode(image_base64))).convert("RGB")
    
    # Преобразуем изображение для подачи в модель
    inputs = feature_extractor(images=img, return_tensors="pt")
    return inputs['pixel_values'].squeeze(0)

In [None]:
def extract_image_features_batch(batch_images):
    # Преобразуем список изображений в батч
    batch_images_tensor = torch.stack(batch_images).to(device)
    
    # Извлекаем признаки с помощью ViT
    with torch.no_grad():
        outputs = vit_model(pixel_values=batch_images_tensor)
    
    # Получаем эмбеддинг изображений (среднее по последнему слою для каждого изображения)
    batch_embeddings = outputs.last_hidden_state.mean(dim=1)
    
    return batch_embeddings

In [None]:
# Подготовка данных (предварительная обработка изображений)
preprocessed_images = [preprocess_image(img) for img in data['photo']]

In [None]:
# Параметры батчей
batch_size = 32
image_embeddings = []

# Обработка изображений в батчах
for i in range(0, len(preprocessed_images), batch_size):
    batch_images = preprocessed_images[i:i + batch_size]
    batch_embeddings = extract_image_features_batch(batch_images)
    image_embeddings.append(batch_embeddings)

In [None]:
# Объединяем все батчи в один тензор
X = torch.cat(image_embeddings)

In [3]:
import torch
from PIL import Image
from base64 import b64decode
from io import BytesIO
from transformers import ViTFeatureExtractor, ViTModel

# Инициализация feature extractor и модели ViT
feature_extractor = ViTFeatureExtractor.from_pretrained('google/vit-base-patch16-224')
vit_model = ViTModel.from_pretrained('google/vit-base-patch16-224')

# Определение устройства (GPU или CPU)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Перемещаем модель на GPU
vit_model = vit_model.to(device)

def preprocess_image(image_base64):
    # Декодируем изображение
    img = Image.open(BytesIO(b64decode(image_base64))).convert("RGB")
    
    # Преобразуем изображение для подачи в модель
    inputs = feature_extractor(images=img, return_tensors="pt")
    return inputs['pixel_values'].squeeze(0)

def extract_image_features_batch(batch_images):
    # Преобразуем список изображений в батч
    batch_images_tensor = torch.stack(batch_images).to(device)
    
    # Извлекаем признаки с помощью ViT
    with torch.no_grad():
        outputs = vit_model(pixel_values=batch_images_tensor)
    
    # Получаем эмбеддинг изображений (среднее по последнему слою для каждого изображения)
    batch_embeddings = outputs.last_hidden_state.mean(dim=1)
    
    return batch_embeddings

# Параметры батчей
batch_size = 32
image_embeddings = []

# Подготовка данных (предварительная обработка изображений)
preprocessed_images = [preprocess_image(img) for img in data['photo']]

# Обработка изображений в батчах
for i in range(0, len(preprocessed_images), batch_size):
    batch_images = preprocessed_images[i:i + batch_size]
    batch_embeddings = extract_image_features_batch(batch_images)
    image_embeddings.append(batch_embeddings)

# Объединяем все батчи в один тензор
X = torch.cat(image_embeddings)

  from .autonotebook import tqdm as notebook_tqdm
Some weights of ViTModel were not initialized from the model checkpoint at google/vit-base-patch16-224 and are newly initialized: ['vit.pooler.dense.bias', 'vit.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
X_cpu = X.cpu().numpy()
image_embeddings_df = pd.DataFrame(X_cpu, columns=[f'image_emb_{i}' for i in range(X_cpu.shape[1])])

In [17]:
# Зададим веса для каждой активности
weights = {
    'like': 1.8,           # Лайк 
    'comment': 4,          # Комментарий 
    'hide': 3,             # Скрытие поста 
    'expand': 1.4,         # Развертывание поста 
    'open_photo': 1.3,     # Открытие фото 
    'open': 1.5,           # Открытие поста 
    'share_to_message': 5  # Пересылка в личные сообщения
}

In [5]:
weights_mean = {}
for i in data.columns[1:8]:
	weights_mean[i] = round((1 / data[i].mean()) * 100, 2)

In [6]:
weights_mean

{'like': 0.26,
 'comment': 9.75,
 'hide': 9.31,
 'expand': 0.13,
 'open_photo': 0.11,
 'open': 0.17,
 'share_to_message': 1.79}

In [18]:
def calculate_weighted_conversion(row):
    weighted_sum = 0
    for activity, weight in weights.items():
        weighted_sum += row[activity] * weight 
        result = weighted_sum / row['view'] if row['view'] > 0 else 0
    return result

data['weighted_conversion'] = data.apply(calculate_weighted_conversion, axis=1)

In [26]:
data[['view', 'like', 'comment', 'hide', 'expand', 'open_photo', 'open', 'share_to_message', 'weighted_conversion']].head()

Unnamed: 0,view,like,comment,hide,expand,open_photo,open,share_to_message,weighted_conversion
0,10869,185,0,2,0,1947,14,20,0.275196
1,9083,227,1,7,4,958,23,2,0.190367
2,5352,25,5,12,598,430,114,4,0.315433
3,4260,539,5,3,1,138,62,24,0.326995
4,5676,112,2,4,371,271,499,4,0.328013


In [20]:
df = pd.concat([image_embeddings_df, data['weighted_conversion']], axis=1)

In [13]:
from lightautoml.automl.presets.tabular_presets import TabularAutoML
from lightautoml.tasks import Task
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error


train, test = train_test_split(df, test_size=0.2, random_state=42)

task = Task('reg')

automl = TabularAutoML(
		task=task, 
		timeout=3600, 
		cpu_limit=6, 
		gpu_ids='0', 
		#reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE, 'verbose': 1}
		general_params= {'use_algos': [['lgb', 'cb', 'nn', 'xgb']]}
)

'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'gensim' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.
'nlp' extra dependecy package 'nltk' isn't installed. Look at README.md in repo 'LightAutoML' for installation instructions.




In [14]:
roles = {
	'target': 'weighted_conversion'
}

In [None]:
predict = automl.fit_predict(train, roles=roles, verbose=1)
test_pred = automl.predict(test)

In [None]:
mse = mean_squared_error(y_test, test_pred.data[:, 0])
mse

In [21]:
from catboost import CatBoostRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

X = df.drop(columns=['weighted_conversion'])
y = df['weighted_conversion']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [22]:
model = CatBoostRegressor(iterations=2000, learning_rate=0.1, depth=6, verbose=100)

model.fit(X_train, y_train)
y_pred = model.predict(X_test)

0:	learn: 0.1711268	total: 40.9ms	remaining: 1m 21s
100:	learn: 0.1566233	total: 3.64s	remaining: 1m 8s
200:	learn: 0.1447851	total: 7.05s	remaining: 1m 3s
300:	learn: 0.1347530	total: 10.5s	remaining: 59.1s
400:	learn: 0.1257206	total: 14s	remaining: 55.8s
500:	learn: 0.1171321	total: 17.4s	remaining: 52.1s
600:	learn: 0.1093089	total: 20.8s	remaining: 48.3s
700:	learn: 0.1021960	total: 24.3s	remaining: 45.1s
800:	learn: 0.0955971	total: 27.8s	remaining: 41.6s
900:	learn: 0.0894491	total: 31.2s	remaining: 38.1s
1000:	learn: 0.0839759	total: 34.6s	remaining: 34.5s
1100:	learn: 0.0784017	total: 37.9s	remaining: 31s
1200:	learn: 0.0734016	total: 41.4s	remaining: 27.5s
1300:	learn: 0.0687808	total: 44.8s	remaining: 24.1s
1400:	learn: 0.0644869	total: 48.2s	remaining: 20.6s
1500:	learn: 0.0604236	total: 51.6s	remaining: 17.1s
1600:	learn: 0.0565403	total: 55s	remaining: 13.7s
1700:	learn: 0.0530303	total: 58.3s	remaining: 10.2s
1800:	learn: 0.0496694	total: 1m 1s	remaining: 6.81s
1900:	lea

In [25]:
mse = mean_squared_error(y_test, y_pred)
mse

0.027470408204398132

In [24]:
df['weighted_conversion'].describe()

count    23527.000000
mean         0.196537
std          0.171080
min          0.000762
25%          0.076025
50%          0.149113
75%          0.265293
max          2.812193
Name: weighted_conversion, dtype: float64

MSE weights:      0.027470408204398132 

MSE weights_mean: 0.11655845174410165 

In [28]:
data['text'] = data['text'].fillna('')

In [32]:
import re

def clean_text(text):
    # Удаляем HTML-теги
    text = re.sub(r'<.*?>', '', text)
    # Заменяем HTML-сущности на пробелы
    text = re.sub(r'&[a-zA-Z0-9#]+;', ' ', text)
    # Разделяем цифры и буквы
    text = re.sub(r'(\d+)([а-яА-Яa-zA-Z])', r'\1 \2', text)  # Цифры перед буквами
    text = re.sub(r'([а-яА-Яa-zA-Z])(\d+)', r'\1 \2', text)  # Буквы перед цифрами
    # Удаляем специальные символы
    text = re.sub(r'[^A-Za-zА-Яа-я0-9ёЁ.,!?;:\s]', '', text)  # Сохраняем буквы, цифры и знаки препинания
    # Удаляем лишние пробелы
    text = re.sub(r'\s+', ' ', text).strip()
    return text

# Применяем очистку к колонке 'text'
data['text'] = data['text'].apply(lambda x: clean_text(x) if isinstance(x, str) else x)

In [33]:
data['text']

0                                                         
1                                                         
2        Новость, конечно, старенькая, но все равно инт...
3                                  Фантазийные бриллианты.
4        Сегодня на стадионе Динамо прошли соревнования...
                               ...                        
23522                       Тамара, выиграет в 24 сезоне ?
23523    Продажи Manor Lords превысили 1 млн копий. Сре...
23524                                                     
23525    Магическая фраза: Уже оплачено Позвольте себе ...
23526          Старонемецкая пастушья собака Овечий пудель
Name: text, Length: 23527, dtype: object

In [36]:
from transformers import BertTokenizer, BertModel

# Загружаем предобученный BERT
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
bert_model = BertModel.from_pretrained('bert-base-uncased')

bert_model.to(device)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(30522, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [37]:
def extract_text_features(text):
    # Проверяем, что текст является строкой
    if isinstance(text, str):
        # Токенизация текста
        inputs = tokenizer(text, return_tensors="pt", truncation=True, padding=True).to(device)
    else:
        raise ValueError("Input text must be a string.")

    # Извлекаем признаки с помощью BERT
    with torch.no_grad():
        outputs = bert_model(**inputs)

    # Получаем эмбеддинг текста (например, среднее по последнему слою)
    text_embedding = outputs.last_hidden_state.mean(dim=1).squeeze().cpu().numpy()
    return text_embedding

In [39]:
# Пример: извлечение признаков для одного текста
text_embedding = extract_text_features(data.loc[0, "text"])
text_embedding

array([-9.22983885e-03, -2.21861899e-01, -3.44826400e-01,  1.65108815e-02,
       -2.91983604e-01, -2.57292479e-01,  5.23197055e-01,  8.52107406e-02,
       -1.83133408e-01, -4.25763875e-01,  3.09026003e-01, -6.58390224e-02,
        3.83387893e-01,  3.26030016e-01, -4.31871176e-01, -1.78766772e-01,
       -4.68654573e-01,  2.52397388e-01,  2.98528373e-01, -4.26865637e-01,
        6.18004858e-01, -6.09591343e-02,  1.70703977e-01,  1.85088873e-01,
        3.19058865e-01, -2.45249361e-01, -6.36368394e-02,  1.20905019e-01,
       -4.17179763e-01,  4.95105833e-02, -1.33699626e-01,  1.01185471e-01,
       -3.21497381e-01,  9.15413320e-01,  1.87661871e-01, -4.36267763e-01,
        2.97277719e-01,  5.77071607e-02,  1.94488168e-02,  1.63613111e-01,
       -2.74740100e-01, -1.54002681e-01,  1.94301501e-01, -1.01843618e-01,
        3.22204977e-02, -7.65702873e-02, -8.93381715e-01,  3.37255299e-01,
       -2.08330639e-02,  4.45742577e-01,  5.73014244e-02,  1.73623845e-01,
        2.35955715e-01,  