In [None]:
import spacy
import numpy as np

In [None]:
# Mount Google Drive at /content/drive (Google Colab only). Later cells load
# the saved models from paths under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
import pandas as pd
# NOTE: 'Your dataset path' is a placeholder; replace it with the real CSV path
# before running. Later cells read the columns 'content', 'username',
# 'inferred company' and 'date' from this frame.
dataset = pd.read_csv('Your dataset path')

In [None]:
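# One-time setup: uncomment and run these on a fresh runtime where spaCy or
# the en_core_web_md model is not installed yet.
# !pip install spacy
# !python -m spacy download en_core_web_md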

In [None]:
# Load the spaCy pipeline "en_core_web_md"; the cells below read `doc.vector`
# from documents produced by this pipeline to build the classifier features.
nlp = spacy.load("en_core_web_md")

In [None]:
# Build one spaCy document vector per row for each text feature.
# Results: x_content, x_username, x_company, x_date -- NumPy arrays with one
# row per dataset row, in dataset order.

# Batch size
batch_size = 64  # You can adjust this based on your memory capacity
num_rows = dataset.shape[0]
# Ceiling division: the old `num_rows // batch_size + 1` reported one batch
# too many whenever num_rows was an exact multiple of batch_size.
num_batches = -(-num_rows // batch_size)


def _clean_texts(series):
    """Return the series as a list of str; missing values become ''.

    nlp.pipe only accepts text input, so NaN / numeric cells (e.g. an empty
    username or a non-string date) must be normalized first.
    """
    return series.fillna("").astype(str).tolist()


def _embed_texts(texts):
    """Run `texts` through the spaCy pipeline and return their doc vectors."""
    return [doc.vector for doc in nlp.pipe(texts, batch_size=batch_size)]


# Initialize lists to hold embeddings
x_content = []
x_username = []
x_company = []
x_date = []

# Source column -> list collecting that column's vectors.
# Insertion order matches the original processing order.
feature_vectors = {
    'content': x_content,
    'username': x_username,
    'inferred company': x_company,
    'date': x_date,
}

# Process in batches so progress can be reported
for batch_number, batch_start in enumerate(range(0, num_rows, batch_size), start=1):
    # Positional slice: independent of the DataFrame's index labels
    batch = dataset.iloc[batch_start:batch_start + batch_size]

    for column, vectors in feature_vectors.items():
        vectors.extend(_embed_texts(_clean_texts(batch[column])))

    print(f"Processed batch {batch_number} out of {num_batches}")

# Convert lists to NumPy arrays (optional, depending on your use case)
x_content = np.array(x_content)
x_username = np.array(x_username)
x_company = np.array(x_company)
x_date = np.array(x_date)

# Now you can use x_content, x_username, x_company, and x_date as needed


In [None]:
# Place the four embedding matrices side by side (content, username, company,
# date) so each row holds all spaCy features for one dataset row.
x = pd.concat(
    [pd.DataFrame(block) for block in (x_content, x_username, x_company, x_date)],
    axis=1,
)

In [None]:
import joblib

In [None]:
!pip install xgboost
from xgboost import XGBRegressor, XGBClassifier

In [None]:
# Load the saved category classifier from Drive.
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
classifier = joblib.load("/content/drive/MyDrive/xgboost_model.joblib")

In [None]:
# Convert the concatenated spaCy feature frame to a plain ndarray (this rebinds
# `x`), then predict one category per row. The final cell uses y_cat[i] to pick
# which regressor handles row i.
x = np.array(x)
y_cat = classifier.predict(x)

In [None]:
# Load the six per-category regressors (cat1..cat6) from Drive, in order.
# Despite the `y_` prefix these names hold models, not predictions.
y_reg1, y_reg2, y_reg3, y_reg4, y_reg5, y_reg6 = [
    joblib.load(model_path)
    for model_path in (
        "/content/drive/MyDrive/xgb_regressor_model_cat1.joblib",
        "/content/drive/MyDrive/xgb_regressor_model_cat2.joblib",
        "/content/drive/MyDrive/xgb_regressor_model_cat3.joblib",
        "/content/drive/MyDrive/xgb_regressor_model_cat4.joblib",
        "/content/drive/MyDrive/xgb_regressor_model_cat5.joblib",
        "/content/drive/MyDrive/xgb_regressor_model_cat6.joblib",
    )
]

In [None]:
# Build the text for each row as "<content> <date> <username> <inferred company>".
# Vectorized and position-based: it does not rely on the index being 0..n-1,
# and missing / non-string cells become text instead of raising TypeError.
tweet_parts = dataset[['content', 'date', 'username', 'inferred company']].fillna('').astype(str)
tweet = (
    tweet_parts['content'] + " "
    + tweet_parts['date'] + " "
    + tweet_parts['username'] + " "
    + tweet_parts['inferred company']
).tolist()

In [None]:
# Store the combined text as a new 'tweet' column (modifies `dataset` in place);
# the BERTweet embedding cell reads dataset["tweet"].
dataset['tweet'] = tweet

In [None]:
import torch
import numpy as np
import time
from transformers import AutoModel, AutoTokenizer

# Run on the GPU when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Slow (non-fast) tokenizer for the BERTweet checkpoint.
tokenizer = AutoTokenizer.from_pretrained("vinai/bertweet-base", use_fast=False)

# Load the encoder, move it to the chosen device and switch to eval mode.
bertweet = AutoModel.from_pretrained("vinai/bertweet-base")
bertweet = bertweet.to(device).eval()

# Number of tweets embedded per forward pass.
batch_size = 64

def get_bertweet_embeddings(sentences, model, tokenizer, device, mask_padding=False, verbose=False):
    """Embed sentences by mean-pooling the model's last hidden state.

    Args:
        sentences: A list of strings (or a single string) to embed.
        model: Encoder whose output exposes `last_hidden_state`.
        tokenizer: Tokenizer matching `model`; called with padding and
            truncation to at most 128 tokens.
        device: Device the tokenized inputs are moved to.
        mask_padding: If True, average only over real tokens using the
            attention mask, so a sentence's vector does not depend on how much
            padding its batch needed. Defaults to False, which averages over
            every position including padding (the original behavior).
            NOTE(review): keep False unless the downstream models were trained
            on masked embeddings -- confirm against the training notebook.
        verbose: If True, print the padded input shape (debug aid).

    Returns:
        np.ndarray of shape (len(sentences), hidden_size). An empty list
        yields an array of shape (0, model.config.hidden_size).
    """
    # An empty batch would fail inside the tokenizer; return an empty result instead.
    if not isinstance(sentences, str) and len(sentences) == 0:
        return np.empty((0, model.config.hidden_size), dtype=np.float32)

    inputs = tokenizer(sentences, return_tensors="pt", padding=True, truncation=True, max_length=128).to(device)

    if verbose:
        print("Input tensor shape:", inputs["input_ids"].shape)

    # Inference only: no gradients needed.
    with torch.no_grad():
        outputs = model(**inputs)

    hidden = outputs.last_hidden_state
    if mask_padding:
        # Zero out padded positions and divide by the real token count.
        mask = inputs["attention_mask"].unsqueeze(-1).to(hidden.dtype)
        pooled = (hidden * mask).sum(dim=1) / mask.sum(dim=1).clamp(min=1)
    else:
        pooled = hidden.mean(dim=1)

    return pooled.cpu().numpy()

# Embed every row of dataset["tweet"] with BERTweet and save the result.
# Row k of X_test must correspond to row k of `dataset` (and of `y_cat`), so
# no row is ever dropped: empty texts are embedded as-is and a failed batch is
# replaced by NaN rows instead of being skipped.
start = time.time()
print("test data embeddings started")

test_sentences = dataset["tweet"].fillna("").astype(str).tolist()
hidden_size = bertweet.config.hidden_size

embedded_batches = []  # one array per batch; concatenated once at the end
failed_batches = []    # (start_idx, end_idx) of batches that raised

for start_idx in range(0, len(test_sentences), batch_size):
    end_idx = min(start_idx + batch_size, len(test_sentences))
    batch_sentences = test_sentences[start_idx:end_idx]

    try:
        X_batch = get_bertweet_embeddings(batch_sentences, bertweet, tokenizer, device)
    except RuntimeError as e:
        print(f"Error processing batch {start_idx} to {end_idx}: {e}")
        failed_batches.append((start_idx, end_idx))
        # Placeholder rows keep later rows aligned with dataset / y_cat.
        X_batch = np.full((len(batch_sentences), hidden_size), np.nan, dtype=np.float32)

    embedded_batches.append(X_batch)

if embedded_batches:
    X_test = np.concatenate(embedded_batches)
else:
    # Empty dataset: keep a well-formed (0, hidden_size) array.
    X_test = np.empty((0, hidden_size), dtype=np.float32)

assert len(X_test) == len(dataset), "X_test must have one row per dataset row"

if failed_batches:
    print(f"{len(failed_batches)} batch(es) failed and were filled with NaN: {failed_batches}")

np.save("content-test-company-test_full.npy", X_test)

print(f"Time required = {time.time() - start}")
print("test data embeddings ended")


In [None]:
# Predict likes for each row with the regressor that matches its category.
# Category 0..4 -> y_reg1..y_reg5; any other value -> y_reg6 (same routing as
# the original if/elif chain). Each regressor is called once on all of its rows
# instead of once per row.
regressors = (y_reg1, y_reg2, y_reg3, y_reg4, y_reg5, y_reg6)

features = np.asarray(X_test)
categories = np.asarray(y_cat)
if len(features) != len(categories):
    raise ValueError(
        f"X_test has {len(features)} rows but y_cat has {len(categories)}; "
        "the BERTweet embeddings and the predicted categories must be row-aligned"
    )

fallback_index = len(regressors) - 1
# Only exact matches of 0..4 select a dedicated regressor; everything else falls back.
regressor_index = np.where(
    np.isin(categories, np.arange(fallback_index)), categories, fallback_index
).astype(int)

# y_pred keeps the original layout: a list with one shape-(1,) array per row.
y_pred = [None] * len(features)
for index, regressor in enumerate(regressors):
    rows = np.flatnonzero(regressor_index == index)
    if rows.size == 0:
        continue
    likes = regressor.predict(features[rows].reshape(len(rows), -1))
    for row, value in zip(rows, likes):
        y_pred[row] = np.array([value])