# Get VK groups

In [1]:
import requests
import time
import pandas as pd
import random
from google.colab import userdata

def search_vk_groups(access_token, keywords, total_count=5000):
    """
    Searches for VK groups by given keywords, extracts detailed information,
    and returns a list of dictionaries, including the number of subscribers.
    """
    found_groups = {}  # Dictionary to store unique groups (ID -> data)
    requests_count = 0

    for query in keywords:
        offset = 0
        while len(found_groups) < total_count:
            if requests_count >= 3:
                print("3 requests have been made, pausing for 1 second...")
                time.sleep(1)
                requests_count = 0

            print(f"Searching for '{query}' with an offset of {offset}. Found groups: {len(found_groups)}")

            api_url = "https://api.vk.com/method/groups.search"
            params = {
                'q': query,
                'type': 'group',
                'count': 1000,
                'offset': offset,
                'access_token': access_token,
                'v': '5.131',
                'fields': 'city,description,members_count'  # members_count added
            }

            try:
                response = requests.get(api_url, params=params)
                response.raise_for_status()
                data = response.json()

                if 'error' in data:
                    print(f"API error: {data['error']['error_msg']}")
                    break

                groups = data['response']['items']
                if not groups:
                    break

                for group in groups:
                    if group['id'] not in found_groups:
                        group_data = {
                            'group_id': group.get('id'),
                            'name': group.get('name', ''),
                            'city_id': group.get('city', {}).get('id', ''),
                            'city_name': group.get('city', {}).get('title', ''),
                            'description': group.get('description', ''),
                            'members_count': group.get('members_count', 0)  # Get the number of subscribers
                        }
                        found_groups[group['id']] = group_data

                offset += 1000
                requests_count += 1

                if len(groups) < 1000:
                    print(f"Found fewer than 1000 groups for the query '{query}', moving to the next one.")
                    break

            except requests.exceptions.RequestException as e:
                print(f"Request error: {e}")
                break

            if len(found_groups) >= total_count:
                print(f"The target number of groups ({total_count}) has been reached.")
                break

    return list(found_groups.values())

# --- Main script execution ---

MY_ACCESS_TOKEN = userdata.get('access_token')

if MY_ACCESS_TOKEN == "YOUR_ACCESS_TOKEN":
    print("Error: please replace 'YOUR_ACCESS_TOKEN' with your real access token.")
else:
    # Keywords for food delivery groups
    food_keywords = [
        '–¥–æ—Å—Ç–∞–≤–∫–∞ –µ–¥—ã', '–ø–∏—Ü—Ü–∞', '—Å—É—à–∏', '–±—É—Ä–≥–µ—Ä—ã', '—Ä–æ–ª–ª—ã',
        '–µ–¥–∞ –Ω–∞ –¥–æ–º', '–¥–æ—Å—Ç–∞–≤–∫–∞ –æ–±–µ–¥–æ–≤', '–¥–æ—Å—Ç–∞–≤–∫–∞ —à–∞—à–ª—ã–∫–∞',
        '–¥–æ—Å—Ç–∞–≤–∫–∞ –ø–∏—Ä–æ–≥–æ–≤', '–∫—É–ª–∏–Ω–∞—Ä–∏—è', 'catering'
    ]

    # Keywords for non-food delivery groups
    non_food_keywords = [
        '–Ω–æ–≤–æ—Å—Ç–∏', '—é–º–æ—Ä', '–º–µ–º—ã', '—Ñ–æ—Ç–æ–≥—Ä–∞—Ñ–∏—è', '—Å–ø–æ—Ä—Ç',
        '–∫–Ω–∏–≥–∏', '—Ñ–∏–ª—å–º—ã', '–º—É–∑—ã–∫–∞', '–ø—É—Ç–µ—à–µ—Å—Ç–≤–∏—è', '–∏—Å–∫—É—Å—Å—Ç–≤–æ'
    ]
    # Get 5000 food delivery groups
    print("--- Parsing food delivery groups ---")
    food_delivery_groups = search_vk_groups(MY_ACCESS_TOKEN, food_keywords, total_count=5000)
    for group in food_delivery_groups:
        group['label'] = 1  # Label "food delivery"

    # Get 5000 non-food delivery groups
    print("\n--- Parsing non-food delivery groups ---")
    non_food_groups = search_vk_groups(MY_ACCESS_TOKEN, non_food_keywords, total_count=5000)
    for group in non_food_groups:
        group['label'] = 0 # Label "non-food delivery"

    # Combine both lists and shuffle
    all_groups = food_delivery_groups + non_food_groups
    random.shuffle(all_groups)

    print(f"\nCombined {len(all_groups)} groups. Saving to Excel...")

    # Create a DataFrame from the list of dictionaries
    df = pd.DataFrame(all_groups)

    # Save to Excel
    df.to_excel('vk_groups_dataset.xlsx', index=False)
    print("Data successfully saved to 'vk_groups_dataset.xlsx'.")

--- Parsing food delivery groups ---
Searching for '–¥–æ—Å—Ç–∞–≤–∫–∞ –µ–¥—ã' with an offset of 0. Found groups: 0
Found fewer than 1000 groups for the query '–¥–æ—Å—Ç–∞–≤–∫–∞ –µ–¥—ã', moving to the next one.
Searching for '–ø–∏—Ü—Ü–∞' with an offset of 0. Found groups: 997
Searching for '–ø–∏—Ü—Ü–∞' with an offset of 1000. Found groups: 1982
Searching for '—Å—É—à–∏' with an offset of 0. Found groups: 1982
Found fewer than 1000 groups for the query '—Å—É—à–∏', moving to the next one.
3 requests have been made, pausing for 1 second...
Searching for '–±—É—Ä–≥–µ—Ä—ã' with an offset of 0. Found groups: 2868
Found fewer than 1000 groups for the query '–±—É—Ä–≥–µ—Ä—ã', moving to the next one.
Searching for '—Ä–æ–ª–ª—ã' with an offset of 0. Found groups: 3320
Found fewer than 1000 groups for the query '—Ä–æ–ª–ª—ã', moving to the next one.
Searching for '–µ–¥–∞ –Ω–∞ –¥–æ–º' with an offset of 0. Found groups: 3831
Found fewer than 1000 groups for the query '–µ–¥–∞ –Ω–∞ –¥–æ–º', moving to the nex

In [2]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11617 entries, 0 to 11616
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   group_id       11617 non-null  int64 
 1   name           11617 non-null  object
 2   city_id        11617 non-null  object
 3   city_name      11617 non-null  object
 4   description    11617 non-null  object
 5   members_count  11617 non-null  int64 
 6   label          11617 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 635.4+ KB


In [3]:
df = df[df['description'].str.strip() != '']
display(df.info())

<class 'pandas.core.frame.DataFrame'>
Index: 10646 entries, 0 to 11616
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   group_id       10646 non-null  int64 
 1   name           10646 non-null  object
 2   city_id        10646 non-null  object
 3   city_name      10646 non-null  object
 4   description    10646 non-null  object
 5   members_count  10646 non-null  int64 
 6   label          10646 non-null  int64 
dtypes: int64(3), object(4)
memory usage: 665.4+ KB


None

In [4]:
df['label'].value_counts()

Unnamed: 0_level_0,count
label,Unnamed: 1_level_1
1,5407
0,5239


# Get VK posts

In [5]:
import requests
import pandas as pd
import time
import os

# Replace with your actual token
MY_ACCESS_TOKEN = userdata.get('access_token')

def get_posts_and_filter(access_token, group_id, total_count=300):
    """
    Downloads posts (up to the last 300) and removes posts with low activity.
    """
    api_url = "https://api.vk.com/method/wall.get"
    all_posts = []
    offset = 0
    remaining_posts = total_count

    # Calculate the number of requests, each for 100 posts
    num_requests = (total_count + 99) // 100

    print(f"Attempting to get {total_count} posts, which will require {num_requests} request(s).")

    for i in range(num_requests):
        params = {
            'owner_id': -group_id,
            'count': min(100, remaining_posts),
            'offset': offset,
            'access_token': access_token,
            'v': '5.131'
        }

        try:
            response = requests.get(api_url, params=params)
            response.raise_for_status()
            data = response.json()

            if 'error' in data:
                print(f"API error when getting posts: {data['error']['error_msg']}")
                return [], None

            posts = data['response']['items']
            all_posts.extend(posts)

            # If fewer posts were received than requested, it's the end.
            if len(posts) < 100:
                print(f"Received fewer than 100 posts, finishing. Total: {len(all_posts)}")
                break

            # Increase the offset for the next request
            offset += 100
            remaining_posts -= 100

        except requests.exceptions.RequestException as e:
            print(f"Request error when getting posts: {e}")
            return [], None


    # Filter posts from the last year
    current_timestamp = int(time.time())
    one_year_ago = current_timestamp - 31536000  # 365 * 24 * 60 * 60

    relevant_posts = [post for post in all_posts if post['date'] >= one_year_ago]

    if not relevant_posts:
        print(f"No posts found for the last year for group {group_id}")
        return [], None

    # Calculate average activity
    total_activity = sum(p.get('likes', {}).get('count', 0) +
                         p.get('comments', {}).get('count', 0) +
                         p.get('reposts', {}).get('count', 0) for p in relevant_posts)

    if len(relevant_posts) > 0:
        avg_activity = total_activity / len(relevant_posts)
    else:
        avg_activity = 0

    print(f"Average activity in group {group_id}: {avg_activity:.2f}")

    # Filter posts with activity above average
    filtered_posts = [
        p for p in relevant_posts
        if (p.get('likes', {}).get('count', 0) +
            p.get('comments', {}).get('count', 0) +
            p.get('reposts', {}).get('count', 0)) > avg_activity
    ]

    return filtered_posts, data['response']['count']


def main():
    try:
        # For security, use environment variables if you are working with sensitive data
        # MY_ACCESS_TOKEN = os.environ.get("VK_API_TOKEN", "YOUR_ACCESS_TOKEN")
        df = pd.read_excel('vk_groups_dataset.xlsx')
    except FileNotFoundError:
        print("Error: 'vk_groups_dataset.xlsx' file not found. Please make sure it exists.")
        return

    print("Searching for the 100 most popular groups...")

    # 0. Keep only food delivery groups
    food_groups = df[df['label'] == 1]
    food_groups.reset_index(drop=True, inplace=True)

    # 1. Top 100 most popular groups.
    top_100_groups = food_groups.sort_values(by='members_count', ascending=False).head(100)
    print("Top 100 groups:")
    print(top_100_groups[['name', 'members_count']].to_string())

    # 2. Downloading and processing posts
    all_posts_data = []

    for _, row in top_100_groups.iterrows():
        group_id = row['group_id']
        group_name = row['name']
        print(f"\nProcessing group '{group_name}' (ID: {group_id})...")

        time.sleep(1)

        # Call the modified function, which now downloads 300 posts
        filtered_posts, total_posts_count = get_posts_and_filter(MY_ACCESS_TOKEN, group_id, total_count=300)

        if total_posts_count is not None:
            print(f"Total posts found: {total_posts_count}, after filtering: {len(filtered_posts)}")

        for post in filtered_posts:
            post_data = {
                'project': group_name,
                'account': group_id,
                'date': pd.to_datetime(post.get('date'), unit='s'),
                'type': 'post',
                'link': f"https://vk.com/wall{-group_id}_{post.get('id')}",
                'text': post.get('text', ''),
                'likes': post.get('likes', {}).get('count', 0),
                'comments': post.get('comments', {}).get('count', 0),
                'reposts': post.get('reposts', {}).get('count', 0),
                'views': post.get('views', {}).get('count', 0)
            }
            all_posts_data.append(post_data)

    if not all_posts_data:
        print("\nNo suitable posts found. The table will not be created.")
        return

    # 3. Saving to Excel
    final_df = pd.DataFrame(all_posts_data)

    # Clean and order columns
    final_df = final_df[[
        'project', 'account', 'date', 'type', 'link',
        'text', 'likes', 'comments', 'reposts', 'views'
    ]]

    print(f"\nTotal of {len(final_df)} posts found and processed.")
    final_df.to_excel('vk_posts_data.xlsx', index=False)
    print("Data successfully saved to 'vk_posts_data.xlsx'.")

if __name__ == "__main__":
    if MY_ACCESS_TOKEN == "YOUR_ACCESS_TOKEN":
        print("Please replace 'YOUR_ACCESS_TOKEN' with your real access token.")
    else:
        main()

Searching for the 100 most popular groups...
Top 100 groups:
                                                  name  members_count
3649              –ó–∞–∫—Ä—É—Ç–∫–∏ –Ω–∞ –∑–∏–º—É –∏ –∑–∞–≥–æ—Ç–æ–≤–∫–∏ –Ω–∞ –∑–∏–º—É         762724
3284  –ü—Ä–æ100 –µ–¥–∞ | –†–µ—Ü–µ–ø—Ç—ã  | –°–∞–ª–∞—Ç—ã | –í—ã–ø–µ—á–∫–∞ | –¢–æ—Ä—Ç—ã         627984
3699                                          –°—É—à–∏ Wok         452234
1330                             –ß–∏–±–±–∏—Å - –¥–æ—Å—Ç–∞–≤–∫–∞ –µ–¥—ã         437902
2975       –ü—Ä–∞–≤–∏–ª—å–Ω–æ–µ –ø–∏—Ç–∞–Ω–∏–µ. –ó–¥–æ—Ä–æ–≤–∞—è –µ–¥–∞. Julromanj         294419
5015                                       –°—É—à–∏ –ú–∞—Å—Ç–µ—Ä         256100
5535                              –ì—Ä—É–ø–ø–∞ PIZZA (–ü–∏—Ü—Ü–∞)         251941
5598                        –î–æ–¥–æ –ü–∏—Ü—Ü–∞ –°–∞–Ω–∫—Ç-–ü–µ—Ç–µ—Ä–±—É—Ä–≥         207773
119                        –ó–û–ñ –ü–ü –ï–î–ê –∏ –ª–µ–≥–∫–∏–µ —Ä–µ—Ü–µ–ø—Ç—ã         177507
612                –û—Ä–∏–≥–∞–º–∏ - –¥–æ—Å—Ç–∞–≤–∫–∞ —Å—É—à–∏, 

Simple markup

In [6]:
import pandas as pd
import re
import time

def classify_post_improved(text):
    """
    Improved function to classify post text into 5 categories using
    regular expressions and expanded dictionaries for increased accuracy.
    0 - non-committal
    1 - obligating to give a discount
    2 - obligating to give a gift
    3 - obligating to give cashback
    4 - obligating to deliver on time
    """
    if not isinstance(text, str) or not text.strip():
        return 0

    text_lower = text.lower()

    # --- EXPANDED DICTIONARIES WITH REGULAR EXPRESSIONS ---

    # Class 4: On-time delivery (the rarest, check first)
    delivery_keywords = [
        r'\b–¥–æ—Å—Ç–∞–≤\S+ –∑–∞ \d+\s*(–º–∏–Ω—É—Ç|—á–∞—Å|–¥–Ω—è)', r'–¥–æ—Å—Ç–∞–≤–∏–º (–≤–æ–≤—Ä–µ–º—è|–≤ —Å—Ä–æ–∫)',
        r'–≥–∞—Ä–∞–Ω—Ç\S+ –¥–æ—Å—Ç–∞–≤–∫\S+', r'–æ–ø–æ–∑–¥–∞\S+ ‚Äî', r'–µ—Å–ª–∏ –æ–ø–æ–∑–¥–∞–µ–º',
        r'–±—ã—Å—Ç—Ä\S+ –¥–æ—Å—Ç–∞–≤–∫\S+', r'—ç–∫—Å–ø—Ä–µ—Å—Å-–¥–æ—Å—Ç–∞–≤–∫\S+',
        r'–ø—Ä–∏–≤–µ–∑–µ–º –∫ \d{2}:\d{2}', r'–¥–æ—Å—Ç–∞–≤–∫–∞ —Ç–æ—á–Ω–æ –∫–æ –≤—Ä–µ–º–µ–Ω–∏'
    ]
    if any(re.search(pattern, text_lower) for pattern in delivery_keywords):
        return 4

    # Class 3: Cashback
    cashback_keywords = [
        r'–∫[–µ—ç]—à–±[–µ—ç]–∫\S*', r'cashback', r'–≤–æ–∑–≤—Ä–∞—â–∞–µ–º \d+%', r'–≤–µ—Ä–Ω[–µ—ë]–º \d+%',
        r'\b–±–∞–ª–ª\S+ –∑–∞ –ø–æ–∫—É–ø–∫\S+', r'–ø—Ä–æ–≥—Ä–∞–º–º\S+ –ª–æ—è–ª—å–Ω–æ—Å—Ç\S+',
        r'–∫–æ–ø–∏\S+ –±–∞–ª–ª\S+', r'–Ω–∞—á–∏—Å–ª–∏\S+ –±–æ–Ω—É—Å\S+', r'–±–æ–Ω—É—Å\S+ –Ω–∞ —Å—á[–µ—ë]—Ç'
    ]
    if any(re.search(pattern, text_lower) for pattern in cashback_keywords):
        return 3

    # --- Check for Gift and Discount ---
    # First, determine the presence of indicators, then decide which class to assign

    # Class 2: Gift
    gift_keywords = [
        r'–ø–æ–¥–∞—Ä\S+', r'—Ä–æ–∑—ã–≥—Ä—ã—à\S*', r'–∫–æ–Ω–∫—É—Ä—Å\S*', r'\b–ø—Ä–∏–∑\S*',
        r'\b–±–µ—Å–ø–ª–∞—Ç–Ω\S+', r'–≤—ã–∏–≥—Ä–∞–π\S*', r'–ø–æ–±–µ–¥–∏—Ç–µ–ª\S*', r'\b–¥–∞—Ä–∏–º\b',
        r'—Ä–∞–∑—ã–≥—Ä—ã–≤–∞–µ–º', r'1\s*\+\s*1', r'—Ç—Ä–µ—Ç–∏–π –≤ –ø–æ–¥–∞—Ä–æ–∫', r'–≤—Ç–æ—Ä–æ–π –±–µ—Å–ø–ª–∞—Ç–Ω–æ'
    ]

    # Class 1: Discount
    discount_keywords = [
        r'—Å–∫–∏–¥–∫\S*', r'–ø—Ä–æ–º–æ–∫–æ–¥\S*', r'–∫—É–ø–æ–Ω\S*', r'—Å–ø–µ—Ü\S*–ø—Ä–µ–¥–ª–æ–∂–µ–Ω\S+',
        r'–¥–∏—Å–∫–æ–Ω—Ç\S*', r'-\d+%', r'—Å–∫–∏–¥–∫–∞ \d+%', r'\b–∞–∫—Ü–∏\S+',
        r'—Ä–∞—Å–ø—Ä–æ–¥–∞–∂\S*', r'—Å–Ω–∏–∂–µ–Ω\S+ —Ü–µ–Ω\S*', r'–≤—ã–≥–æ–¥\S+'
    ]

    # Check the flags
    is_gift = any(re.search(pattern, text_lower) for pattern in gift_keywords)
    is_discount = any(re.search(pattern, text_lower) for pattern in discount_keywords)

    # Priority logic: Gift (Class 2) is more important than Discount (Class 1)
    if is_gift:
        # Even if there is a mention of a discount, a giveaway or a gift is a stronger obligation
        return 2

    if is_discount:
        # This block will only be triggered if no gift indicators were found
        return 1

    # Class 0: non-committal (default)
    return 0

# --- Main code block (remains unchanged) ---

# Load data
try:
    df = pd.read_excel('vk_posts_data.xlsx')
    print("File 'vk_posts_data.xlsx' successfully loaded.")
except FileNotFoundError:
    print("Error: File 'vk_posts_data.xlsx' not found.")
    exit()

# Check for 'text' column
if 'text' not in df.columns:
    print(f"Error: The file must have a 'text' column. Found columns: {df.columns.tolist()}")
    exit()

print("Starting post classification...")
start_time = time.time()

# Apply the improved classification function
df['class'] = df['text'].apply(classify_post_improved)

end_time = time.time()
print(f"Classification completed in {end_time - start_time:.2f} seconds.")

# View class distribution for analysis
print("\nPost distribution by class:")
print(df['class'].value_counts().sort_index())

# Create and save the final DataFrame
result_df = df[['text', 'class']]
output_filename = 'vk_posts_classified.xlsx'
result_df.to_excel(output_filename, index=False)

print(f"\nClassification completed. Results saved to file '{output_filename}'")

File 'vk_posts_data.xlsx' successfully loaded.
Starting post classification...
Classification completed in 0.46 seconds.

Post distribution by class:
class
0    2864
1     209
2    1616
3      10
4      11
Name: count, dtype: int64

Classification completed. Results saved to file 'vk_posts_classified.xlsx'


Balancing classes

In [7]:
import pandas as pd
from sklearn.utils import resample # Convenient library for these tasks

# --- SETTINGS ---
INPUT_FILE = 'vk_posts_classified.xlsx' # File with results from the previous step
OUTPUT_FILE = 'vk_posts_balanced.xlsx'  # Name of the file for the balanced dataset
SAMPLES_PER_CLASS = 500                 # Target number of examples for each class
RANDOM_STATE = 42                       # For reproducibility of results

# --- LOAD DATA ---
try:
    df = pd.read_excel(INPUT_FILE)
    print(f"File '{INPUT_FILE}' successfully loaded.")
    print("\nOriginal class distribution:")
    print(df['class'].value_counts().sort_index())
except FileNotFoundError:
    print(f"Error: File '{INPUT_FILE}' not found. Make sure it is in the same folder.")
    exit()

# --- BALANCING ---
print(f"\nStarting balancing. Goal: {SAMPLES_PER_CLASS} posts for each class...")

# Create an empty DataFrame for the balanced data
df_balanced = pd.DataFrame()

# Iterate through each unique class in the dataset
for post_class in df['class'].unique():
    df_class = df[df['class'] == post_class]

    # Apply Undersampling or Oversampling
    df_class_resampled = resample(
        df_class,
        replace=len(df_class) < SAMPLES_PER_CLASS, # replace=True for Oversampling (increase)
        n_samples=SAMPLES_PER_CLASS,
        random_state=RANDOM_STATE
    )

    # Add the processed class to the main DataFrame
    df_balanced = pd.concat([df_balanced, df_class_resampled])

print("\nBalancing complete!")

# --- RESULT ---
print("\nNew class distribution:")
print(df_balanced['class'].value_counts().sort_index())

# Shuffle the final dataset so that the classes are not in order
df_balanced = df_balanced.sample(frac=1, random_state=RANDOM_STATE).reset_index(drop=True)

# Save the balanced dataset to a new Excel file
df_balanced.to_excel(OUTPUT_FILE, index=False)

print(f"\nBalanced dataset saved to file '{OUTPUT_FILE}'.")
print(f"Final dataset size: {len(df_balanced)} rows.")

File 'vk_posts_classified.xlsx' successfully loaded.

Original class distribution:
class
0    2864
1     209
2    1616
3      10
4      11
Name: count, dtype: int64

Starting balancing. Goal: 500 posts for each class...

Balancing complete!

New class distribution:
class
0    500
1    500
2    500
3    500
4    500
Name: count, dtype: int64

Balanced dataset saved to file 'vk_posts_balanced.xlsx'.
Final dataset size: 2500 rows.
