In [1]:
# Install required libraries
!pip install -q -U google-generativeai
!pip install -q pandas
!pip install -q scikit-learn

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m175.4/175.4 kB[0m [31m4.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m28.2 MB/s[0m eta [36m0:00:00[0m00:01[0m
[?25h

In [2]:
# Import libraries
import json
import logging
import os
import re
import time

import google.generativeai as genai
import numpy as np
import pandas as pd

# Set up logging
logging.basicConfig(level=logging.INFO, format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger(__name__)

# Configure Gemini API.
# The key is read from the environment so it never ends up in the notebook,
# its saved outputs, or version control. The key that was previously hardcoded
# here has been exposed and should be revoked.
gemini_api_key = os.environ.get("GOOGLE_API_KEY") or os.environ.get("GEMINI_API_KEY")
if not gemini_api_key:
    raise RuntimeError(
        "No Gemini API key found. Set the GOOGLE_API_KEY (or GEMINI_API_KEY) "
        "environment variable before running this notebook."
    )
genai.configure(api_key=gemini_api_key)

# Initialize the Gemini model
model = genai.GenerativeModel('gemini-1.5-flash')
logger.info("Gemini model initialized.")

# Define the techniques (the order here is the order of the output label columns)
techniques = [
    "loaded_language", "euphoria", "cherry_picking", "bandwagon", "glittering_generalities",
    "fud", "appeal_to_fear", "cliche", "whataboutism", "straw_man"
]

# Load the training dataset to calculate class priors
train_file_path = "/kaggle/input/unlpdata/NLP - exp(Encoded UTF-8).csv"
train_df = pd.read_csv(train_file_path)
train_df.columns = train_df.columns.str.strip()
# Replace non-breaking spaces inside every string cell. `DataFrame.replace`
# with regex=True substitutes substrings and leaves non-string cells untouched;
# it replaces the deprecated `DataFrame.applymap`.
train_df = train_df.replace('\xa0', ' ', regex=True)
train_df = train_df.dropna(subset=['content', 'techniques'])
train_df = train_df.reset_index(drop=True)

# Calculate class priors: the fraction of training rows whose `techniques`
# value mentions each technique name.
total_samples = len(train_df)
if total_samples == 0:
    raise ValueError("Training set is empty after dropping rows without content/techniques.")
class_counts = {
    # regex=False: technique names are literal substrings, not patterns.
    tech: int(train_df['techniques'].str.contains(tech, na=False, regex=False).sum())
    for tech in techniques
}
class_priors = {tech: count / total_samples for tech, count in class_counts.items()}
logger.info("Class priors (probability of being 1):")
logger.info(class_priors)

  train_df = train_df.applymap(lambda x: x.replace('\xa0', ' ') if isinstance(x, str) else x)


In [3]:
# Load the test dataset
test_file_path = "/kaggle/input/testdatanlp/test-dataunlp.csv"
# Number of data rows the submission must contain.
EXPECTED_TEST_ROWS = 5735

test_df = pd.read_csv(test_file_path)
test_df.columns = test_df.columns.str.strip()
# Replace non-breaking spaces inside every string cell (substring replacement;
# non-string cells are left untouched). Replaces the deprecated `applymap`.
test_df = test_df.replace('\xa0', ' ', regex=True)
test_df = test_df.dropna(subset=['Column2'])
test_df = test_df.reset_index(drop=True)

# Adjust to the expected number of rows
logger.info(f"Initial test set size: {len(test_df)}")
if len(test_df) != EXPECTED_TEST_ROWS:
    logger.warning(
        f"Test set has {len(test_df)} rows, but {EXPECTED_TEST_ROWS} rows are required for submission."
    )
    if len(test_df) < EXPECTED_TEST_ROWS:
        raise ValueError(
            f"Test set has only {len(test_df)} rows after adjustments, "
            f"but {EXPECTED_TEST_ROWS} are required."
        )
    # Too many rows: drop the first one.
    # NOTE(review): presumably the first row is a header line read as data — confirm.
    test_df = test_df.iloc[1:].reset_index(drop=True)
    if len(test_df) > EXPECTED_TEST_ROWS:
        # Still too many: keep a fixed-seed random subset of the required size.
        test_df = test_df.sample(n=EXPECTED_TEST_ROWS, random_state=42).reset_index(drop=True)
    logger.info(f"Adjusted test set to {EXPECTED_TEST_ROWS} rows.")
logger.info(f"Final test set size: {len(test_df)}")
if len(test_df) != EXPECTED_TEST_ROWS:
    raise ValueError(f"Test set has {len(test_df)} rows, but {EXPECTED_TEST_ROWS} are required.")

  test_df = test_df.applymap(lambda x: x.replace('\xa0', ' ') if isinstance(x, str) else x)


In [4]:
# Preprocessing function
def preprocessing(text):
    """Return ``text`` as a string with whitespace normalized.

    The value is converted with ``str()``, every run of whitespace is collapsed
    into a single space, and leading/trailing whitespace is removed.
    """
    trimmed = str(text).strip()
    collapsed = re.sub(r'\s+', ' ', trimmed)
    return collapsed.strip()

# Normalize whitespace in every test text
test_df['Column2'] = test_df['Column2'].map(preprocessing)
logger.info("Test Dataframe Information:")
test_df.info()

# Prepare test set for prediction: a copy of the test frame with an empty
# `pred_label` column for the predicted labels
test_set_sample = test_df.assign(pred_label='')

# Create batches: consecutive row slices of at most `batch_size` rows
batch_size = 20
batches = [
    test_set_sample[start:start + batch_size]
    for start in range(0, len(test_set_sample), batch_size)
]
logger.info(f"Number of batches: {len(batches)}")

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5735 entries, 0 to 5734
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Column1  5735 non-null   object
 1   Column2  5735 non-null   object
dtypes: object(2)
memory usage: 89.7+ KB


In [5]:
# Helpers and entry point for classifying one batch with Gemini (zero-shot)
def _default_predictions_json(batch, n_labels=10):
    """Return all-zero fallback predictions for ``batch`` as a JSON records string.

    The caller's ``batch`` is not modified; a new frame is built instead.
    """
    fallback = batch[['Column2']].assign(pred_label=" ".join(["0"] * n_labels))
    return fallback.to_json(orient='records', force_ascii=False)


def _normalize_predictions(cleaned_json, texts, n_labels=10):
    """Validate the model's JSON answer and return it as a normalized JSON string.

    Raises ValueError (or json.JSONDecodeError) unless the answer is a list with
    exactly one record per input text, each carrying a ``pred_label`` made of
    ``n_labels`` space-separated 0/1 tokens. The returned records reuse the
    original input texts, so the output stays row-aligned with the batch even
    if the model altered the echoed text.
    """
    records = json.loads(cleaned_json)
    if not isinstance(records, list) or len(records) != len(texts):
        got = len(records) if isinstance(records, list) else type(records).__name__
        raise ValueError(f"expected a JSON list of {len(texts)} records, got {got}")

    normalized = []
    for position, (text, record) in enumerate(zip(texts, records)):
        label = record.get('pred_label') if isinstance(record, dict) else None
        tokens = str(label).split()
        if len(tokens) != n_labels or any(tok not in ('0', '1') for tok in tokens):
            raise ValueError(f"record {position} has an invalid pred_label: {label!r}")
        normalized.append({'Column2': text, 'pred_label': ' '.join(tokens)})
    return json.dumps(normalized, ensure_ascii=False)


def gemini_completion_function(batch, current_batch, total_batch, model, max_retries=3):
    """Classify one batch of texts with the Gemini model and return JSON records.

    Args:
        batch: DataFrame with 'Column2' (text) and 'pred_label' columns.
        current_batch: zero-based index of this batch (used for logging).
        total_batch: total number of batches (used for logging).
        model: object exposing ``generate_content(prompt)`` whose result has a
            ``.text`` attribute.
        max_retries: number of attempts before falling back to defaults.

    Returns:
        A JSON string holding one ``{"Column2", "pred_label"}`` record per row
        of ``batch``. If every attempt fails (or ``max_retries`` is not
        positive), every ``pred_label`` is the all-zero default.
    """
    # Same logger object as the notebook-level `logger`, which is also created
    # with logging.getLogger(__name__).
    log = logging.getLogger(__name__)
    log.info(f"Now processing batch #{current_batch + 1} of {total_batch}")

    texts = batch['Column2'].tolist()
    json_data = batch[['Column2', 'pred_label']].to_json(orient='records', force_ascii=False)

    prompt = f"""
    You are an expert in text analysis and propaganda detection, specializing in Ukrainian and Russian language contexts. Your task is to classify whether each text sample contains specific propaganda techniques. The text samples are primarily in Ukrainian, and they may contain emotionally charged language, cultural references, or wartime rhetoric. Your goal is to identify the presence of the following techniques: loaded_language, euphoria, cherry_picking, bandwagon, glittering_generalities, fud, appeal_to_fear, cliche, whataboutism, straw_man. For each sample, output a list of 10 binary labels (0 or 1) corresponding to the presence of each technique in the order listed above. A text can contain multiple techniques, so carefully analyze the content for all possibilities.

    ### Definitions and Examples of Techniques
    - **loaded_language**: Using emotionally charged words to influence the audience.
      Example: "Военкомы продолжают паковать украинцев в микроавтобусы, пока нардепы только хотят рассмотреть нарушения в мобилизационном треке." (Transliteration: "Voenkomy prodolzhayut pakovat' ukraintsev v mikroavtobusy...") This uses "паковать украинцев" (packing Ukrainians) to evoke a strong emotional response.
    - **euphoria**: Highlighting positive events to boost morale and create optimism.
      Example: "Наши влупили на 3 направлениях, 3 переправы организовали – все успешные." (Transliteration: "Nashi vlupili na 3 napravleniyakh...") This creates a celebratory tone about military success.
    - **cherry_picking**: Selectively presenting data to support a claim while ignoring counterarguments.
      Example: "По нашим данным военкомы Одесской, Полтавской... почти перевыполняют план по мобилизации." (Transliteration: "Po nashim dannym voenkomy Odesskoy, Poltavskoy...") This highlights over-fulfillment in some areas while ignoring under-fulfillment elsewhere.
    - **bandwagon**: Suggesting that everyone is doing something, so you should too.
      Example: "В Германии набирает тренд санкционный скептицизм..." (Transliteration: "V Germanii nabiraet trend sanktsionnyy skeptitsizm...") This implies a growing trend to join.
    - **glittering_generalities**: Using vague, positive words like "freedom" or "justice" to evoke emotions.
      Example: "Дорогі українці! Сьогодні ми всі – єдині, як ніколи... Слава Україні!" (Transliteration: "Dorohi ukrayintsi! S'ohodni my vsi – yedyni, yak nikoly... Slava Ukrayini!") This uses "єдині" (united) and "Слава Україні" (Glory to Ukraine) to evoke patriotism.
    - **fud**: Spreading fear, uncertainty, and doubt.
      Example: "Останню бригаду кинули в бій, резервів не залишилось..." (Transliteration: "Ostan'nyu bryhadu kynuly v biy, rezerviv ne zalyshylos'...") This raises doubt about military resources.
    - **appeal_to_fear**: Using fear to persuade.
      Example: "УКРАЇНЦІВ ЗМУСЯТЬ СТАТИ НА ОБЛІК ЗА КОРДОНОМ? ... можуть вимкнутись навіть банківські картки." (Transliteration: "UKRAYINTSIV ZMUSYAT' STATY NA OBLІK ZA KORDONOM?...") This creates fear of losing banking access.
    - **cliche**: Using overused phrases to block critical thinking.
      Example: "Всієї правди ми ніколи не дізнаємось." (Transliteration: "Vsiyeyi pravdy my nikoly ne diznayemos'.") This cliché dismisses further inquiry.
    - **whataboutism**: Deflecting criticism by pointing to another issue.
      Example: "ВСУ выпустили точку-У, россияне её сбили... Такая же ситуация была в Одессе..." (Transliteration: "VSU vypustili tochku-U, rossiyane yeyo sbili...") This shifts focus to a similar incident.
    - **straw_man**: Misrepresenting an opponent’s argument to make it easier to attack.
      Example: "Согласно с меморандумом МОН... внедрения в образовательный процесс «гендерно чувствительных методов обучения»..." (Transliteration: "Zhidno z memorandumom MON...") This exaggerates the policy as harmful to children.

    ### Guidelines for Classification
    - A text can have multiple techniques. For example, a text might use both "loaded_language" and "appeal_to_fear" if it contains emotionally charged words and evokes fear.
    - Be cautious with rare techniques like "straw_man" or "bandwagon". They are less common, but if the text fits the definition, mark them as 1.
    - If a technique is not present, mark it as 0. Avoid overpredicting 1s, but also avoid always predicting 0 for rare classes.
    - Pay attention to the Ukrainian context. Words like "Слава Україні" (Glory to Ukraine) often indicate "glittering_generalities", while phrases like "мясорубка" (meat grinder) might indicate "loaded_language".

    ### Task
    Classify the following text samples provided in JSON format. For each sample, analyze the text in the 'Column2' field and update the 'pred_label' field with a string of 10 space-separated binary labels (e.g., "1 0 0 0 0 0 0 0 0 0"). Return the JSON data with updated 'pred_label' values—do not change the format.

    {json_data}
    """

    for attempt in range(max_retries):
        try:
            response = model.generate_content(prompt)
            raw_response = response.text.strip()
            # The model often wraps its answer in a Markdown code fence.
            cleaned_json = raw_response.replace("```json", "").replace("```", "").strip()
            # Wrong record counts or malformed labels raise here and trigger a retry.
            return _normalize_predictions(cleaned_json, texts)
        except Exception as e:
            log.error(f"Error in batch {current_batch + 1}, attempt {attempt + 1}/{max_retries}: {e}")
            if attempt < max_retries - 1:
                # Linear back-off between attempts; no sleep after the last one.
                time.sleep(5 * (attempt + 1))

    log.warning(f"Max retries reached for batch {current_batch + 1}. Using default predictions.")
    return _default_predictions_json(batch)

# Process all batches
# Send every batch to Gemini in order, collecting one JSON string per batch
batch_count = len(batches)
responses = []
for i, batch in enumerate(batches):
    print(f"Starting batch #{i + 1} of {batch_count}")  # progress marker in the cell output
    response = gemini_completion_function(batch, i, batch_count, model)
    responses.append(response)
    # Fixed pause between consecutive requests
    time.sleep(3)
    

Starting batch #1 of 287
Starting batch #2 of 287
Starting batch #3 of 287
Starting batch #4 of 287
Starting batch #5 of 287
Starting batch #6 of 287
Starting batch #7 of 287
Starting batch #8 of 287
Starting batch #9 of 287
Starting batch #10 of 287
Starting batch #11 of 287
Starting batch #12 of 287
Starting batch #13 of 287
Starting batch #14 of 287
Starting batch #15 of 287
Starting batch #16 of 287
Starting batch #17 of 287
Starting batch #18 of 287
Starting batch #19 of 287
Starting batch #20 of 287
Starting batch #21 of 287
Starting batch #22 of 287
Starting batch #23 of 287
Starting batch #24 of 287
Starting batch #25 of 287
Starting batch #26 of 287
Starting batch #27 of 287
Starting batch #28 of 287
Starting batch #29 of 287
Starting batch #30 of 287
Starting batch #31 of 287
Starting batch #32 of 287
Starting batch #33 of 287
Starting batch #34 of 287
Starting batch #35 of 287
Starting batch #36 of 287
Starting batch #37 of 287
Starting batch #38 of 287
Starting batch #39 of

In [None]:
# Combine all responses into a single DataFrame, one row per test sample.
# Every batch must contribute exactly as many rows as it contains; otherwise
# all later rows would be attached to the wrong test samples.
if len(responses) != len(batches):
    raise ValueError(
        f"Only {len(responses)} of {len(batches)} batches have a response; "
        "re-run the batch processing cell to completion before combining."
    )

n_labels = len(techniques)
default_label = " ".join(["0"] * n_labels)

records = []  # collected once and turned into a frame at the end (no concat in a loop)
for idx, (response, batch) in enumerate(zip(responses, batches)):
    expected_rows = len(batch)
    try:
        data = json.loads(response.strip("`"))
        if not isinstance(data, list) or len(data) != expected_rows:
            got = len(data) if isinstance(data, list) else type(data).__name__
            raise ValueError(f"expected {expected_rows} records, got {got}")
    except (ValueError, AttributeError) as e:
        # ValueError covers json.JSONDecodeError; AttributeError covers a
        # response that is not a string. The whole batch falls back to defaults.
        logger.error(f"Unusable response in batch {idx + 1}: {e}")
        records.extend({"Column2": "", "pred_label": default_label} for _ in range(expected_rows))
        continue

    for row in data:
        is_record = isinstance(row, dict)
        tokens = str(row.get("pred_label") if is_record else None).split()
        if len(tokens) != n_labels or any(tok not in ("0", "1") for tok in tokens):
            # A single malformed row falls back to zeros without discarding the batch.
            logger.warning(f"Invalid pred_label in batch {idx + 1}; using default for that row.")
            tokens = ["0"] * n_labels
        records.append({
            "Column2": row.get("Column2", "") if is_record else "",
            "pred_label": " ".join(tokens),
        })

df_total = pd.DataFrame(records, columns=["Column2", "pred_label"])

# Parse the pred_label strings into separate columns
pred_labels = [[int(tok) for tok in label.split()] for label in df_total['pred_label']]
# Reuse the test frame's index so the assignment below is row-aligned by construction.
pred_df = pd.DataFrame(pred_labels, columns=techniques, index=test_set_sample.index)

test_set_sample[techniques] = pred_df

In [7]:
# Post-process predictions using class priors: for each technique, flip randomly
# chosen predictions until the number of 1s matches prior * number of test rows.
# NOTE(review): the rows to flip are picked at random, not by model confidence.
PRIOR_ADJUST_SEED = 42
prior_rng = np.random.default_rng(PRIOR_ADJUST_SEED)  # seeded so the submission is reproducible

missing_predictions = int(test_set_sample[techniques].isna().sum().sum())
if missing_predictions:
    logger.warning(f"{missing_predictions} missing prediction values were treated as 0.")
# fillna before the cast: astype(int) raises on NaN.
adjusted_preds = test_set_sample[techniques].fillna(0).astype(int)

n_test_rows = len(adjusted_preds)
for tech in techniques:
    target_ones = int(class_priors[tech] * n_test_rows)
    current_ones = int(adjusted_preds[tech].sum())
    surplus = current_ones - target_ones
    if surplus == 0:
        continue
    # Too many 1s -> turn some 1s into 0s; too few -> turn some 0s into 1s.
    from_value, to_value = (1, 0) if surplus > 0 else (0, 1)
    candidates = adjusted_preds.index[adjusted_preds[tech] == from_value]
    num_to_flip = min(abs(surplus), len(candidates))
    if num_to_flip > 0:
        flip_indices = prior_rng.choice(candidates, size=num_to_flip, replace=False)
        adjusted_preds.loc[flip_indices, tech] = to_value

# Update test_set_sample with adjusted predictions
test_set_sample[techniques] = adjusted_preds.astype(int)

logger.info("Number of 1s per technique after adjustment:")
logger.info(test_set_sample[techniques].sum())

# Build the submission frame: the identifier column (renamed to `id`) followed
# by one integer 0/1 column per technique
submission = test_set_sample[['Column1']].rename(columns={'Column1': 'id'})
submission[techniques] = test_set_sample[techniques].astype(int)

# The submission must hold exactly 5735 data rows
logger.info(f"Number of data rows in submission (excluding header): {len(submission)}")
if not len(submission) == 5735:
    raise ValueError(f"Submission has {len(submission)} data rows, but 5735 are required.")

# Write the CSV without the index column and log a short preview
submission.to_csv("submission_gemini_improved.csv", index=False)
logger.info("Submission file created: submission_gemini_improved.csv")
logger.info("First few rows of submission:")
preview = submission.head().to_string()
logger.info(preview)