In [79]:
import pandas
from elasticsearch import Elasticsearch
import time
from tqdm import tqdm
from datetime import datetime, timedelta

In [None]:
elastic_address = ""
try:
    es = Elasticsearch(elastic_address)
    print("Connected to Elasticsearch: %s", es.ping())
except Exception as error:
    print("Elasticsearch connection error: %s", error)

In [None]:
import json

def get_posts_from_elastic(es, index_name, subject, sentiment_label, start_time):
    """Fetches posts from Elasticsearch, paginated by search_after."""
    
    print("Determining correct users for each subject...")

    search_after = None
    project_name = f"projects.{subject}.label.keyword"  # Use .keyword for exact match if it's a text field
    print(project_name, sentiment_label)
    
    # Ensure start_time is correctly formatted
    start_time = datetime.strptime(start_time, '%Y-%m-%dT%H:%M:%S.%fZ').strftime('%Y-%m-%d')

    batch_number = 1
    dashboard_data = []

    while True:
        print(f"Batch Number: {batch_number}")
        must_clauses = [
            {"range": {"published_at": {"gt": start_time}}},
            {"term": {project_name: sentiment_label}},
        ]
        
        query = {
            "size": 1000,
            "track_total_hits": True,
            "_source": ['text', 'published_at', project_name],
            "query": {
                "bool": {
                    "must": must_clauses,
                }
            },
            "sort": [{'published_at': {"order": "desc"}}]
        }
        

        if search_after:
            query["search_after"] = search_after

        try:
            response = es.search(index=index_name, body=query)
        except Exception as e:
            print(f'Exception in get_posts_from_elastic: {e}')
            time.sleep(10)
            continue

        documents = response['hits']['hits']
        total_hits = response['hits']['total']['value']
        print(f"Found {total_hits} posts")

        if not documents:
            break

        for doc in tqdm(documents, desc="Processing documents"):
            source = doc['_source']
            dashboard_data.append([source['text']])

        batch_number += 1
        search_after = documents[-1]['sort']

    if len(dashboard_data):
        return pd.DataFrame(dashboard_data, columns=["Text"])
    else:
        return None

def convert_to_iso_format(date_str):
    dt = datetime.strptime(date_str, "%Y/%m/%d %H:%M")
    # Convert to ISO 8601 format '2024-09-06T15:14:00.000Z'
    return dt.strftime("%Y-%m-%dT%H:%M:%S.%f")[:-3] + 'Z'


current_datetime = datetime.now() #- timedelta(days=50)
yesterday_datetime = current_datetime - timedelta(days=5)
yesterday_datetime_str = yesterday_datetime.strftime("%Y/%m/%d %H:%M")
current_datetime_str = current_datetime.strftime("%Y/%m/%d %H:%M")
s_time = convert_to_iso_format(yesterday_datetime_str)

subject = 'Emotion'
index_name = 'tweet_index'
sentiment_label_1 = 'FEAR'
sentiment_label_2 = 'SURPRISE'
dataset_df = get_posts_from_elastic(es, index_name, subject, sentiment_label_1, s_time)
print(dataset_df.shape)

In [None]:
import requests
import pandas as pd
import csv
import ast

def deep_seek_sentiment_labeling(text, deep_seek_url=""):
    """Label the sentiment of the text based on predefined rules."""
    
    url = deep_seek_url

    headers = {
        'Content-Type': 'application/json'
    }

    prompt = f'''
    You are an expert in sentiment analysis and emotion classification.
    Your task is to analyze the following text and return a vector of length 6,
    where each element corresponds to one of the following sentiments: "anger", "disgust", "fear", "joy", "sadness", and "surprise".

    The vector should follow these rules:
    1. There can be a maximum of two sentiments in each sentence.
    2. For each sentiment, label it as 1 if it is present in the text, and 0 if it is not.
    3. The emotions to classify are: "anger", "disgust", "fear", "joy", "sadness", "surprise".

    Examples:
    Example 1: Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እያቃጠላቹሁ አሁን በዚህ አዲስ ባዲራ አገሯን ታባላላላቹሁ ለጊዜው ፈጩ ችግር የለም ግን ,? 
    Emotion vector: [1, 1, 0, 0, 0, 0]
    
    Example 2: የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል ስትል ደስ ይለኛል 
    Emotion vector: [0, 1, 0, 1, 0, 0]

    Example 3: ኢሄ ፕራንክ ሚባል ነገር ቅጥ አጣ አሁንስ  
    Emotion vector: [1, 1, 0, 0, 0, 0]
    
    Example 4: መንጌ ጠላቱ ባለጌ እሱ ግን ይኖራል ለኢትዮጵያ ሁል ጊ ዜ
    Emotion vector: [0, 0, 0, 0, 0, 0]
    
    Example 5: ልጆችን ለሚዲያ መሸቀጫ ማዋል የሰው አይን እንዲገቡ ፀፀቱ እንዲጎዳን ያደርጋል ያሳዝናል ነገሩ ከዚህ ልንማር ይገባል ነፍስ ይማር
    Emotion vector: [0, 0, 0, 0, 0, 1]
    
    Example 6:ፍቅረኛውን የገደለው አሜሪካዊ ታዳኝ ከኬንያ የፖሊስ ቁጥጥር ስር አመለጠ
    Emotion vector: [0, 0, 0, 0, 0, 0]
    
    Example 7:አለም እስካሁን በአገኘው አጋጣሚ ሁሉ እስራኤላውያንን በእጅጉ በድሏል አሁን ላይ ግን ያ አይቻልም እስራኤልን መንካት ምን ያክል ዋጋ እንደሚያስከፍል ሁሉም ቅጣቱን ይቀምሳል እስራኤል ወደ ፊት
    Emotion vector: [1, 0, 0, 0, 0, 0]
    
    For example, a text that expresses both fear and surprise might return something like:  
    Emotion vector: [1, 0, 0, 0, 1, 0] (fear and surprise are present, others are not).

    Now, analyze the following text and provide **only the emotion vector** (a list of 6 numbers) with no additional explanation or text:
    Text: {text}
    Emotion vector: 
    '''
    
    data = {
        "model": "",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0,  
        "max_tokens": 40 
    }
    
    response = requests.post(url, headers=headers, json=data)

    try:
        label = response.json()['choices'][0]['message']['content']
    except KeyError:
        label = "Error"

    print("text:", text)
    print("label:", label)
    return label


def label_texts_and_save(df, output_file):
    """Iterates over the DataFrame, labels each text, and saves each row to a CSV file."""
    
    with open(output_file, mode='a', newline='', encoding='utf-8') as file:
        writer = csv.writer(file)
        writer.writerow(["text", "anger", "disgust", "fear", "joy", "sadness", "surprise"])  # Write header if needed
    
        for index, row in df.iterrows():
            try:
                text = row['tweet']
                label_str = deep_seek_sentiment_labeling(text)
                
                # Convert string representation of list to an actual list
                label_list = ast.literal_eval(label_str.split(': ')[1]) if ':' in label_str else ast.literal_eval(label_str)
                
                writer.writerow([text] + label_list)  # Save each row immediately
                print(f"Processed row {index + 1}/{len(df)}")

            except ConnectionError as e:
                print(f"ConnectionError encountered at row {index}: {e}. Sleeping for 5 minutes.")
                time.sleep(300)  # Sleep for 5 minutes
                continue 

            except Exception as e:
                print(f"Unexpected error encountered at row {index}: {e}.")
                
amharic_tweet_df = pd.read_csv(r"train.csv")

output_path = r"repo_amharic_train.csv"
label_texts_and_save(amharic_tweet_df[:2000], output_path)

In [4]:
def clean_df(df):
    df = df.dropna()
    df = df.drop_duplicates(subset='text', keep="first")
    df = df.reset_index(drop=True)
    return df

lab_deepseek_df = clean_df(lab_deepseek_df)
lab_deepseek_df.shape

In [6]:
import re
def clean_text(row, options):
    if options['lowercase']:
        row = row.lower()

    if options['remove_url']:
        row = re.sub(r"(?:\@|https?\://)\S+", "", row)

    if options['remove_mentions']:
        row = re.sub("@[A-Za-z0-9_]+","", row)

    if options['demojify']:
          emoj = re.compile("["
            u"\U0001F600-\U0001F64F"  # emoticons
            u"\U0001F300-\U0001F5FF"  # symbols & pictographs
            u"\U0001F680-\U0001F6FF"  # transport & map symbols
            u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
            u"\U00002500-\U00002BEF"  # chinese char
            u"\U00002702-\U000027B0"
            u"\U00002702-\U000027B0"
            u"\U000024C2-\U0001F251"
            u"\U0001f926-\U0001f937"
            u"\U00010000-\U0010ffff"
            u"\u2640-\u2642" 
            u"\u2600-\u2B55"
            u"\u200d"
            u"\u23cf"
            u"\u23e9"
            u"\u231a"
            u"\ufe0f"  # dingbats
            u"\u3030"
                          "]+", re.UNICODE)
    row = re.sub(emoj, '', row)
    return row

clean_config = {
    'remove_url': True,
    'remove_mentions': True,
    'lowercase': True,
    'demojify': True
    }

lab_deepseek_df['text'] = lab_deepseek_df['text'].apply(clean_text, args=(clean_config,))
lab_deepseek_df.head()

(1750, 7)

In [111]:
import pandas as pd
import ast

# Read the CSV file into a DataFrame
df = pd.read_csv(r"semivalcreate_dataset\fear.csv")

# Remove any leading or trailing spaces or unwanted characters in the 'Label' column
df['Label'] = df['Label'].str.strip()

# Optionally, clean up malformed strings (if you know the pattern)
df['Label'] = df['Label'].apply(lambda x: x if x.startswith('[') and x.endswith(']') else '[]')

# Now apply ast.literal_eval to safely convert string to a list
df['Label'] = df['Label'].apply(ast.literal_eval)

# Emotion column names
emotion_columns = ["fear", "disgust", "joy", "sadness", "surprise", "anger"]

# Expand the 'Label' column into separate emotion columns
emotion_df = pd.DataFrame(df['Label'].tolist(), columns=emotion_columns)

# Concatenate the new emotion columns with the original DataFrame (if needed)
df = pd.concat([df, emotion_df], axis=1)

# Drop the original 'Label' column if no longer needed
df = df.drop(columns=['Label'])

# Save the DataFrame with the new columns to a CSV file
df.to_csv("output.csv", index=False)

print("Data saved to output.csv")


Data saved to output.csv


In [None]:
import pandas as pd

sentiment_columns = ["joy", "anger", "fear", "disgust", "sadness", "surprise"]

def filter_dataframe(df):
    # Iterate through each row
    df = df[(df['Text'].apply(len) > 80) & (df['Text'].apply(len) < 300)]
    
    for col in sentiment_columns:
        df[col] = df[col].astype(int)

    for col in sentiment_columns:
        df = df[df[col] <= 1] 

    return df

df = filter_dataframe(df)

print(df.shape)

df.to_csv("filtered_semival_train_output.csv", encoding = 'utf-8-sig', index=False)


In [None]:
import requests
import pandas as pd
import time
from tqdm import tqdm

def deep_seek_sentiment_labeling(text, deep_seek_urls=""):
    """Label the sentiment of the text based on predefined rules."""
    
    url = deep_seek_urls

    headers = {
        'Content-Type': 'application/json'
    }

    prompt = f'''
    You are an expert translator specializing in converting text from Persian, English, and French into Amharic. Your task is to accurately translate the following text into Amharic while following these guidelines:

    1. Do not translate or include any text containing hashtags (#) or URLs; remove them from the output.
    2. Ensure that the translation maintains the original meaning and context.

    Text: {text}  
    Translated Amharic Text:
    '''

    data = {
        "model": "",
        "messages": [
            {"role": "user", "content": prompt}
        ],
        "temperature": 0,  
        "max_tokens": 600 
    }
    
    response = requests.post(url, headers=headers, json=data)

    try:
        label = response.json()['choices'][0]['message']['content']
    except (KeyError, IndexError):
        label = "Error"

    return label

def label_texts_and_save(amh_df, save_path):
    """Iterates over the DataFrame and applies the labeling function to each text,
    saving the result incrementally after each row."""
    
    with open(save_path, mode='w', encoding='utf-8-sig', newline='') as f:
        amh_df.iloc[0:0].to_csv(f, index=False) 

    for index, row in tqdm(amh_df.iterrows(), total=len(amh_df), desc="Processing"):
        try:
            translated_text = deep_seek_sentiment_labeling(row['Text'])
            new_row = row.to_dict()
            new_row['translated_amhric'] = translated_text
            pd.DataFrame([new_row]).to_csv(save_path, mode='a', header=False, index=False, encoding='utf-8-sig')

        except ConnectionError as e:
            print(f"ConnectionError encountered at row {index}: {e}. Sleeping for 5 minutes.")
            time.sleep(300)  # Sleep for 5 minutes
            continue 
            
        except Exception as e:
            print(f"Unexpected error encountered at row {index}: {e}.")            
    return amh_df

# Load your DataFrame
save_path = r"amharic_deepseek_fear.csv"
df = pd.read_csv("filtered_semival_train_output.csv")
print(df.shape)
amh_df = label_texts_and_save(df[330:], save_path)
amh_df.to_csv(r"amharic_deepseek_fear.csv")

In [11]:
df_conf = pd.read_csv(r"amh_train_translated.csv")

In [12]:
df_conf.head()

Unnamed: 0.1,Unnamed: 0,id,text,anger,disgust,fear,joy,sadness,surprise,translated_text
0,0,amh_train_track_a_00001,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።,1,1,0,0,0,0,شما هرگز یاد نمی‌گیرید، ای سرزمین وحشیان.
1,1,amh_train_track_a_00002,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...,1,1,0,0,0,0,چرا سکوت کردید در برابر مرگ دکتر اسرائیل تلاون...
2,2,amh_train_track_a_00003,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...,0,0,0,0,0,0,همچنین، به دلیل اینکه این کار مردم آدیس آبابا ...
3,3,amh_train_track_a_00004,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...,1,1,0,0,0,0,Etv شما نژادپرستان و خائنین، چندین پدران ما را...
4,4,amh_train_track_a_00005,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...,0,1,0,1,0,0,امروز بازی جذاب خواهد بود، ای فیلیپس، وقتی این...


In [50]:
main_df = main_df.rename(columns = {'text':'Text'})
main_df.columns

Index(['Unnamed: 0', 'translated_text', 'fear', 'disgust', 'joy', 'sadness',
       'surprise', 'anger', 'Text'],
      dtype='object')

In [32]:
df_conf = df_conf.drop(columns = ['id'])

In [33]:
final_df = pd.concat([df_conf, main_df])

final_df.shape


(4636, 10)

In [34]:
final_df.to_csv(r"amharic_deepseek_final_train_df.csv", encoding='utf-8-sig')

In [35]:
final_df['fear'].value_counts()

0    3729
1     907
Name: fear, dtype: int64

In [36]:
final_df['surprise'].value_counts()

0    4298
1     338
Name: surprise, dtype: int64

In [37]:
final_df.columns

Index(['Unnamed: 0', 'text', 'anger', 'disgust', 'fear', 'joy', 'sadness',
       'surprise', 'translated_text', 'Text'],
      dtype='object')

In [61]:
main_df.columns

Index(['Unnamed: 0', 'translated_text', 'fear', 'disgust', 'joy', 'sadness',
       'surprise', 'anger', 'Text'],
      dtype='object')

In [14]:
df_conf = df_conf.drop(columns = ['id'])

In [17]:
df_conf = df_conf[['Unnamed: 0', 'translated_text', 'fear', 'disgust', 'joy', 'sadness', 
                   'surprise', 'anger', 'text']]

df_conf.head()

Unnamed: 0.1,Unnamed: 0,translated_text,fear,disgust,joy,sadness,surprise,anger,text
0,0,شما هرگز یاد نمی‌گیرید، ای سرزمین وحشیان.,0,1,0,0,0,1,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።
1,1,چرا سکوت کردید در برابر مرگ دکتر اسرائیل تلاون...,0,1,0,0,0,1,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...
2,2,همچنین، به دلیل اینکه این کار مردم آدیس آبابا ...,0,0,0,0,0,0,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...
3,3,Etv شما نژادپرستان و خائنین، چندین پدران ما را...,0,1,0,0,0,1,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...
4,4,امروز بازی جذاب خواهد بود، ای فیلیپس، وقتی این...,0,1,1,0,0,0,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...


In [67]:
final_df = pd.concat([df_conf, main_df])

final_df.shape

(4636, 9)

In [70]:
final_df = final_df.rename(columns = {'Text':'text'})
final_df.columns

Index(['Unnamed: 0', 'translated_text', 'fear', 'disgust', 'joy', 'sadness',
       'surprise', 'anger', 'text'],
      dtype='object')

In [71]:
final_df.to_csv(r"amharic_deepseek_final_train_df.csv", encoding='utf-8-sig')

In [17]:
lab_deepseek_df = lab_deepseek_df.rename(columns = {'text':'Text'})

In [9]:
lab_deepseek_df.head()

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise
0,በሞጣ በተከሰተው ነገር አልገረምም እስካሁን ከበቂ በላይ ለፍልፌ ነበር ሕ...,0,1,0,0,0,0
1,ሰላም ለምድራችን ይሁን!\nአንዳንዴ ዝም በማለትና ወሬ ባለማራገብ የሀገራ...,0,0,0,1,0,0
2,"""could you beloved"" የሚለውን የቦብ ማርሊይ ዘፈንን ""ፒቺንኪ...",0,1,0,1,0,0
3,#«የእኔንም የሕይወት ታሪክ አንዳንድ ሰዎች ባለማወቅና በመሳሳት ፡ ወይም...,0,1,0,0,1,0
4,የከተማ ወጣት ሰብስብክ መፎከር እኔም አላንስም እስኪ መንግስት ምሰል በ...,0,0,0,0,1,0


In [20]:
df_conf = pd.read_csv(r"amh_train_translated.csv")
df_conf.columns


Index(['Unnamed: 0', 'id', 'text', 'anger', 'disgust', 'fear', 'joy',
       'sadness', 'surprise', 'translated_text'],
      dtype='object')

In [23]:
df_conf = df_conf[['Text', 'anger', 'disgust', 'fear', , 'joy', 'sadness', 
                   'surprise']]
df_conf = df_conf.rename(columns = {'text':'Text'})
df_conf.head()

Unnamed: 0.1,Unnamed: 0,translated_text,fear,disgust,joy,sadness,surprise,anger,Text
0,0,شما هرگز یاد نمی‌گیرید، ای سرزمین وحشیان.,0,1,0,0,0,1,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።
1,1,چرا سکوت کردید در برابر مرگ دکتر اسرائیل تلاون...,0,1,0,0,0,1,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...
2,2,همچنین، به دلیل اینکه این کار مردم آدیس آبابا ...,0,0,0,0,0,0,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...
3,3,Etv شما نژادپرستان و خائنین، چندین پدران ما را...,0,1,0,0,0,1,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...
4,4,امروز بازی جذاب خواهد بود، ای فیلیپس، وقتی این...,0,1,1,0,0,0,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...


In [18]:
df_conf = df_conf.drop(columns = ['Unnamed: 0', 'translated_text'])

In [20]:
df_conf = df_conf[['text', 'anger', 'disgust', 'fear','joy', 'sadness', 
                   'surprise']]
df_conf.head()

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise
0,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።,1,1,0,0,0,0
1,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...,1,1,0,0,0,0
2,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...,0,0,0,0,0,0
3,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...,1,1,0,0,0,0
4,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...,0,1,0,1,0,0


In [34]:
main_amharic_df = pd.concat([df_conf, lab_deepseek_df])
main_amharic_df.to_csv(r"amharic_fina_1000.csv", encoding = 'utf-8-sig')

In [44]:
main_amharic_df.head()

Unnamed: 0,Text,anger,disgust,fear,joy,sadness,surprise
0,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።,1,1,0,0,0,0
1,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...,1,1,0,0,0,0
2,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...,0,0,0,0,0,0
3,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...,1,1,0,0,0,0
4,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...,0,1,0,1,0,0


In [40]:
main_amharic_df['fear'].value_counts()

0    4245
1     285
Name: fear, dtype: int64

In [39]:
main_amharic_df['surprise'].value_counts()

0    4327
1     203
Name: surprise, dtype: int64

In [41]:
main_amharic_df['joy'].value_counts()

0    3578
1     952
Name: joy, dtype: int64

In [42]:
main_amharic_df['sadness'].value_counts()

0    3587
1     943
Name: sadness, dtype: int64

In [24]:
import pandas as pd

sentiment_columns = ["joy", "anger", "fear", "disgust", "sadness", "surprise"]

def filter_dataframe(df):
    # Iterate through each row
    df = df[(df['text'].apply(len) > 10) & (df['text'].apply(len) < 600)]
    
    for col in sentiment_columns:
        df[col] = df[col].astype(int)

    for col in sentiment_columns:
        df = df[df[col] <= 1] 

    return df

lab_deepseek_df = filter_dataframe(lab_deepseek_df)

print(lab_deepseek_df.shape)

# main_amharic_df.to_csv(r"C:\Users\test12\Desktop\semivalcreate_dataset\amharic_fina_2000.csv", encoding = 'utf-8-sig')

(1717, 7)


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(int)


In [21]:
df_conf.head()

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise
0,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።,1,1,0,0,0,0
1,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...,1,1,0,0,0,0
2,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...,0,0,0,0,0,0
3,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...,1,1,0,0,0,0
4,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...,0,1,0,1,0,0


In [None]:
lab_deepseek_df.head()

In [26]:
lab_deepseek_df['joy'].value_counts()

0    1028
1     689
Name: joy, dtype: int64

In [27]:
surprise_rows = lab_deepseek_df[lab_deepseek_df['surprise'] == 1]
fear_rows = lab_deepseek_df[lab_deepseek_df['fear'] == 1]
joy_rows = lab_deepseek_df[lab_deepseek_df['joy'] == 1]

In [28]:
all_sent_df = pd.concat([surprise_rows, fear_rows, joy_rows])
all_sent_df.shape

(1060, 7)

In [29]:
amh_df = pd.read_csv(r"repo_amharic_train_1.csv")
amh_df['fear'].value_counts()

0    1001
1     178
Name: fear, dtype: int64

In [31]:
surprise_rows = amh_df[amh_df['surprise'] == 1]
fear_rows = amh_df[amh_df['fear'] == 1]
joy_rows = amh_df[amh_df['joy'] == 1]

all_old_sent_df = pd.concat([surprise_rows, fear_rows, joy_rows])
all_old_sent_df.shape

(643, 7)

In [32]:
all_sentiment_df = pd.concat([all_old_sent_df, all_sent_df])
all_sentiment_df.shape

(1703, 7)

In [33]:
all_sentiment_df.head()

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise
7,@fitih11 ይቺን አውቆ ነው ዶ/ር አብይ ክርስቲያን የሆነዋ..........,0,1,0,0,0,1
47,@heyab_ በጣም ስውር ህዝብም ጭምር:: እኔ የሰዉ ሂሉን እሺ ብሎ መቻ...,0,0,1,0,0,1
51,ከዓፋር ወጣቶች የተላለፈው አስቸኳይ መልእክት!\n\nየዓፋር ወጣቶች የአማ...,0,0,1,0,0,1
125,የአዋሽ ባንክ የባለአክሲዮኖች 24ኛ መደበኛና 16ኛ ድንገተኛ ጠቅላላ ጉባ...,0,0,0,1,0,1
145,@tiletsige27 አትሌት ኃይሌ በጣም ጠቃሚ ነገሮችን በሃገራችን እየሰ...,0,0,0,1,0,1


In [34]:

clean_config = {
    'remove_url': True,
    'remove_mentions': True,
    'lowercase': True,
    'demojify': True
    }

all_sentiment_df['text'] = all_sentiment_df['text'].apply(clean_text, args=(clean_config,))
all_sentiment_df.head()

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise
7,ይቺን አውቆ ነው ዶ/ር አብይ ክርስቲያን የሆነዋ.......ኧረ ጀንፈሉ ...,0,1,0,0,0,1
47,በጣም ስውር ህዝብም ጭምር:: እኔ የሰዉ ሂሉን እሺ ብሎ መቻል በጣም ነ...,0,0,1,0,0,1
51,ከዓፋር ወጣቶች የተላለፈው አስቸኳይ መልእክት!\n\nየዓፋር ወጣቶች የአማ...,0,0,1,0,0,1
125,የአዋሽ ባንክ የባለአክሲዮኖች 24ኛ መደበኛና 16ኛ ድንገተኛ ጠቅላላ ጉባ...,0,0,0,1,0,1
145,አትሌት ኃይሌ በጣም ጠቃሚ ነገሮችን በሃገራችን እየሰራ ነው እጅግ ሊደነ...,0,0,0,1,0,1


In [35]:
df_conf.head()

Unnamed: 0,text,anger,disgust,fear,joy,sadness,surprise
0,እናንተ መቸም አትማሩም ምድረ አውሬ ሁላ።,1,1,0,0,0,0
1,ምነው የዶክተር እስራኤል ጥላሁን ሞት በዝምታ አለፋችሁት ባለስልጣን ስለሆ...,1,1,0,0,0,0
2,እንዲሁም ይህ እጅግ እርሶን የሚወዶት እና እጅግም የሚሳሳልዎን የአዲስአበ...,0,0,0,0,0,0
3,Etv ዘረኛ ናቹሁ አሽቃባጭ ስንት አባቶቻችን መስዋት የሆኑበትን ባዲራ እ...,1,1,0,0,0,0
4,የዛሬው የጦጣ ንክሻ ይለያል ክክ አይ ፊሊፖስ አንተ ጅማታም እንድህ ቅጥል...,0,1,0,1,0,0


In [37]:
finalllll_df = pd.concat([df_conf, all_sentiment_df])
finalllll_df.shape

(5251, 7)

In [None]:

finalllll_df = filter_dataframe(finalllll_df)

print(finalllll_df.shape)

In [46]:
finalllll_df['surprise'].value_counts()

0    4874
1     367
Name: surprise, dtype: int64

In [45]:
finalllll_df.to_csv(r"finall_amharic_dff.csv", encoding='utf-8-sig')

In [75]:
surprise_rows = finalllll_df[finalllll_df['surprise'] == 1]
fear_rows = finalllll_df[finalllll_df['fear'] == 1]
joy_rows = finalllll_df[finalllll_df['joy'] == 1][:800]
anger_rows = finalllll_df[finalllll_df['anger'] == 1][:500]
disgust_rows = finalllll_df[finalllll_df['disgust'] == 1][:500]
sadness_rows = finalllll_df[finalllll_df['sadness'] == 1][:500]

all_sentiment_df = pd.concat([sadness_rows, surprise_rows, fear_rows, joy_rows, anger_rows, disgust_rows])
all_sentiment_df.shape

(3317, 7)

In [78]:
print(all_sentiment_df['sadness'].value_counts())
print(all_sentiment_df['anger'].value_counts())
print(all_sentiment_df['joy'].value_counts())
print(all_sentiment_df['disgust'].value_counts())
print(all_sentiment_df['surprise'].value_counts())
print(all_sentiment_df['fear'].value_counts())

0    2450
1     867
Name: sadness, dtype: int64
0    2352
1     965
Name: anger, dtype: int64
0    2339
1     978
Name: joy, dtype: int64
0    2328
1     989
Name: disgust, dtype: int64
0    2740
1     577
Name: surprise, dtype: int64
0    2515
1     802
Name: fear, dtype: int64


In [77]:
all_sentiment_df.to_csv(r"finall_amharic_balanced_df.csv", encoding='utf-8-sig')