In [11]:
bad_rows = []
with open("data.csv", "r", encoding="utf-8") as f:
    header = f.readline()
    expected_columns = len(header.strip().split(","))
    for i, line in enumerate(f, start=2):  # start=2 because header is line 1
        if len(line.strip().split(",")) != expected_columns:
            bad_rows.append((i, line.strip()))

print(f"Found {len(bad_rows)} bad rows:")
for row in bad_rows[:5]:  # show only first 5 for brevity
    print(row)


Found 9996 bad rows:
(2, '😊🌞 Feeling great today! Ready to take on new challenges! 💪✨,"[\'😊\', \'🌞\', \'💪\', \'✨\']",2,"The person is feeling happy and cheerful today, basking in the sunshine, and feels motivated and confident to face new challenges.",positive,"[\'emotion\', \'nature\', \'motivation\']",Feeling great today! Ready to take on new challenges!,daily_status')
(3, '😊🌞 Let\'s enjoy the sunny day! 🌳🏖️,"[\'😊\', \'🌞\', \'🌳\', \'🏖️\']",4,"A cheerful person expressing happiness about the sunny weather and looking forward to spending time outdoors in nature, perhaps at the park or beach.",positive,"[\'emotion\', \'nature\', \'activity\']",Let\'s enjoy the sunny day!,daily_status')
(4, 'Had a great day! 😊🌞🎉,"[\'😊\', \'🌞\', \'🎉\']",3,"The person is expressing happiness about their day, mentioning sunshine and celebration.",positive,"[\'emotion\', \'nature\', \'celebration\']",Had a great day!,daily_status')
(5, '😊🌞✨ Have a bright and joyful day! 🌻🌈,"[\'😊\', \'🌞\', \'✨\', \'🌻\', \'🌈\'

# Cleaning for unnecessary "," in rows

In [20]:
import csv
import re

def smart_split_line(line):
    # Start with comma splitting, but only keep commas outside of quotes and brackets
    parts = []
    buffer = ""
    in_quotes = False
    bracket_depth = 0

    for c in line:
        if c == '"' and (not buffer or buffer[-1] != '\\'):
            in_quotes = not in_quotes
        elif c == "[":
            bracket_depth += 1
        elif c == "]":
            bracket_depth -= 1

        if c == "," and not in_quotes and bracket_depth == 0:
            parts.append(buffer)
            buffer = ""
        else:
            buffer += c

    parts.append(buffer)  # last part
    return parts

def clean_commas(text):
    if not text:
        return text
    return text.replace(",", "")  # remove internal commas

input_file = "data.csv"
output_file = "cleaned_strict.csv"

with open(input_file, "r", encoding="utf-8") as infile, open(output_file, "w", newline='', encoding="utf-8") as outfile:
    writer = csv.writer(outfile)
    header_written = False

    for i, line in enumerate(infile):
        line = line.strip()
        if not line:
            continue

        parts = smart_split_line(line)

        if len(parts) != 8:
            print(f"Skipping malformed row {i}: found {len(parts)} fields")
            continue

        # Clean internal commas from specific columns
        parts[0] = clean_commas(parts[0])  # original_message
        parts[3] = clean_commas(parts[3])  # translated_meaning
        parts[6] = clean_commas(parts[6])  # target_text

        if not header_written:
            writer.writerow(["original_message", "emoji_used", "emoji_context_window", "translated_meaning",
                             "sentiment_detected", "emoji_category", "target_text", "message_type"])
            header_written = True

        writer.writerow(parts)

print("✅ Cleaning done. Output written to 'cleaned_strict.csv'")


Skipping malformed row 399: found 15 fields
✅ Cleaning done. Output written to 'cleaned_strict.csv'


# Cleaning Each Emoji Array to be a string

In [34]:
import csv

input_file = "cleaned_strict.csv"
output_file = "cleaned_strict_emojies.csv"

def fix_split_emojis(row):
    new_row = []
    in_emoji_block = False
    emoji_tokens = []

    for item in row:
        # Start of emoji block
        if not in_emoji_block and "[" in item:
            in_emoji_block = True
            emoji_tokens.append(item)
        elif in_emoji_block:
            emoji_tokens.append(item)
            if "]" in item:
                # End of emoji block
                in_emoji_block = False
                combined = ",".join(emoji_tokens)
                emojis_str = combined[combined.index("[")+1:combined.index("]")]
                emojis = [e.strip(" '") for e in emojis_str.split(",")]
                new_row.append("-".join(emojis))
                emoji_tokens = []
        else:
            new_row.append(item)
    
    return new_row

# Process the file
with open(input_file, 'r', encoding='utf-8') as infile, open(output_file, 'w', newline='', encoding='utf-8') as outfile:
    reader = csv.reader(infile)
    writer = csv.writer(outfile)

    header = next(reader)
    writer.writerow(header)

    for row in reader:
        fixed_row = fix_split_emojis(row)
        writer.writerow(fixed_row)

print("✅ Done. Saved fixed rows in 'cleaned_strict_emojies.csv'")


✅ Done. Saved fixed rows in 'cleaned_strict_emojies.csv'


# Checking for data-structure integrity

In [36]:
import csv

input_file = "strict_data.csv"
expected_columns = 8

malformed_rows = []

with open(input_file, 'r', encoding='utf-8') as f:
    reader = csv.reader(f)
    header = next(reader)
    
    for i, row in enumerate(reader, start=2):  # start=2 to count header as line 1
        if len(row) != expected_columns:
            malformed_rows.append((i, len(row), row))

# Report
if malformed_rows:
    print(f"❌ Found {len(malformed_rows)} malformed rows:\n")
    for line_num, col_count, row in malformed_rows[:5]:  # Show first 5 examples
        print(f"  - Line {line_num}: has {col_count} columns → {row}")
    if len(malformed_rows) > 5:
        print(f"\n  ...and {len(malformed_rows) - 5} more rows with incorrect column count.")
else:
    print("✅ All rows match the expected structure of 8 columns.")


✅ All rows match the expected structure of 8 columns.


# Checking if the target_message has correct format


import pandas as pd

# Check if entries in the "target_message" column contain emojis
import emoji
input_file = "final.csv"
df = pd.read_csv(input_file, encoding='utf-8')
# Check for emojis in the 'target_message' column
emoji_lines = []
for i, row in df.iterrows():
    if any(char in row['target_message'] for char in emoji.UNICODE_EMOJI['en']):
        emoji_lines.append(i)

# Report the line numbers
if emoji_lines:
    print(f"Entries with emojis found in the 'target_message' column:")
    for line_num in emoji_lines:
        print(f"  - Line {line_num + 1}")
else:
    print("No entries with emojis found in the 'target_message' column.")
