# Data Cleaning on Tweets

In [None]:
import os
import re

import pandas as pd
from datasets import load_dataset



In [None]:
!pip install langdetect


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.2.1[0m[39;49m -> [0m[32;49m24.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


# Topic Dataset

In [None]:
# Load the tweet/topic instruction dataset identified by "AlanYky/tweets_topic_with_instructions"
dataset = load_dataset("AlanYky/tweets_topic_with_instructions")

# Convert the 'train' split to a pandas DataFrame for easier processing
df = dataset['train'].to_pandas()

In [None]:
def print_detection(data):
    """
    Prints a summary of noise indicators found in the 'text' column.

    Parameters:
    data (pd.DataFrame): The DataFrame containing a 'text' column.

    Returns:
    None: The counts are printed, nothing is returned.
    """
    text = data['text']

    # Count the rows of the frame that was passed in, not of the global `df`.
    num_rows = len(data)
    print("Number of rows: ", num_rows)
    print("-----")

    # Rows containing at least one link / mention / hashtag.
    link_count = text.str.contains(r"http\S+").sum()
    mention_count = text.str.contains(r"@\w+").sum()
    hashtag_count = text.str.contains(r"#\w+").sum()
    # Rows with more than 250 whitespace-separated words.
    long_text_count = text.str.split().apply(len).gt(250).sum()

    # Rows with more than one mention / more than two hashtags.
    multiple_mentions_count = text.str.count(r"@\w+").gt(1).sum()
    multiple_hashtags_count = text.str.count(r"#\w+").gt(2).sum()
    # Rows mentioning "click" (any case) are treated as low-quality content.
    low_quality_content_count = text.str.contains("click", case=False).sum()

    print("link_count", link_count)
    print("mention_count", mention_count)
    print("hashtag_count", hashtag_count)
    print("long_text_count", long_text_count)
    print("Texts with more than 1 mention:", multiple_mentions_count)
    print("Texts with more than 2 hashtag:", multiple_hashtags_count)
    print("low_quality_content_count", low_quality_content_count)

In [None]:
print_detection(df)

Number of rows:  22174
-----
link_count 0
mention_count 930
hashtag_count 3494
long_text_count 1
Texts with more than 1 mention: 4
Texts with more than 2 hashtag: 24
low_quality_content_count 14


In [None]:
df

Unnamed: 0,instruction,text,target,__index_level_0__
0,Generate a tweet about FAMU.,School Monday and honestly I've always hated s...,FAMU,0
1,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...,dbh,1
2,Generate a tweet about Madrid.,I wish I had a @user cheki to do that wota thi...,Madrid,2
3,Generate a tweet about Patronato.,"10’ | 0-0 | Good start, Patronato dangerous on...",Patronato,3
4,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...,Torrey Pines,4
...,...,...,...,...
22169,Generate a tweet about Dolph Ziggler.,Need Omos to sell like Dolph Ziggler for the R...,Dolph Ziggler,22171
22170,Generate a tweet about Connor Williams.,"As expected, the entire starting offensive lin...",Connor Williams,22172
22171,Generate a tweet about HRT.,looking at pics of girls 1 year into HRT alrea...,HRT,22173
22172,Generate a tweet about Charlene.,At this wedding and the dj just played Charlen...,Charlene,22174


# Removal Strategies

- Remove tweets containing mentions (@)
- Remove tweets that are too long
- Remove tweets with excessive hashtags
- Remove excessive symbols
- Remove tweets containing links
- Remove excessive emojis
- Remove retweets (RT) and "link in bio" tweets

In [None]:
# remove the data with mention @ (Reduce noise and it does not affect our purpose)
def remove_mentions(dataframe):
    """
    Drops every row whose 'text' value contains an @-mention.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.

    Returns:
    pd.DataFrame: The rows of the input whose text has no '@' followed by word characters.
    """
    # Flag the rows that mention someone, then keep the complement.
    has_mention = dataframe['text'].str.contains(r"@\w+")
    keep_rows = ~has_mention
    return dataframe[keep_rows]

# Apply the function to remove mention data
df = remove_mentions(df)
print_detection(df)

Number of rows:  21244
-----
link_count 0
mention_count 0
hashtag_count 3202
long_text_count 1
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 14
low_quality_content_count 13


In [None]:
def remove_long_texts(dataframe, word_limit=250):
    """
    Drops rows whose text has more whitespace-separated words than allowed.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.
    word_limit (int): The largest word count a text may have and still be kept.

    Returns:
    pd.DataFrame: The rows of the input whose text has at most 'word_limit' words.
    """
    # Word count per row, based on whitespace splitting.
    word_counts = dataframe['text'].str.split().apply(len)
    too_long = word_counts.gt(word_limit)
    return dataframe[~too_long]

df = remove_long_texts(df)
print_detection(df)

Number of rows:  21243
-----
link_count 0
mention_count 0
hashtag_count 3201
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 13
low_quality_content_count 13


In [None]:
def remove_excessive_hashtags(dataframe, hashtag_limit=2):
    """
    Drops rows whose text carries more hashtags than allowed.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.
    hashtag_limit (int): The largest number of hashtags a text may have and still be kept.

    Returns:
    pd.DataFrame: The rows of the input whose text has at most 'hashtag_limit' hashtags.
    """
    # A hashtag is '#' followed by one or more word characters.
    hashtag_counts = dataframe['text'].str.count(r"#\w+")
    over_limit = hashtag_counts.gt(hashtag_limit)
    return dataframe[~over_limit]

df = remove_excessive_hashtags(df)
print_detection(df)

Number of rows:  21230
-----
link_count 0
mention_count 0
hashtag_count 3188
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 13


In [None]:
df

Unnamed: 0,instruction,text,target,__index_level_0__
0,Generate a tweet about FAMU.,School Monday and honestly I've always hated s...,FAMU,0
1,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...,dbh,1
3,Generate a tweet about Patronato.,"10’ | 0-0 | Good start, Patronato dangerous on...",Patronato,3
4,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...,Torrey Pines,4
5,Generate a tweet about uma musume.,saw someone have to censor their uma musume ar...,uma musume,5
...,...,...,...,...
22169,Generate a tweet about Dolph Ziggler.,Need Omos to sell like Dolph Ziggler for the R...,Dolph Ziggler,22171
22170,Generate a tweet about Connor Williams.,"As expected, the entire starting offensive lin...",Connor Williams,22172
22171,Generate a tweet about HRT.,looking at pics of girls 1 year into HRT alrea...,HRT,22173
22172,Generate a tweet about Charlene.,At this wedding and the dj just played Charlen...,Charlene,22174


In [None]:
# Function to drop rows with specific instruction
def drop_specific_instruction(dataframe, instruction_text):
    """
    Drops the rows whose 'instruction' value equals the given text exactly.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing an 'instruction' column.
    instruction_text (str): The instruction text identifying the rows to drop.

    Returns:
    pd.DataFrame: The rows of the input whose instruction differs from 'instruction_text'.
    """
    # Keep only the rows with a different instruction.
    is_other_instruction = dataframe['instruction'].ne(instruction_text)
    return dataframe[is_other_instruction]

# Apply the function to drop rows with the specific instruction
df = drop_specific_instruction(df, "Generate a tweet about 𝙉𝙖𝙪𝙩𝙞𝙘𝙖.")
print_detection(df)

Number of rows:  21229
-----
link_count 0
mention_count 0
hashtag_count 3188
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 13


In [None]:
def show_repetitive_symbols(dataframe):
    """
    Returns rows where the 'text' column contains repetitive punctuation (e.g., '!!!', ',,,', etc.).

    A row matches when one of '.', ',', '!' or '?' appears three or more times in a row.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.

    Returns:
    pd.DataFrame: A DataFrame with rows that have repetitive punctuation.
    """
    # The back-reference needs a capturing group, and Series.str.contains warns
    # about patterns with match groups, so search with a compiled regex instead.
    repeated_punctuation = re.compile(r"([.,!?])\1{2,}")

    def has_repetition(text):
        # Non-string values (e.g. NaN) never match.
        return isinstance(text, str) and repeated_punctuation.search(text) is not None

    # astype(bool) keeps the mask boolean even when the frame is empty.
    mask = dataframe['text'].map(has_repetition).astype(bool)
    return dataframe[mask]

show_repetitive_symbols(df)['text']

  return dataframe[dataframe['text'].str.contains(r"([.,!?])\1{2,}", regex=True)]


11       Mary Cosby is the Kim Richards of #RHOSLC and ...
28       Well, I kinda want to stream today, but I don'...
41       This cameo that Ramsey Nouah keeps making at t...
54       (dubcon noncon impreg breeding toxic relations...
74       im upset nescafe discontinued the fruity latte...
                               ...                        
22124    Monica Lewinsky wasn’t THE ONLY person involve...
22141    krispy kreme boxes are IMPOSSIBLE to open do i...
22147    Did Rod Smith have grey hair during the 2019 s...
22157    rihanna really gave us love on the brain... li...
22158    They let Mercedes Martinez go and kept Eva Mar...
Name: text, Length: 1406, dtype: object

In [None]:
# Function to limit repetitive punctuation
def remove_repetitive_symbols(dataframe):
    """
    Removes excessive punctuation (e.g., ',,,', etc.) by limiting to a single instance.

    Only runs of ',', '&', '*', '%' and '~' are collapsed; other characters are left as they are.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.

    Returns:
    pd.DataFrame: A new DataFrame with the collapsed text; the input frame is not modified.
    """
    collapsed_text = dataframe['text'].str.replace(r"([,&*%~])\1+", r"\1", regex=True)
    # Return a new frame instead of assigning into the caller's frame, which may be
    # a filtered slice of another frame (that is what triggers SettingWithCopyWarning).
    return dataframe.assign(text=collapsed_text)

df = remove_repetitive_symbols(df)
print_detection(df)

Number of rows:  21229
-----
link_count 0
mention_count 0
hashtag_count 3188
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 13


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['text'] = dataframe['text'].apply(lambda x: re.sub(r"([,&*%~])\1+", r"\1", x))


In [None]:
df['text']

0        School Monday and honestly I've always hated s...
1        dbh the worst game of all time its awful its t...
3        10’ | 0-0 | Good start, Patronato dangerous on...
4        I don't have a big opinion on the Torrey Pines...
5        saw someone have to censor their uma musume ar...
                               ...                        
22169    Need Omos to sell like Dolph Ziggler for the R...
22170    As expected, the entire starting offensive lin...
22171    looking at pics of girls 1 year into HRT alrea...
22172    At this wedding and the dj just played Charlen...
22173    Today has been a two pack of hot ass. *Joe Bud...
Name: text, Length: 21229, dtype: object

In [None]:
print_detection(df)

Number of rows:  21229
-----
link_count 0
mention_count 0
hashtag_count 3188
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 13


In [None]:
# remove the noise data
def remove_other_noises(dataframe):
    """
    Drops click-bait rows and retweets, then normalizes whitespace in the remaining text.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.

    Returns:
    pd.DataFrame: A new DataFrame without rows that contain "click" (any case) or that
    start with the retweet marker "RT", and with whitespace runs collapsed to a single
    space and leading/trailing whitespace stripped. The input frame is not modified.
    """
    text = dataframe['text']
    mentions_click = text.str.contains("click", case=False)
    # Require a word boundary after "RT" so that texts which merely begin with
    # those two letters (e.g. "RTX ...") are not mistaken for retweets.
    is_retweet = text.str.contains(r"^RT\b")
    kept = dataframe[~(mentions_click | is_retweet)]

    # Remove extra whitespace. Build a new frame rather than assigning into the
    # filtered slice.
    normalized_text = kept['text'].str.replace(r'\s+', ' ', regex=True).str.strip()
    return kept.assign(text=normalized_text)

df = remove_other_noises(df)
print_detection(df)

Number of rows:  21215
-----
link_count 0
mention_count 0
hashtag_count 3187
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
df

Unnamed: 0,instruction,text,target,__index_level_0__
0,Generate a tweet about FAMU.,School Monday and honestly I've always hated s...,FAMU,0
1,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...,dbh,1
3,Generate a tweet about Patronato.,"10’ | 0-0 | Good start, Patronato dangerous on...",Patronato,3
4,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...,Torrey Pines,4
5,Generate a tweet about uma musume.,saw someone have to censor their uma musume ar...,uma musume,5
...,...,...,...,...
22169,Generate a tweet about Dolph Ziggler.,Need Omos to sell like Dolph Ziggler for the R...,Dolph Ziggler,22171
22170,Generate a tweet about Connor Williams.,"As expected, the entire starting offensive lin...",Connor Williams,22172
22171,Generate a tweet about HRT.,looking at pics of girls 1 year into HRT alrea...,HRT,22173
22172,Generate a tweet about Charlene.,At this wedding and the dj just played Charlen...,Charlene,22174


In [None]:
import re

def remove_excessive_repetitions(dataframe):
    """
    Reduces excessive repetitions of the same character (like 'DDDD...') to two instances.

    A run of three or more identical characters is replaced by exactly two of them.
    Digits are left alone so that numbers keep their value (e.g. '1000' is not turned
    into '100'). Newlines are not collapsed.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.

    Returns:
    pd.DataFrame: A new DataFrame with excessive character repetitions reduced; the
    input frame is not modified.
    """
    # Any non-digit, non-newline character repeated 3+ times -> keep just 2 of them.
    squeezed_text = dataframe['text'].str.replace(r"([^\d\n])\1{2,}", r"\1\1", regex=True)
    # Return a new frame instead of writing into the caller's (possibly sliced) frame.
    return dataframe.assign(text=squeezed_text)

# Apply the function to clean excessive character repetitions
df = remove_excessive_repetitions(df)
print_detection(df)

Number of rows:  21215
-----
link_count 0
mention_count 0
hashtag_count 3187
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
# Function to count rows with more than a specified number of emojis in the text
def count_rows_with_excessive_emojis(dataframe, emoji_limit=2):
    """
    Counts the number of rows in the DataFrame where the 'text' column contains more than the specified number of emojis.

    Every matching code point counts as one emoji, so a flag (two regional-indicator
    code points) counts as two. Variation selectors and zero-width joiners are not counted.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.
    emoji_limit (int): The maximum allowed number of emojis in each text.

    Returns:
    int: The number of rows with more than the specified number of emojis.
    """
    # Matches ONE emoji code point. The former "\U000024C2-\U0001F251" range spanned
    # most of Unicode (CJK, Hangul, ...), so ordinary non-Latin text was counted as emoji.
    emoji_char = re.compile("["
                            "\U0001F600-\U0001F64F"  # Emoticons
                            "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                            "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                            "\U0001F700-\U0001F77F"  # Alchemical Symbols
                            "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                            "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                            "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                            "\U0001FA00-\U0001FA6F"  # Chess Symbols
                            "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                            "\u2600-\u26FF"          # Miscellaneous Symbols
                            "\u2702-\u27B0"          # Dingbats
                            "\u2B50\u2B55"           # Star and heavy circle
                            "\u24C2"                 # Circled M
                            "\U0001F170-\U0001F251"  # Enclosed characters (incl. flag indicators)
                            "]", flags=re.UNICODE)

    # Count individual emojis per row (not runs of emojis), then compare with the limit.
    emoji_counts = dataframe['text'].apply(lambda text: len(emoji_char.findall(text)))
    rows_with_excessive_emojis = emoji_counts.gt(emoji_limit).sum()

    return rows_with_excessive_emojis

# Demonstration on the sample DataFrame
rows_with_excessive_emojis_count = count_rows_with_excessive_emojis(df, emoji_limit=2)
rows_with_excessive_emojis_count


np.int64(196)

In [None]:
# Adjusted function to apply to a DataFrame column
def limit_sequential_emojis_df(dataframe, column_name='text', emoji_limit=2):
    """
    Limits back-to-back repetitions of the same emoji (or emoji group) in the specified
    DataFrame column to a maximum of 'emoji_limit' per sequence.

    Only repetitions of an identical emoji or identical emoji group are shortened;
    a run of different emojis is left unchanged.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the column with text.
    column_name (str): The name of the column containing text with emojis.
    emoji_limit (int): The maximum allowed number of consecutive repetitions.

    Returns:
    pd.DataFrame: A new DataFrame with the specified column's text processed to limit
    consecutive emojis; the input frame is not modified.
    """
    # Character class for ONE emoji code point. The former "\U000024C2-\U0001F251"
    # range spanned most of Unicode (CJK, Hangul, ...), so repeated non-Latin letters
    # were shortened as if they were emojis.
    emoji_class = ("["
                   "\U0001F600-\U0001F64F"  # Emoticons
                   "\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                   "\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                   "\U0001F700-\U0001F77F"  # Alchemical Symbols
                   "\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                   "\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                   "\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                   "\U0001FA00-\U0001FA6F"  # Chess Symbols
                   "\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                   "\u2600-\u26FF"          # Miscellaneous Symbols
                   "\u2702-\u27B0"          # Dingbats
                   "\u2B50\u2B55"           # Star and heavy circle
                   "\u24C2"                 # Circled M
                   "\U0001F170-\U0001F251"  # Enclosed characters (incl. flag indicators)
                   "]")

    # The group is lazy (+?) so the SHORTEST repeating unit is captured. A greedy
    # group would capture e.g. two emojis out of six identical ones and leave four
    # behind, which is more than the limit.
    repeated_emojis = re.compile(rf"({emoji_class}+?)\1{{{emoji_limit},}}", flags=re.UNICODE)
    replacement = r"\1" * emoji_limit

    # Function to apply to each text entry in the specified column
    def limit_emojis(text):
        return repeated_emojis.sub(replacement, text)

    # Build a new frame rather than writing into the caller's (possibly sliced) frame.
    limited_column = dataframe[column_name].apply(limit_emojis)
    return dataframe.assign(**{column_name: limited_column})

# Apply the function to the DataFrame
df = limit_sequential_emojis_df(df, column_name='text', emoji_limit=2)
print_detection(df)


Number of rows:  21215
-----
link_count 0
mention_count 0
hashtag_count 3187
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
df

Unnamed: 0,instruction,text,target,__index_level_0__
0,Generate a tweet about FAMU.,School Monday and honestly I've always hated s...,FAMU,0
1,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...,dbh,1
3,Generate a tweet about Patronato.,"10’ | 0-0 | Good start, Patronato dangerous on...",Patronato,3
4,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...,Torrey Pines,4
5,Generate a tweet about uma musume.,saw someone have to censor their uma musume ar...,uma musume,5
...,...,...,...,...
22169,Generate a tweet about Dolph Ziggler.,Need Omos to sell like Dolph Ziggler for the R...,Dolph Ziggler,22171
22170,Generate a tweet about Connor Williams.,"As expected, the entire starting offensive lin...",Connor Williams,22172
22171,Generate a tweet about HRT.,looking at pics of girls 1 year into HRT alrea...,HRT,22173
22172,Generate a tweet about Charlene.,At this wedding and the dj just played Charlen...,Charlene,22174


In [None]:
def drop_rows_with_target_start(dataframe, column_name='target', target_start="$"):
    """
    Drops the rows whose value in the given column begins with the given prefix.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the column with text.
    column_name (str): The name of the column whose values are checked.
    target_start (str): The prefix (one or more characters) that marks a row for removal.

    Returns:
    pd.DataFrame: The rows of the input whose column value does not start with the prefix.
    """
    # Literal prefix test on every value of the column.
    has_prefix = dataframe[column_name].str.startswith(target_start)
    keep_rows = ~has_prefix
    return dataframe[keep_rows]

df = drop_rows_with_target_start(df, column_name='target', target_start="$")
print_detection(df)

Number of rows:  21207
-----
link_count 0
mention_count 0
hashtag_count 3186
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
# Function to count rows with repetitive values in a specified column
def count_repetitive_values(dataframe, column_name='target'):
    """
    Counts the rows whose value in the given column occurs more than once in that column.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the column to inspect.
    column_name (str): The name of the column to check for repeated values.

    Returns:
    int: The number of rows that share their column value with at least one other row.
    """
    # keep=False flags every member of a duplicated group, including the first one.
    column = dataframe[column_name]
    is_repeated = column.duplicated(keep=False)
    return is_repeated.sum()

count_repetitive_values(df, column_name='target')

np.int64(9622)

In [None]:
# Function to keep only one row per unique target, keeping the row with the longest 'text' value
def keep_longest_text_per_target(dataframe, target_column='target', text_column='text'):
    """
    Keeps only one row per unique value in the target column, selecting the row with the longest text in the text column.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the target and text columns.
    target_column (str): The name of the column with target values.
    text_column (str): The name of the column with text values.

    Returns:
    pd.DataFrame: A new DataFrame with one row per unique target, keeping the longest
    text, ordered by target value. The input frame is not modified.
    """
    # Attach the helper length column to a copy (assign) so the caller's frame does not
    # gain a 'text_length' column, and sort by the requested target column rather than
    # a hard-coded 'target'.
    ranked = (
        dataframe
        .assign(text_length=dataframe[text_column].str.len())
        .sort_values(by=[target_column, 'text_length'], ascending=[True, False])
    )

    # After sorting, the first row of each target group is the one with the longest text.
    unique_targets_df = ranked.drop_duplicates(subset=target_column, keep='first').drop(columns='text_length')

    return unique_targets_df

# Apply the function to keep the longest text per target
df = keep_longest_text_per_target(df, target_column='target', text_column='text')
print_detection(df)

Number of rows:  14034
-----
link_count 0
mention_count 0
hashtag_count 2291
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  dataframe['text_length'] = dataframe[text_column].str.len()  # Calculate text length


In [None]:
df

Unnamed: 0,instruction,text,target,__index_level_0__
1438,Generate a tweet about #ApexLegends.,Please remove the L-Star from Ranked Arenas. S...,#ApexLegends,1438
576,Generate a tweet about #FSU.,So can everyone walk back off that KZ ledge no...,#FSU,576
21461,Generate a tweet about #MIT.,Been waiting for a while to shout it out loud ...,#MIT,21463
11233,Generate a tweet about #Srinagar.,#BREAKING : TRF commander Mehran and Basit Mar...,#Srinagar,11235
16934,Generate a tweet about #İstanbul.,Finally arrived to #İstanbul and immediately s...,#İstanbul,16936
...,...,...,...,...
21100,"Generate a tweet about 𝐃𝐈𝐀𝐁𝐄𝐓𝐄𝐒"".",𝐅𝐀𝐂𝐓𝐒 𝐀𝐁𝐎𝐔𝐓 𝐇𝐁𝐏 & 𝐃𝐈𝐀𝐁𝐄𝐓𝐄𝐒 to everyone. Knowle...,"𝐃𝐈𝐀𝐁𝐄𝐓𝐄𝐒""",21102
8745,Generate a tweet about 𝐡𝐢𝐠𝐡 𝐬𝐜𝐡𝐨𝐨𝐥.,anyone still calling kuroo a rooster head is b...,𝐡𝐢𝐠𝐡 𝐬𝐜𝐡𝐨𝐨𝐥,8747
1248,Generate a tweet about 🇩🇪.,🇩🇪 AfD set to lose status as Third Party in Bu...,🇩🇪,1248
2721,Generate a tweet about 🇪🇬.,RETURN ON TOKYO 2020 OLYMPIC 51 🇸🇰Slovakia - 1...,🇪🇬,2721


In [None]:
from huggingface_hub import HfApi, HfFolder
from datasets import Dataset

# Read the Hugging Face access token from the HF_TOKEN environment variable so that
# no secret is stored in the notebook.
# NOTE(review): a token literal was previously hard-coded on this line; if it was a
# real token it should be revoked.
huggingface_token = os.environ.get("HF_TOKEN")
if not huggingface_token:
    raise RuntimeError("Set the HF_TOKEN environment variable to your Hugging Face access token.")
HfFolder.save_token(huggingface_token)

In [None]:
# Function to count rows with excessive non-important symbols in the text column
def count_excessive_symbols(dataframe, text_column='text', keep_symbols=[",", ".", "!", "?", "#"], max_repeats=3):
    """
    Counts the rows whose text contains an excessive run of a non-important symbol.

    A symbol here is any character that is not an ASCII letter, an ASCII digit,
    whitespace, or one of 'keep_symbols'.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the text data.
    text_column (str): The name of the text column to inspect.
    keep_symbols (list): Symbols that are never counted, however often they repeat.
    max_repeats (int): How many times a symbol may follow itself before the run is excessive.

    Returns:
    int: The number of rows containing at least one excessive run.
    """
    # Escape the kept symbols so they are safe inside the character class.
    escaped_keep = ''.join(map(re.escape, keep_symbols))
    run_regex = re.compile(rf"([^a-zA-Z0-9{escaped_keep}\s])\1{{{max_repeats},}}")

    def has_excessive_run(text):
        return bool(run_regex.search(text))

    # One boolean per row; summing counts the flagged rows.
    flagged = dataframe[text_column].apply(has_excessive_run)
    return flagged.sum()

excessive_symbols_count = count_excessive_symbols(df, text_column='text', keep_symbols=[",", ".", "!", "?", "#"], max_repeats=1)
excessive_symbols_count

np.int64(576)

In [None]:
print_detection(df)

Number of rows:  14034
-----
link_count 0
mention_count 0
hashtag_count 2291
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
def remove_excessive_symbols(dataframe, text_column='text', keep_symbols=[",", ".", "!", "?", "#"], max_repeats=1):
    """
    Drops the rows whose text contains an excessive run of a non-important symbol.

    A symbol here is any character that is not an ASCII letter, an ASCII digit,
    whitespace, or one of 'keep_symbols'.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the text data.
    text_column (str): The name of the text column to inspect.
    keep_symbols (list): Symbols that are never treated as excessive, however often they repeat.
    max_repeats (int): How many times a symbol may follow itself before the run is excessive.

    Returns:
    pd.DataFrame: The rows of the input whose text has no excessive symbol run.
    """
    # Escape the kept symbols so they are safe inside the character class.
    escaped_keep = ''.join(map(re.escape, keep_symbols))
    run_regex = re.compile(rf"([^a-zA-Z0-9{escaped_keep}\s])\1{{{max_repeats},}}")

    def has_excessive_run(text):
        return bool(run_regex.search(text))

    # Flag offending rows, then keep everything else.
    flagged = dataframe[text_column].apply(has_excessive_run)
    return dataframe[~flagged]

df = remove_excessive_symbols(df)
print_detection(df)

Number of rows:  13458
-----
link_count 0
mention_count 0
hashtag_count 2210
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
def remove_link_in_bio(dataframe, text_column='text'):
    """
    Drops the rows whose text contains the phrase "link in bio", ignoring letter case.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the text data.
    text_column (str): The name of the text column to inspect.

    Returns:
    pd.DataFrame: The rows of the input whose text does not contain "link in bio".
    """
    # The inline (?i) flag makes the search case-insensitive.
    has_phrase = dataframe[text_column].str.contains(r"(?i)link in bio", regex=True)
    keep_rows = ~has_phrase
    return dataframe[keep_rows]

df = remove_link_in_bio(df)
print_detection(df)

Number of rows:  13453
-----
link_count 0
mention_count 0
hashtag_count 2210
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 0


In [None]:
def push_to_huggingface(df, dataset_name, repo_id):
    """
    Converts a DataFrame to a Hugging Face Dataset and pushes it to the given repository.

    Parameters:
    df (pd.DataFrame): The DataFrame to upload.
    dataset_name (str): The name shown in the confirmation message.
    repo_id (str): The repository id passed to 'push_to_hub'; also used to build the printed URL.

    Returns:
    None: A confirmation message is printed after the upload.
    """
    # Convert the DataFrame to Hugging Face Dataset format.
    # preserve_index=False keeps the pandas index out of the dataset; a filtered frame
    # has a non-default index that would otherwise be exported as an extra
    # '__index_level_0__' column.
    dataset = Dataset.from_pandas(df, preserve_index=False)

    # Push the dataset to Hugging Face
    dataset.push_to_hub(repo_id)

    print(f"Dataset '{dataset_name}' pushed to Hugging Face at: https://huggingface.co/datasets/{repo_id}")

In [None]:
# Upload only the 'instruction', 'text' and 'target' columns; the same id is passed
# as both the display name and the target repository.
push_to_huggingface(
    df[['instruction', 'text', 'target']],
    "AlanYky/tweets-topic-instruct-filtered",
    "AlanYky/tweets-topic-instruct-filtered"
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/14 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/395 [00:00<?, ?B/s]

Dataset 'AlanYky/tweets-topic-instruct-filtered' pushed to Hugging Face at: https://huggingface.co/datasets/AlanYky/tweets-topic-instruct-filtered
