# Data Cleaning on Tweets

In [None]:
from datasets import load_dataset
import pandas as pd
import re



In [None]:
# Load the dataset
dataset = load_dataset("AlanYky/tweets_instruct")

In [None]:
dataset

DatasetDict({
    people_event_instruct: Dataset({
        features: ['instruction', 'text'],
        num_rows: 22174
    })
    topic_instruct: Dataset({
        features: ['instruction', 'text'],
        num_rows: 6067
    })
    general_instruct: Dataset({
        features: ['instruction', 'text'],
        num_rows: 94035
    })
    tones_instruct: Dataset({
        features: ['instruction', 'text'],
        num_rows: 78184
    })
})

In [None]:
# Function to count rows with more than a specified number of emojis in the text
def count_rows_with_excessive_emojis(dataframe, emoji_limit=2):
    """
    Counts the number of rows in the DataFrame where the 'text' column contains more than the specified number of emojis.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing a 'text' column.
    emoji_limit (int): The maximum allowed number of emojis in each text.

    Returns:
    int: The number of rows with more than the specified number of emojis.
    """
    # Define a regex pattern for matching emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # Emoticons
                               u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               u"\U0001F700-\U0001F77F"  # Alchemical Symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"  # Enclosed characters
                               "]+", flags=re.UNICODE)

    # Count rows with more than 'emoji_limit' emojis in the 'text' column
    rows_with_excessive_emojis = dataframe['text'].apply(lambda x: len(emoji_pattern.findall(x)) > emoji_limit).sum()

    return rows_with_excessive_emojis


# Print the Noise Signal
def print_detection(data):
    num_rows = len(data)
    print("Number of data: ", num_rows)
    print("-----")
    link_count = data['text'].str.contains(r"http\S+|www\.\S+", case=False).sum()
    mention_count = data['text'].str.contains(r"@\w+").sum()
    hashtag_count = data['text'].str.contains(r"#\w+").sum()
    long_text_count = data['text'].str.split().apply(len).gt(250).sum()

    multiple_mentions_count = data['text'].str.count(r"@\w+").gt(1).sum()
    multiple_hashtags_count = data['text'].str.count(r"#\w+").gt(2).sum()
    # low_quality_content_count = data['text'].str.contains("click", case=False).sum()
    low_quality_content_count = data['text'].str.contains(r"click|sale|subscribe|link in bio|RT", case=False).sum()

    print("link_count", link_count)
    print("mention_count", mention_count)
    print("hashtag_count", hashtag_count)
    print("long_text_count", long_text_count)
    print("Texts with more than 1 mention:", multiple_mentions_count)
    print("Texts with more than 2 hashtag:", multiple_hashtags_count)
    print("low_quality_content_count", low_quality_content_count)
    print("Excessive Emoji", count_rows_with_excessive_emojis(data))

# Remove the Link Data
# Delete rows containing links in the text column
def remove_links(dataframe):
    dataframe = dataframe[~dataframe['text'].str.contains(r"http\S+|www\.\S+", case=False, regex=True)]
    return dataframe


# Remove the Extreme Long Data
def remove_long_texts(dataframe, word_limit=250):
    return dataframe[~dataframe['text'].str.split().apply(len).gt(word_limit)]

# Remove the data have mentions @
def remove_mentions(dataframe):
    return dataframe[~dataframe['text'].str.contains(r"@\w+")]

# Remove the data have more than 2 hashtags
def remove_excessive_hashtags(dataframe, hashtag_limit=2):
    # Filter out rows with more hashtags than the specified limit
    return dataframe[~dataframe['text'].str.count(r"#\w+").gt(hashtag_limit)]

# Remove the low quality data (with too advertisement text)
def remove_low_quality_content(dataframe):
    low_quality_pattern = r"click|subscribe|link in bio|RT"
    dataframe = dataframe[~dataframe['text'].str.contains(low_quality_pattern, case=False, regex=True)]
    return dataframe

# Remove the excessive symbol (repetitive symbol) (e.g., ',,,', etc.)
def remove_excessive_symbols(dataframe, text_column='text', keep_symbols=[",", ".", "!", "?", "#"], max_repeats=1):
    # Create a regex pattern for any symbol except the ones in 'keep_symbols'
    keep_pattern = ''.join(re.escape(sym) for sym in keep_symbols)
    pattern = rf"([^a-zA-Z0-9{keep_pattern}\s])\1{{{max_repeats},}}"

    # Filter out rows with excessive symbols
    cleaned_dataframe = dataframe[~dataframe[text_column].apply(lambda x: bool(re.search(pattern, x)))]

    return cleaned_dataframe

# Remove the excessive emoji
def limit_sequential_emojis_df(dataframe, column_name='text', emoji_limit=2):
    """
    Limits sequential expressive emojis in the specified DataFrame column to a maximum of 'emoji_limit' per sequence.

    Parameters:
    dataframe (pd.DataFrame): The DataFrame containing the column with text.
    column_name (str): The name of the column containing text with emojis.
    emoji_limit (int): The maximum allowed number of consecutive emojis.

    Returns:
    pd.DataFrame: A DataFrame with the specified column's text processed to limit consecutive emojis.
    """
    # Define a regex pattern for matching emojis
    emoji_pattern = re.compile("["
                               u"\U0001F600-\U0001F64F"  # Emoticons
                               u"\U0001F300-\U0001F5FF"  # Symbols & Pictographs
                               u"\U0001F680-\U0001F6FF"  # Transport & Map Symbols
                               u"\U0001F700-\U0001F77F"  # Alchemical Symbols
                               u"\U0001F780-\U0001F7FF"  # Geometric Shapes Extended
                               u"\U0001F800-\U0001F8FF"  # Supplemental Arrows-C
                               u"\U0001F900-\U0001F9FF"  # Supplemental Symbols and Pictographs
                               u"\U0001FA00-\U0001FA6F"  # Chess Symbols
                               u"\U0001FA70-\U0001FAFF"  # Symbols and Pictographs Extended-A
                               u"\U00002702-\U000027B0"  # Dingbats
                               u"\U000024C2-\U0001F251"  # Enclosed characters
                               "]+", flags=re.UNICODE)

    # Function to apply to each text entry in the specified column
    def limit_emojis(text):
        return re.sub(rf"({emoji_pattern.pattern})\1{{{emoji_limit},}}", r"\1" * emoji_limit, text)

    # Apply the limit_emojis function to the specified column in the DataFrame
    dataframe[column_name] = dataframe[column_name].apply(limit_emojis)
    return dataframe

# Drop Specific Data by Instruction
def drop_specific_instruction(dataframe, instruction_text):
    return dataframe[dataframe['instruction'] != instruction_text]

# people_event_instruct

In [None]:
df_people = dataset['people_event_instruct'].to_pandas()
df_people

Unnamed: 0,instruction,text
0,Generate a tweet about FAMU.,School Monday and honestly I've always hated s...
1,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...
2,Generate a tweet about Madrid.,I wish I had a @user cheki to do that wota thi...
3,Generate a tweet about Patronato.,"10‚Äô | 0-0 | Good start, Patronato dangerous on..."
4,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...
...,...,...
22169,Generate a tweet about Dolph Ziggler.,Need Omos to sell like Dolph Ziggler for the R...
22170,Generate a tweet about Connor Williams.,"As expected, the entire starting offensive lin..."
22171,Generate a tweet about HRT.,looking at pics of girls 1 year into HRT alrea...
22172,Generate a tweet about Charlene.,At this wedding and the dj just played Charlen...


In [None]:
print_detection(df_people)

Number of data:  22174
-----
link_count 0
mention_count 930
hashtag_count 3494
long_text_count 1
Texts with more than 1 mention: 4
Texts with more than 2 hashtag: 24
low_quality_content_count 4563
Excessive Emoji 210


In [None]:
df_people = remove_links(df_people)
print_detection(df_people)

Number of data:  22174
-----
link_count 0
mention_count 930
hashtag_count 3494
long_text_count 1
Texts with more than 1 mention: 4
Texts with more than 2 hashtag: 24
low_quality_content_count 4563
Excessive Emoji 210


In [None]:
df_people = drop_specific_instruction(df_people, "Generate a tweet about ùôâùôñùô™ùô©ùôûùôòùôñ.")
print_detection(df_people)

Number of data:  22173
-----
link_count 0
mention_count 930
hashtag_count 3494
long_text_count 1
Texts with more than 1 mention: 4
Texts with more than 2 hashtag: 24
low_quality_content_count 4563
Excessive Emoji 209


In [None]:
df_people = remove_long_texts(df_people)
print_detection(df_people)

Number of data:  22172
-----
link_count 0
mention_count 930
hashtag_count 3493
long_text_count 0
Texts with more than 1 mention: 4
Texts with more than 2 hashtag: 23
low_quality_content_count 4562
Excessive Emoji 208


In [None]:
df_people = remove_mentions(df_people)
print_detection(df_people)

Number of data:  21242
-----
link_count 0
mention_count 0
hashtag_count 3201
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 13
low_quality_content_count 4301
Excessive Emoji 196


In [None]:
df_people = remove_excessive_hashtags(df_people)
print_detection(df_people)

Number of data:  21229
-----
link_count 0
mention_count 0
hashtag_count 3188
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 4293
Excessive Emoji 196


In [None]:
df_people = remove_low_quality_content(df_people)
print_detection(df_people)

Number of data:  16981
-----
link_count 0
mention_count 0
hashtag_count 2379
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 45
Excessive Emoji 140


In [None]:
df_people = remove_excessive_symbols(df_people)
print_detection(df_people)

Number of data:  16145
-----
link_count 0
mention_count 0
hashtag_count 2290
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 41
Excessive Emoji 117


In [None]:
df_people = limit_sequential_emojis_df(df_people)
print_detection(df_people)

Number of data:  16145
-----
link_count 0
mention_count 0
hashtag_count 2290
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 41
Excessive Emoji 117


In [None]:
df_people

Unnamed: 0,instruction,text
1,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...
4,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...
6,Generate a tweet about TAZ.,Listening to TAZ balance and Griffin‚Äôs ‚Äúour ca...
7,Generate a tweet about Norman Reedus.,I've only seen random eps of Helluva Boss but ...
8,Generate a tweet about Nutella.,Banana+Nutella snack pack=someone is gonna see...
...,...,...
22165,Generate a tweet about ARLINGTON ROAD.,P.S. one of my favorite Nicholls scripts is Eh...
22168,Generate a tweet about Keith Taylor.,"Keith Taylor down, taps on his right shoulder ..."
22169,Generate a tweet about Dolph Ziggler.,Need Omos to sell like Dolph Ziggler for the R...
22172,Generate a tweet about Charlene.,At this wedding and the dj just played Charlen...


# topic_instruct

In [None]:
df_topic = dataset['topic_instruct'].to_pandas()
df_topic

Unnamed: 0,instruction,text
0,Write a tweet on the topic of sports and gaming.,The LumberKings beat the Rapids Kernels 4-0 ...
1,Write a tweet on the topic of sports and gaming.,I would rather hear Eli Gold announce this Aub...
2,Write a tweet on the topic of sports and gaming.,"Someone take my phone away, I‚Äôm trying to not ..."
3,Write a tweet on the topic of sports and gaming.,"A year ago, Louisville struggled to beat an FC..."
4,Write a tweet on the topic of sports and gaming.,Anyone know why the #Dodgers #Orioles game nex...
...,...,...
6062,Write a tweet on the topic of daily life.,Praying for family friends riding out IDA be ...
6063,Write a tweet on the topic of business and ent...,"Etsy: Hi there, I m sorry to hear that your a..."
6064,Write a tweet on the topic of pop culture.,Imagine how hard it is to be a Teume. Everyone...
6065,Write a tweet on the topic of pop culture.,Ride With Me - Mental Health Awareness Charity...


In [None]:
print_detection(df_topic)

Number of data:  6067
-----
link_count 2678
mention_count 0
hashtag_count 2419
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 699
low_quality_content_count 1394
Excessive Emoji 66


In [None]:
df_topic = remove_links(df_topic)
print_detection(df_topic)

Number of data:  3389
-----
link_count 0
mention_count 0
hashtag_count 1362
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 307
low_quality_content_count 757
Excessive Emoji 40


In [None]:
df_topic = remove_long_texts(df_topic)
print_detection(df_topic)

Number of data:  3389
-----
link_count 0
mention_count 0
hashtag_count 1362
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 307
low_quality_content_count 757
Excessive Emoji 40


In [None]:
df_topic = remove_mentions(df_topic)
print_detection(df_topic)

Number of data:  3389
-----
link_count 0
mention_count 0
hashtag_count 1362
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 307
low_quality_content_count 757
Excessive Emoji 40


In [None]:
df_topic = remove_excessive_hashtags(df_topic)
print_detection(df_topic)

Number of data:  3082
-----
link_count 0
mention_count 0
hashtag_count 1055
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 665
Excessive Emoji 24


In [None]:
df_topic = remove_low_quality_content(df_topic)
print_detection(df_topic)

Number of data:  2426
-----
link_count 0
mention_count 0
hashtag_count 800
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 9
Excessive Emoji 21


In [None]:
df_topic = remove_excessive_symbols(df_topic)
print_detection(df_topic)

Number of data:  2397
-----
link_count 0
mention_count 0
hashtag_count 790
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 8
Excessive Emoji 13


In [None]:
df_topic = limit_sequential_emojis_df(df_topic)
print_detection(df_topic)

Number of data:  2397
-----
link_count 0
mention_count 0
hashtag_count 790
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 8
Excessive Emoji 13


In [None]:
df_topic

Unnamed: 0,instruction,text
0,Write a tweet on the topic of sports and gaming.,The LumberKings beat the Rapids Kernels 4-0 ...
1,Write a tweet on the topic of sports and gaming.,I would rather hear Eli Gold announce this Aub...
2,Write a tweet on the topic of sports and gaming.,"Someone take my phone away, I‚Äôm trying to not ..."
3,Write a tweet on the topic of sports and gaming.,"A year ago, Louisville struggled to beat an FC..."
4,Write a tweet on the topic of sports and gaming.,Anyone know why the #Dodgers #Orioles game nex...
...,...,...
6056,Write a tweet on the topic of daily life.,Stay safe and healthy hope you see you soon‚ù§Ô∏è...
6057,Write a tweet on the topic of daily life.,Sivaangi isssaaa vibe. Once you see the world ...
6058,Write a tweet on the topic of sports and gaming.,The fact that 90% of my timeline is post about...
6062,Write a tweet on the topic of daily life.,Praying for family friends riding out IDA be ...


# general_instruct

In [None]:
df_general = dataset['general_instruct'].to_pandas()
df_general

Unnamed: 0,instruction,text
0,Can you write a tweet to me?,why am i awake so early? damn projects. super...
1,Can you write a tweet to me?,watching church online because I'd be half an ...
2,Write a human-like tweet.,Hillsong!
3,Can you generate a tweet?,is at Stafford Train Station and just watched ...
4,Can you write a tweet to me?,thanks everyone for the follow fridays!
...,...,...
94030,Write a tweet that looks human written.,"I'm listening to Global Communication - 9 25, ..."
94031,Write a tweet that looks human written.,What why do you say that?
94032,Can you generate a tweet?,back to work... but started with a trip to the...
94033,Generate a human-like tweet to me?,finishing the front of these shirts myself bec...


In [None]:
print_detection(df_general)

Number of data:  94035
-----
link_count 5
mention_count 0
hashtag_count 2025
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 63
low_quality_content_count 8675
Excessive Emoji 0


In [None]:
df_general = remove_links(df_general)
print_detection(df_general)


Number of data:  94030
-----
link_count 0
mention_count 0
hashtag_count 2024
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 63
low_quality_content_count 8674
Excessive Emoji 0


In [None]:
df_general = remove_long_texts(df_general)
print_detection(df_general)


Number of data:  94030
-----
link_count 0
mention_count 0
hashtag_count 2024
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 63
low_quality_content_count 8674
Excessive Emoji 0


In [None]:
df_general = remove_mentions(df_general)
print_detection(df_general)


Number of data:  94030
-----
link_count 0
mention_count 0
hashtag_count 2024
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 63
low_quality_content_count 8674
Excessive Emoji 0


In [None]:
df_general = remove_excessive_hashtags(df_general)
print_detection(df_general)


Number of data:  93967
-----
link_count 0
mention_count 0
hashtag_count 1961
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 8670
Excessive Emoji 0


In [None]:
df_general = remove_low_quality_content(df_general)
print_detection(df_general)


Number of data:  85396
-----
link_count 0
mention_count 0
hashtag_count 1756
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 99
Excessive Emoji 0


In [None]:
df_general = remove_excessive_symbols(df_general)
print_detection(df_general)


Number of data:  84660
-----
link_count 0
mention_count 0
hashtag_count 1748
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 97
Excessive Emoji 0


In [None]:
df_general

Unnamed: 0,instruction,text
0,Can you write a tweet to me?,why am i awake so early? damn projects. super...
1,Can you write a tweet to me?,watching church online because I'd be half an ...
2,Write a human-like tweet.,Hillsong!
3,Can you generate a tweet?,is at Stafford Train Station and just watched ...
4,Can you write a tweet to me?,thanks everyone for the follow fridays!
...,...,...
94028,Can you generate a tweet?,i loooove ice cream and television. i wanna go...
94029,Write a human-like tweet.,is watching 'Sholay'
94030,Write a tweet that looks human written.,"I'm listening to Global Communication - 9 25, ..."
94031,Write a tweet that looks human written.,What why do you say that?


# tones_instruct

In [None]:
df_tones = dataset['tones_instruct'].to_pandas()
df_tones

Unnamed: 0,instruction,text
0,Can you write a tweet with a casual tone?,Bono... who cares. Soon people will understand...
1,Can you write a tweet with a controversial tone?,Eight years the republicans denied obama‚Äôs pic...
2,Can you write a tweet with a casual tone?,Get him some line help. He is gonna be just fi...
3,Can you write a tweet with a controversial tone?,She has become a parody unto herself? She has ...
4,Can you write a tweet with a controversial tone?,Your looking more like a plant #maga #walkaway
...,...,...
78179,Can you write a tweet with a friendly tone?,You're eating skin that could have been sent t...
78180,Can you write a tweet with a controversial tone?,Very important thing for today: \n\nDo not #bu...
78181,Can you write a tweet with a controversial tone?,Which #chutiya #producer #invested in #crap #d...
78182,Can you write a tweet with a controversial tone?,Russia story will infuriate Trump today. Media...


In [None]:
print_detection(df_tones)


Number of data:  78184
-----
link_count 575
mention_count 0
hashtag_count 19258
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 4627
low_quality_content_count 14199
Excessive Emoji 138


In [None]:
df_tones = remove_links(df_tones)
print_detection(df_tones)


Number of data:  77609
-----
link_count 0
mention_count 0
hashtag_count 18978
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 4506
low_quality_content_count 14083
Excessive Emoji 135


In [None]:
df_tones = remove_long_texts(df_tones)
print_detection(df_tones)


Number of data:  77609
-----
link_count 0
mention_count 0
hashtag_count 18978
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 4506
low_quality_content_count 14083
Excessive Emoji 135


In [None]:
df_tones = remove_mentions(df_tones)
print_detection(df_tones)


Number of data:  77609
-----
link_count 0
mention_count 0
hashtag_count 18978
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 4506
low_quality_content_count 14083
Excessive Emoji 135


In [None]:
df_tones = remove_excessive_hashtags(df_tones)
print_detection(df_tones)


Number of data:  73103
-----
link_count 0
mention_count 0
hashtag_count 14472
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 12876
Excessive Emoji 95


In [None]:
df_tones = remove_low_quality_content(df_tones)
print_detection(df_tones)


Number of data:  60440
-----
link_count 0
mention_count 0
hashtag_count 11653
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 213
Excessive Emoji 84


In [None]:
df_tones = remove_excessive_symbols(df_tones)
print_detection(df_tones)

Number of data:  57196
-----
link_count 0
mention_count 0
hashtag_count 11085
long_text_count 0
Texts with more than 1 mention: 0
Texts with more than 2 hashtag: 0
low_quality_content_count 204
Excessive Emoji 71


In [None]:
df_tones

Unnamed: 0,instruction,text
2,Can you write a tweet with a casual tone?,Get him some line help. He is gonna be just fi...
4,Can you write a tweet with a controversial tone?,Your looking more like a plant #maga #walkaway
5,Can you write a tweet with a casual tone?,Been a Willie fan since before most of you wer...
6,Can you write a tweet with a casual tone?,Here's a link to my channel with a plethora of...
7,Can you write a tweet with a controversial tone?,Antifa would burn a Conservatives house down a...
...,...,...
78176,Can you write a tweet with a controversial tone?,Everything I order online just comes looking l...
78177,Can you write a tweet with a controversial tone?,Some 'friends' get bitter when it seems your l...
78179,Can you write a tweet with a friendly tone?,You're eating skin that could have been sent t...
78182,Can you write a tweet with a controversial tone?,Russia story will infuriate Trump today. Media...


# Combine df_people, df_general, df_tones, df_topic

In [None]:
from huggingface_hub import HfApi, HfFolder
from datasets import Dataset

huggingface_token = "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"  # Replace with your token
HfFolder.save_token(huggingface_token)

In [None]:
df_people['subset'] = 'people_event_instruct'
df_general['subset'] = 'general_instruct'
df_tones['subset'] = 'tones_instruct'
df_topic['subset'] = 'topic_instruct'

# Concatenate the DataFrames
combined_df = pd.concat([df_people, df_general, df_tones, df_topic], ignore_index=True)

In [None]:
combined_df

Unnamed: 0,instruction,text,subset
0,Generate a tweet about dbh.,dbh the worst game of all time its awful its t...,people_event_instruct
1,Generate a tweet about Torrey Pines.,I don't have a big opinion on the Torrey Pines...,people_event_instruct
2,Generate a tweet about TAZ.,Listening to TAZ balance and Griffin‚Äôs ‚Äúour ca...,people_event_instruct
3,Generate a tweet about Norman Reedus.,I've only seen random eps of Helluva Boss but ...,people_event_instruct
4,Generate a tweet about Nutella.,Banana+Nutella snack pack=someone is gonna see...,people_event_instruct
...,...,...,...
160393,Write a tweet on the topic of daily life.,Stay safe and healthy hope you see you soon‚ù§Ô∏è...,topic_instruct
160394,Write a tweet on the topic of daily life.,Sivaangi isssaaa vibe. Once you see the world ...,topic_instruct
160395,Write a tweet on the topic of sports and gaming.,The fact that 90% of my timeline is post about...,topic_instruct
160396,Write a tweet on the topic of daily life.,Praying for family friends riding out IDA be ...,topic_instruct


In [None]:
def push_to_huggingface(df, dataset_name, repo_id):
    # Convert the DataFrame to Hugging Face Dataset format
    dataset = Dataset.from_pandas(df)

    # Push the dataset to Hugging Face
    dataset.push_to_hub(repo_id)

    print(f"Dataset '{dataset_name}' pushed to Hugging Face at: https://huggingface.co/datasets/{repo_id}")

In [None]:
dataset_name = "AlanYky/tweets_instruct_v2"
repo_id = "AlanYky/tweets_instruct_v2"

push_to_huggingface(
    df=combined_df,
    dataset_name=dataset_name,
    repo_id=repo_id
)

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/161 [00:00<?, ?ba/s]

Dataset 'AlanYky/tweets_instruct_v2' pushed to Hugging Face at: https://huggingface.co/datasets/AlanYky/tweets_instruct_v2
