In [1]:
%load_ext cudf.pandas
import pandas as pd

In [19]:
%%cudf.pandas.profile

df = pd.read_csv("./dataset_folder/archive/file.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [20]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


In [21]:
df.isna().sum()

tweets    0
labels    0
dtype: int64

In [22]:
df.duplicated().sum()

1671

In [23]:
df = df.drop_duplicates()

In [24]:
df_good = df[df.labels == "good"].sample(
    n=5000,
    replace=False,
    ignore_index=True
)
df_good

Unnamed: 0,tweets,labels
0,Fascinating. Where ChatGPT draws the line is s...,good
1,Day 10 / #100DaysOfCode \n\nToday I explored #...,good
2,Won't be surprised if a few companies replace ...,good
3,Predictions from #ChatGPT should be interestin...,good
4,Someone told me about this OpenAI ChatGPT thin...,good
...,...,...
4995,"The difference I see in OpenAI's ChatGPT, vs p...",good
4996,I just spent about 40min playing around with C...,good
4997,Question : What is the best freemium model for...,good
4998,#technology #artificialintelligence #online He...,good


In [25]:
df_bad = df[df.labels == "bad"].sample(
    n=5000,
    replace=False,
    ignore_index=True
)
df_bad

Unnamed: 0,tweets,labels
0,I’m officially a ChatGPT user https://t.co/agp...,bad
1,ChatGPT Holds Promise and Peril /// https://t....,bad
2,Most tweeted articles today in Artificial Inte...,bad
3,"What is ChatGPT, the AI taking the web by stor...",bad
4,"Again To ChatGPT:\n""Refactor the script by usi...",bad
...,...,...
4995,"Instead of making ChatGPT an oracle, what if y...",bad
4996,Top story: Ist die ChatGPT AI der ultimative G...,bad
4997,Chatgpt is the future 💯,bad
4998,ChatGPT: Everything you really need to know (I...,bad


In [26]:
df_neutral = df[df.labels == "neutral"].sample(
    n=5000,
    replace=False,
    ignore_index=True
)
df_neutral

Unnamed: 0,tweets,labels
0,"Mosaic (1993), iPhone (2007), ChatGPT (2022). ...",neutral
1,US Top News | Wed | 14 Dec | 20:36 | UTC | Wha...,neutral
2,ChatGPT is the solution to all the people that...,neutral
3,I can't stop laughing #memes #meme #funny #Ope...,neutral
4,💡 Is there a path to enlightenment? I asked an...,neutral
...,...,...
4995,ChatGPT Could Soon Be the Better Way to Google...,neutral
4996,I used ChatGPT to do my holiday shopping this ...,neutral
4997,"#ChatGPT says no, but I think it means yes 😂 h...",neutral
4998,Asked ChatGPT an interesting onc clinical ques...,neutral


In [27]:
df_new = pd.concat([df_good, df_bad, df_neutral]).sample(
    frac=1,
    replace=False
)
df_new

Unnamed: 0,tweets,labels
3292,I just spoke in #Esperanto with #ChatGPT :D ht...,bad
4533,1/ @OpenAI's ChatGPT + @__UBERDUCK__ + my lac...,neutral
4631,ChatGPT is interesting but I am curious when a...,good
2200,ChatGPT is an amazing achievement but it won't...,good
2482,Wow! #ChatGPT can learn some pretty nifty tric...,good
...,...,...
644,Looks like the Cavemen put #chatGPT through a ...,good
4456,Bro HHHHHHHHHHHHHH\nthis #ChatGPT is just amaz...,good
4295,I copied a pasted a math problem from https://...,bad
3130,chatGPT writes better code than half the profs...,neutral


In [28]:
df_new.duplicated().sum()

0

# Defining A function for processing the text

In [15]:
import spacy
import re

nlp = spacy.load('en_core_web_sm')


def preprocess(text):
    
    text_without_websites = re.sub(r'https?://\S+', '', text)
    text_without_line_breaks = text_without_websites.replace('\n', ' ')
    text = text_without_line_breaks.replace("  ", " ").strip()
    
    filtered_tokens = []
    
    for token in nlp(text):
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
        
    return ' '.join(filtered_tokens)

In [29]:
df_new['num_labels'] = df_new.labels.map({
    "neutral":0,
    "good":1,
    "bad":2
})
df_new.head()

Unnamed: 0,tweets,labels,num_labels
3292,I just spoke in #Esperanto with #ChatGPT :D ht...,bad,2
4533,1/ @OpenAI's ChatGPT + @__UBERDUCK__ + my lac...,neutral,0
4631,ChatGPT is interesting but I am curious when a...,good,1
2200,ChatGPT is an amazing achievement but it won't...,good,1
2482,Wow! #ChatGPT can learn some pretty nifty tric...,good,1


In [30]:
%%cudf.pandas.profile

df_new['processed_tweets'] = df['tweets'].apply(preprocess)

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad
...,...,...
219289,Other Software Projects Are Now Trying to Repl...,bad
219290,I asked #ChatGPT to write a #NYE Joke for SEOs...,good
219291,chatgpt is being disassembled until it can onl...,bad
219292,2023 predictions by #chatGPT. Nothing really s...,bad


In [31]:
df_new

Unnamed: 0,tweets,labels,num_labels,processed_tweets
3292,I just spoke in #Esperanto with #ChatGPT :D ht...,bad,2,chatgpt like fr
4533,1/ @OpenAI's ChatGPT + @__UBERDUCK__ + my lac...,neutral,0,ask ChatGPT AI write movie plot Rob Schneider ...
4631,ChatGPT is interesting but I am curious when a...,good,1,chatgpt actually crazy
2200,ChatGPT is an amazing achievement but it won't...,good,1,Zouk Partners PeTA India Unveils Vegan Creator...
2482,Wow! #ChatGPT can learn some pretty nifty tric...,good,1,chatgpt amazing complaint amazing.\n\nevery ti...
...,...,...,...,...
644,Looks like the Cavemen put #chatGPT through a ...,good,1,clear chatgpt language model text synthesis ma...
4456,Bro HHHHHHHHHHHHHH\nthis #ChatGPT is just amaz...,good,1,\n\ni see experience remotely shocking chatgpt...
4295,I copied a pasted a math problem from https://...,bad,2,Elon Musk twitter completely ChatGPT getting r...
3130,chatGPT writes better code than half the profs...,neutral,0,cursory look pretty impressed chatgpt ability ...


In [32]:
df_new.to_csv(
    "processed_file.csv",
    index=False
)