# Loading Dataset

In [1]:
import pandas as pd

In [2]:
df = pd.read_csv("./dataset_folder/archive/file.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Drop Unnecessary columns

In [3]:
df = df.drop(columns=['Unnamed: 0'])
df.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Data-wrangling
- Clearing __null value__ records
- Deleting __duplicate__ records

In [4]:
df.isna().sum()

tweets    0
labels    0
dtype: int64

In [5]:
df.duplicated().sum()

1671

In [6]:
df = df.drop_duplicates()

## Creating a sub dataframe for testing purpose
Reason: main dataframe is too large. If this sub dataframe performs great, no need for full dataframe
- taking 5000 samples from each category
- Creates a sample that has **15000** total tweets

### label=good dataframe

In [7]:
df_good = df[df.labels == "good"].sample(
    n=20000,
    replace=False,
    ignore_index=True,
    random_state=0
)
df_good

Unnamed: 0,tweets,labels
0,#artificialintelligence #ChatGPT Amazing but F...,good
1,"Even though I know it's not the first, and pro...",good
2,"Fast Company: The internet loves ChatGPT, but ...",good
3,RT @jameselder@tweetrex.co.uk\nOh my God.\n\nI...,good
4,"I may be late to the party, but I am seriously...",good
...,...,...
19995,Having fun with #ChatGPT on my AI-related prof...,good
19996,A little late to the party but ChatGPT is great.,good
19997,My daily experience of AI is the chatbots on w...,good
19998,Having a great discussion with ChatGPT today a...,good


### label=bad dataframe

In [8]:
df_bad = df[df.labels == "bad"].sample(
    n=20000,
    replace=False,
    ignore_index=True,
    random_state=0
)
df_bad

Unnamed: 0,tweets,labels
0,Deconstructing ChatGPT on the Future of Contin...,bad
1,Has anyone ported Doom to ChatGPT yet?,bad
2,Yet #ChatGPT has given me multiple incorrect d...,bad
3,Generative AI is progressing furiously—and edu...,bad
4,⚗️ChatGPT Most Extraordinary Tweets\n#ChatGPT ...,bad
...,...,...
19995,I’m going to try to get ChatGPT to write my ne...,bad
19996,ChatGPT: Everything you need to know about cha...,bad
19997,My feed is 80% ChatGPT. What else is going on?,bad
19998,Article summary: https://t.co/XQPMPEVqn9 (I'm ...,bad


### label=neutral dataframe

In [9]:
df_neutral = df[df.labels == "neutral"].sample(
    n=20000,
    replace=False,
    ignore_index=True,
    random_state=0
)
df_neutral

Unnamed: 0,tweets,labels
0,We're going to look back years from now and se...,neutral
1,Prompt-engineered artwork via stable-diffusion...,neutral
2,Google Introduces ChatGPT-like ChatBot for Hea...,neutral
3,The Brilliance and Weirdness of ChatGPT\n\n#Op...,neutral
4,Another interesting result from ChatGPT: https...,neutral
...,...,...
19995,"Thank you ChatGPT, for the ETL Tool 🤣🤣 https:/...",neutral
19996,The Brilliance and Weirdness of ChatGPT – A ne...,neutral
19997,ChatGPT tells DALL.E to create image. https://...,neutral
19998,Welcome to our team Gregory Mitchell 💡\nhttps:...,neutral


### Concatenating the 3 (good, bad, neutral) datframes

In [10]:
df_new = pd.concat([df_good, df_bad, df_neutral]).sample(
    frac=1,
    replace=False,
    random_state=0,
    ignore_index=True
)
df_new

Unnamed: 0,tweets,labels
0,"Alright, the ChatGPT AI is pretty neat @NoCont...",good
1,Ok I'm definitely going to be replaced soon. B...,good
2,I asked the ChatGPT if it could explain 4 famo...,neutral
3,ChatGPT Is Too Popular for Its Own Good https:...,good
4,I wrote an IDA plugin that queries #ChatGPT an...,neutral
...,...,...
59995,"thanks chatgpt, it will be `burn20(uint256 id,...",neutral
59996,&lt;&lt; ...Explain your error message with Ch...,neutral
59997,Everyone is talking about #chatgpt3 and its ma...,neutral
59998,"I spent the weekend playing with ChatGPT, MidJ...",neutral


In [11]:
df_new.duplicated().sum()

0

# Defining A function for processing the text

In [12]:
import spacy
import re

nlp = spacy.load('en_core_web_sm')


def preprocess(text):
    
    text_without_websites = re.sub(r'https?://\S+', '', text)
    text_without_line_breaks = text_without_websites.replace(r'\n', ' ')
    text = text_without_line_breaks.replace("  ", " ").strip()
    
    filtered_tokens = []
    
    for token in nlp(text):
        if token.is_punct or token.is_stop:
            continue
        filtered_tokens.append(token.lemma_)
        
    return ' '.join(filtered_tokens)

In [13]:
df_new['num_labels'] = df_new.labels.map({
    "neutral":0,
    "good":1,
    "bad":2
})
df_new.head()

Unnamed: 0,tweets,labels,num_labels
0,"Alright, the ChatGPT AI is pretty neat @NoCont...",good,1
1,Ok I'm definitely going to be replaced soon. B...,good,1
2,I asked the ChatGPT if it could explain 4 famo...,neutral,0
3,ChatGPT Is Too Popular for Its Own Good https:...,good,1
4,I wrote an IDA plugin that queries #ChatGPT an...,neutral,0


In [14]:
df_new['processed_tweets'] = df_new.tweets.apply(preprocess)
df_new

Unnamed: 0,tweets,labels,num_labels,processed_tweets
0,"Alright, the ChatGPT AI is pretty neat @NoCont...",good,1,alright ChatGPT AI pretty neat @nocontextvarg
1,Ok I'm definitely going to be replaced soon. B...,good,1,ok definitely go replace soon importantly chat...
2,I asked the ChatGPT if it could explain 4 famo...,neutral,0,ask chatgpt explain 4 famous chess quote Timma...
3,ChatGPT Is Too Popular for Its Own Good https:...,good,1,chatgpt popular good
4,I wrote an IDA plugin that queries #ChatGPT an...,neutral,0,write ida plugin query chatgpt explain decompi...
...,...,...,...,...
59995,"thanks chatgpt, it will be `burn20(uint256 id,...",neutral,0,thank chatgpt ` burn20(uint256 id owner addres...
59996,&lt;&lt; ...Explain your error message with Ch...,neutral,0,lt;&lt explain error message ChatGPT gt;&gt 🤦 ...
59997,Everyone is talking about #chatgpt3 and its ma...,neutral,0,talk chatgpt3 massive application see talk ben...
59998,"I spent the weekend playing with ChatGPT, MidJ...",neutral,0,spend weekend play ChatGPT MidJourney AI tool ...


In [15]:
df_new = df_new.drop(columns=['labels', 'tweets'])
df_new.head()

Unnamed: 0,num_labels,processed_tweets
0,1,alright ChatGPT AI pretty neat @nocontextvarg
1,1,ok definitely go replace soon importantly chat...
2,0,ask chatgpt explain 4 famous chess quote Timma...
3,1,chatgpt popular good
4,0,write ida plugin query chatgpt explain decompi...


In [17]:
df_new.to_csv(
    "processed_file.csv",
    index=False
)