# Loading Dataset

In [2]:
import pandas as pd

In [3]:
# Read the raw tweets/labels CSV from the local dataset folder and preview the first rows.
df = pd.read_csv("./dataset_folder/archive/file.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,tweets,labels
0,0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,1,"Try talking with ChatGPT, our new AI system wh...",good
2,2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,3,"THRILLED to share that ChatGPT, our new model ...",good
4,4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Drop Unnecessary columns

In [4]:
# 'Unnamed: 0' only duplicates the row index, so it is removed before any analysis.
df = df.drop("Unnamed: 0", axis="columns")
df.head()

Unnamed: 0,tweets,labels
0,ChatGPT: Optimizing Language Models for Dialog...,neutral
1,"Try talking with ChatGPT, our new AI system wh...",good
2,ChatGPT: Optimizing Language Models for Dialog...,neutral
3,"THRILLED to share that ChatGPT, our new model ...",good
4,"As of 2 minutes ago, @OpenAI released their ne...",bad


## Data-wrangling
- Clearing __null value__ records
- Deleting __duplicate__ records

In [5]:
# Number of missing values in each column.
df.isna().sum()

tweets    0
labels    0
dtype: int64

In [6]:
# Number of rows that are exact duplicates of another row.
df.duplicated().sum()

1671

In [7]:
# Remove the duplicate rows counted above; the original row index labels are kept as-is.
df = df.drop_duplicates()

## Creating a sub dataframe for testing purpose
Reason: the main dataframe is too large. If results on this sub-dataframe are good, there is no need to use the full dataframe.
- taking 5000 samples from each category
- Creates a sample that has **15000** total tweets

### label=good dataframe

In [31]:
# Fixed-seed draw of 5000 distinct "good" tweets, re-indexed from 0.
df_good = (
    df.loc[df["labels"] == "good"]
    .sample(n=5000, random_state=0, replace=False, ignore_index=True)
)
df_good

Unnamed: 0,tweets,labels
0,#artificialintelligence #ChatGPT Amazing but F...,good
1,"Even though I know it's not the first, and pro...",good
2,"Fast Company: The internet loves ChatGPT, but ...",good
3,RT @jameselder@tweetrex.co.uk\nOh my God.\n\nI...,good
4,"I may be late to the party, but I am seriously...",good
...,...,...
4995,Happy New Year! May all my wildest dreams come...,good
4996,Holymoly.... ChatGPT has gone fast with replie...,good
4997,dear chatGPT please write me a script that mak...,good
4998,I’ve spent more hours this week talking to an ...,good


### label=bad dataframe

In [32]:
# Fixed-seed draw of 5000 distinct "bad" tweets, re-indexed from 0.
df_bad = (
    df.loc[df["labels"] == "bad"]
    .sample(n=5000, random_state=0, replace=False, ignore_index=True)
)
df_bad

Unnamed: 0,tweets,labels
0,Deconstructing ChatGPT on the Future of Contin...,bad
1,Has anyone ported Doom to ChatGPT yet?,bad
2,Yet #ChatGPT has given me multiple incorrect d...,bad
3,Generative AI is progressing furiously—and edu...,bad
4,⚗️ChatGPT Most Extraordinary Tweets\n#ChatGPT ...,bad
...,...,...
4995,Building an interpreter for my programming lan...,bad
4996,I asked ChatGPT to draft a letter to me compla...,bad
4997,YouTube Summary with ChatGPT / Glasp #Startup ...,bad
4998,#ChatGPT is CHADGPT for me. \n\nIt will be my ...,bad


### label=neutral dataframe

In [33]:
# Fixed-seed draw of 5000 distinct "neutral" tweets, re-indexed from 0.
df_neutral = (
    df.loc[df["labels"] == "neutral"]
    .sample(n=5000, random_state=0, replace=False, ignore_index=True)
)
df_neutral

Unnamed: 0,tweets,labels
0,We're going to look back years from now and se...,neutral
1,Prompt-engineered artwork via stable-diffusion...,neutral
2,Google Introduces ChatGPT-like ChatBot for Hea...,neutral
3,The Brilliance and Weirdness of ChatGPT\n\n#Op...,neutral
4,Another interesting result from ChatGPT: https...,neutral
...,...,...
4995,Interesting: it seems ChatGPT has been neutere...,neutral
4996,"A lot of people in tech, and most people outsi...",neutral
4997,"Okay sure ChatGPT can *code*, but can it atten...",neutral
4998,"Q: ""How many genders are there?""\n\n#ChatGPT: ...",neutral


### Concatenating the 3 (good, bad, neutral) dataframes

In [34]:
# Stack the three per-label samples, then shuffle every row (frac=1) with a fixed
# seed so the labels are interleaved; the index is rebuilt as 0..N-1.
df_new = (
    pd.concat([df_good, df_bad, df_neutral])
    .sample(frac=1, random_state=0, replace=False, ignore_index=True)
)
df_new

Unnamed: 0,tweets,labels
0,My first hand experience and how I took benefi...,good
1,chatGPT by @OpenAI is awesome and scary at the...,neutral
2,ChatGPT solves (or will) the two biggest issue...,neutral
3,Massive datasets are important for LLMs like #...,good
4,ChatGPT is redpilled on the fed https://t.co/b...,bad
...,...,...
14995,new date idea: getting dinner then generating ...,neutral
14996,Fun Fact: This entire tweet was generated by C...,good
14997,During Web3 craze what was weird is how non-te...,bad
14998,ChatGpt bot is cool 😇,neutral


In [13]:
# Sanity check: count duplicate rows in the combined sample.
df_new.duplicated().sum()

0

# Defining a function for processing the text

In [27]:
import spacy
import re

# Load the spaCy 'en_core_web_sm' pipeline once; preprocess() below calls it on every tweet.
nlp = spacy.load('en_core_web_sm')


def preprocess(text):
    """Clean a tweet and reduce it to a space-separated string of lemmas.

    Steps:
      1. Remove http/https URLs.
      2. Turn line breaks into spaces and collapse every run of whitespace
         into a single space.
      3. Tokenize with the module-level spaCy pipeline ``nlp`` and drop
         punctuation, stop words and whitespace-only tokens.
      4. Join the lemmas of the remaining tokens with single spaces.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The lemmas of the kept tokens joined by single spaces
        (an empty string when nothing is kept).
    """
    text_without_websites = re.sub(r'https?://\S+', '', text)

    # Keep the original handling of the literal two-character sequence
    # backslash + "n" (line breaks stored in escaped form) ...
    text_without_line_breaks = text_without_websites.replace('\\n', ' ')
    # ... and additionally normalize real newlines, tabs and repeated spaces.
    # A single str.replace("  ", " ") pass leaves runs of 3+ spaces partly
    # intact, which made spaCy emit whitespace tokens that leaked into the
    # output as stretches of consecutive spaces.
    text = re.sub(r'\s+', ' ', text_without_line_breaks).strip()

    filtered_tokens = [
        token.lemma_
        for token in nlp(text)
        # is_space guards against any whitespace-only token the tokenizer still yields.
        if not (token.is_punct or token.is_stop or token.is_space)
    ]

    return ' '.join(filtered_tokens)

In [35]:
# Encode the sentiment label as an integer class id: neutral=0, good=1, bad=2.
df_new["num_labels"] = df_new["labels"].map(dict(neutral=0, good=1, bad=2))
df_new.head()

Unnamed: 0,tweets,labels,num_labels
0,My first hand experience and how I took benefi...,good,1
1,chatGPT by @OpenAI is awesome and scary at the...,neutral,0
2,ChatGPT solves (or will) the two biggest issue...,neutral,0
3,Massive datasets are important for LLMs like #...,good,1
4,ChatGPT is redpilled on the fed https://t.co/b...,bad,2


In [38]:
# Run preprocess() on every tweet and store the result in a new column.
df_new["processed_tweets"] = df_new["tweets"].apply(preprocess)
df_new

Unnamed: 0,tweets,labels,num_labels,processed_tweets
0,My first hand experience and how I took benefi...,good,1,hand experience take benefit AI chatgpt3 folk ...
1,chatGPT by @OpenAI is awesome and scary at the...,neutral,0,chatgpt @openai awesome scary time tool crea...
2,ChatGPT solves (or will) the two biggest issue...,neutral,0,ChatGPT solve big issue Stack Overflow 1 begin...
3,Massive datasets are important for LLMs like #...,good,1,massive dataset important llm like chatgpt get...
4,ChatGPT is redpilled on the fed https://t.co/b...,bad,2,ChatGPT redpille fed
...,...,...,...,...
14995,new date idea: getting dinner then generating ...,neutral,0,new date idea get dinner generate romantic sto...
14996,Fun Fact: This entire tweet was generated by C...,good,1,Fun fact entire tweet generate ChatGPT 😂
14997,During Web3 craze what was weird is how non-te...,bad,2,Web3 craze weird non technical people explain ...
14998,ChatGpt bot is cool 😇,neutral,0,ChatGpt bot cool 😇


In [39]:
# The raw text and the string label are dropped, leaving num_labels and processed_tweets.
df_new = df_new.drop(["labels", "tweets"], axis="columns")
df_new.head()

Unnamed: 0,num_labels,processed_tweets
0,1,hand experience take benefit AI chatgpt3 folk ...
1,0,chatgpt @openai awesome scary time tool crea...
2,0,ChatGPT solve big issue Stack Overflow 1 begin...
3,1,massive dataset important llm like chatgpt get...
4,2,ChatGPT redpille fed


In [41]:
# Write the processed frame to disk without the row index column.
df_new.to_csv("processed_file.csv", index=False)