# Data preprocessing 

In [26]:
import re
import numpy as np
import pandas as pd

# remove english stopwords with the help of the gensim library 
from gensim.parsing.preprocessing import remove_stopwords

import string

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

## All the methods that will be used to preprocess

In [2]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations

def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [3]:
# cleaning and removing URLs

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)

def remove_hyperlink(word):
    return re.sub(r"http\S+", "", word)

In [4]:
# cleaning and removing mentions 

def remove_mentions(word):
    return re.sub(r"@\S+", "", word)

In [5]:
# removing emojis 
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [6]:
# cleaning and removing repeating characters

def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)

In [7]:
stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text])

In [28]:
lemmatizer = WordNetLemmatizer()

def lematize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text])

## Example of data preprocessing

In [166]:
# loading the dataset 
# english language dataset 

df = pd.read_csv('datasets/sentiment.csv', usecols=['tweets','labels'])

In [167]:
df.sample(5)

Unnamed: 0,tweets,labels
174246,This is 5 Script Idea for webseries by #ChatGP...,good
13084,#ChatGPT describes the interior life of a corg...,bad
197369,Why check #crypto prices on other websites whe...,bad
35632,"I asked #ChatGPT this question: \n\n""Will AI ...",bad
53288,Ripple CTO shuts down ChatGPT's XRP conspiracy...,bad


In [168]:
df.labels.value_counts() 

bad        107796
good        56011
neutral     55487
Name: labels, dtype: int64

In [169]:
# change the labeling: neutral = 0, positive = 1, negative = -1

df.labels = df.labels.replace('bad', -1)
df.labels = df.labels.replace('good', 1)
df.labels = df.labels.replace('neutral', 0)

In [170]:
df.sample(5)

Unnamed: 0,tweets,labels
90147,ChatGPT is out here serving up real human dram...,-1
115503,Even a robot knows this one 🤖 #chatGPT #person...,-1
24658,OpenAI ChatGPT is anti national..\nI asked it ...,-1
191774,"Delighted to hear this news, but it is frustra...",0
82536,I asked #chatGPT to automate editorial decisio...,0


In [171]:
df.labels.value_counts()

-1    107796
 1     56011
 0     55487
Name: labels, dtype: int64

In [172]:
# we have too much negative tweets compared to the other classes
# we divide the set of negative tweets by 2

mask = (df.labels == -1)
idx, = np.where(mask)
df = df.drop(df.index[idx[:len(idx)//2]])

In [173]:
df.labels.value_counts() 

 1    56011
 0    55487
-1    53898
Name: labels, dtype: int64

In [5]:
# lower cases 
df.tweets = df.tweets.str.lower()
df.tweets.head()

AttributeError: 'DataFrame' object has no attribute 'str'

In [None]:
df.tweets = df.tweets.apply(lambda text: remove_stopwords(text))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: cleaning_punctuations(x))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: cleaning_URLs(x))
df.tweets = df.tweets.apply(lambda x: remove_hyperlink(x))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: remove_mentions(x))
df.tweets.head()

In [None]:
df.tweets = df.tweets.apply(lambda x: emoji_pattern.sub(r'', x))
df.tweets.head()

In [183]:
# remove numbers 
df.tweets = df.tweets.apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
df.tweets.head()

0    chatgpt optimizing language models dialogue  o...
1    try talking chatgpt new ai optimized dialogue ...
2    chatgpt optimizing language models dialogue  a...
3    thrilled share chatgpt new model optimized dia...
5    launched chatgpt new ai optimized dialogue  here 
Name: tweets, dtype: object

In [None]:
df.tweets = df.tweets.apply(lambda x: cleaning_repeating_char(x))
df.tweets.head()

In [185]:
# remove short words that are of no use 
df.tweets = df.tweets.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
df.tweets.head()

0    chatgpt optimizing language models dialogue op...
1    try talking chatgpt new optimized dialogue fee...
2    chatgpt optimizing language models dialogue ma...
3    thrilled share chatgpt new model optimized dia...
5         launched chatgpt new optimized dialogue here
Name: tweets, dtype: object

In [186]:
# Text normalization : tokenize the tweets 
# split tweets of text into tokens (individual words of terms)
tokenize_tweets = df.tweets.apply(lambda x: x.split())
tokenize_tweets.head()

0    [chatgpt, optimizing, language, models, dialog...
1    [try, talking, chatgpt, new, optimized, dialog...
2    [chatgpt, optimizing, language, models, dialog...
3    [thrilled, share, chatgpt, new, model, optimiz...
5    [launched, chatgpt, new, optimized, dialogue, ...
Name: tweets, dtype: object

In [None]:
tokenize_tweets = tokenize_tweets.apply(lambda text: stem_words(text))
tokenize_tweets.head()

In [191]:
# check if there is any NaN values 
df.tweets = tokenize_tweets 

df.tweets.isnull().values.any()
df.tweets.isnull().sum()

0

In [192]:
# write new csv file containing the preprocessed dataset
df.to_csv('datasets/preprocessed_sentiment.csv')

## Defining the function for preprocessing the data

In [31]:
import string

def preprocessing(df, path):
    # lower cases 
    df = df.str.lower()
    print("Text of tweets transformed to lower cases.")
    
    # remove english stopwords with the help of the gensim library 
    df = df.apply(lambda text: remove_stopwords(text))
    print("Removed stopwords from the text of tweets.")
    
    #cleaning and removing punctuation
    df = df.apply(lambda x: cleaning_punctuations(x))
    print("Cleaned and removed the punctuations.")
    
    # cleaning and removing URLs
    df = df.apply(lambda x: cleaning_URLs(x))
    df = df.apply(lambda x: remove_hyperlink(x))
    print("Cleaned and removed the URLs and hyperlinks.")
    
    # cleaning and removing mentions 
    df = df.apply(lambda x: remove_mentions(x))
    print("Cleaned and removed the mentions.")
    
    # removing emojis 
    df = df.apply(lambda x: emoji_pattern.sub(r'', x))
    print("Removed the emojis.")
    
    # remove numbers 
    df = df.apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
    print("Removed the numbers.")
    
    # cleaning and removing repeating characters
    df = df.apply(lambda x: cleaning_repeating_char(x))
    print("Cleaned and removed the repeating characters.")
    
    # remove short words that are of no use 
    df = df.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    print("Removed the short words (length less than 3).")
    
    # Text normalization : tokenize the tweets 
    # split tweets of text into tokens (individual words of terms)
    tokenize_tweets = df.apply(lambda x: x.split())
    print("Tokenized the text.")
    
    # Normalization: use of PorterStemmer()
    tokenize_tweets = tokenize_tweets.apply(lambda text: lematize_words(text))
    print("Normalized the text.")
    
    # write new csv file containing the preprocessed dataset
    df.to_csv(path)

### Preprocessing the dataset of the first month of launch

In [31]:
path = 'datasets/preprocessed_chatgptfirst.csv'

df = pd.read_csv('datasets/chatgptfirst.csv')

In [32]:
df.head()

Unnamed: 0,tweet_id,created_at,like_count,quote_count,reply_count,retweet_count,tweet,country,photo_url,city,country_code
0,1598014056790622225,2022-11-30 18:00:15+00:00,2,0,0,0,ChatGPT: Optimizing Language Models for Dialog...,,,,
1,1598014522098208769,2022-11-30 18:02:06+00:00,12179,889,1130,3252,"Try talking with ChatGPT, our new AI system wh...",,,,
2,1598014741527527435,2022-11-30 18:02:58+00:00,2,0,0,1,ChatGPT: Optimizing Language Models for Dialog...,,https://pbs.twimg.com/media/Fi1J8HbWAAMv_yi.jpg,,
3,1598015493666766849,2022-11-30 18:05:58+00:00,561,8,25,66,"THRILLED to share that ChatGPT, our new model ...",,https://pbs.twimg.com/media/Fi1Km3WUYAAfzHS.jpg,,
4,1598015509420994561,2022-11-30 18:06:01+00:00,1,0,0,0,"As of 2 minutes ago, @OpenAI released their ne...",,,,


In [33]:
df = df['tweet'] # we only retrieve the text of the tweet column

In [34]:
# check if there is any NaN value
df.isnull().values.any()
df.isnull().sum()

0

In [35]:
preprocessing(df, path)

### Preprocess the two last month of chatGPT tweets

In [35]:
#path = 'datasets/preprocessed_chatgpt.csv'

df = pd.read_csv('datasets/chatgpt.csv', engine='python')

In [38]:
len(df)

400775

In [37]:
# check if there is any NaN value

df.isnull().values.any()
df.isnull().sum()

user_name               13
text                    42
user_location       112641
user_description     25950
user_created            60
user_followers          60
user_friends            60
user_favourites         60
user_verified           67
date                    68
hashtags             90810
source                 106
dtype: int64

In [39]:
df.columns

Index(['user_name', 'text', 'user_location', 'user_description',
       'user_created', 'user_followers', 'user_friends', 'user_favourites',
       'user_verified', 'date', 'hashtags', 'source'],
      dtype='object')

### Selecting sample of users as scientists

In [40]:
user = df[['user_name', 'text', 'user_description', 'user_verified', 'date']]

In [55]:
user.head(15)

Unnamed: 0,user_name,text,user_description,user_verified,date
0,Bilal Haidar,I'm happy to share with you that @draftmateai ...,Building @draftmateai ▪️ Developer & Author ▪️...,False,2023-04-26 13:02:16+00:00
1,Josh Arrington,🤖 - Integrate ChatGPT into your Marketo Smart ...,"Nomadic marketing vigilante, graphic + web des...",False,2023-04-26 13:02:06+00:00
3,Smartling,We’re pleased to announce new patent-pending t...,The leading #translation management platform t...,False,2023-04-26 13:02:05+00:00
4,Richard,Starting to tire of being scolded by ChatGPT f...,"Engineering leader, coach & mentor building re...",False,2023-04-26 13:02:00+00:00
5,ɃlockRoca ⚡️,@SpartaJustice It's notable that wikipedia ent...,"Never stop learning! Research, develop, ฿uild ...",False,2023-04-26 13:01:52+00:00
6,Susi Wallner,💬 How to utilize advancements in natural #lang...,Head of Communications @StartUsInsights 🚀 \n\...,False,2023-04-26 13:01:41+00:00
7,Jay Scott,@zerohedge #ChatGPT called it 👀 https://t.co/a...,Feed Your Mind 🤯 Nourish Your Soul 🙌 Raise You...,False,2023-04-26 13:01:24+00:00
8,Black Law Firm Marketing Agency,THE #CHATGPT A.I LOCAL SEO DIGITAL PLAYBOOK\nF...,Affordable Legal Marketing Solutions For Black...,False,2023-04-26 13:01:22+00:00
10,Coconut_Bob 🍞,2023-04-26T13:01:04+00:00\nI’m buying some \nD...,"🍁 Hungry to feel.\nAlways a story, sometimes t...",False,2023-04-26 13:01:14+00:00
11,"Benedict Nwachukwu MD, MBA",There is a huge potential for #artificialintel...,Sports Surgeon @HSpecialSurgery | Founder @bes...,False,2023-04-26 13:00:34+00:00


In [42]:
len(user)

400775

In [43]:
user = user.dropna()

In [44]:
len(user)

374777

In [46]:
user.isnull().values.any()
user.isnull().sum()

user_name           0
text                0
user_description    0
user_verified       0
date                0
dtype: int64

In [149]:
user = user.reset_index(drop=True)

In [150]:
user.head(15)

Unnamed: 0,user_name,text,user_description,user_verified,date
0,Bilal Haidar,I'm happy to share with you that @draftmateai ...,Building @draftmateai ▪️ Developer & Author ▪️...,False,2023-04-26 13:02:16+00:00
1,Josh Arrington,🤖 - Integrate ChatGPT into your Marketo Smart ...,"Nomadic marketing vigilante, graphic + web des...",False,2023-04-26 13:02:06+00:00
2,Smartling,We’re pleased to announce new patent-pending t...,The leading #translation management platform t...,False,2023-04-26 13:02:05+00:00
3,Richard,Starting to tire of being scolded by ChatGPT f...,"Engineering leader, coach & mentor building re...",False,2023-04-26 13:02:00+00:00
4,ɃlockRoca ⚡️,@SpartaJustice It's notable that wikipedia ent...,"Never stop learning! Research, develop, ฿uild ...",False,2023-04-26 13:01:52+00:00
5,Susi Wallner,💬 How to utilize advancements in natural #lang...,Head of Communications @StartUsInsights 🚀 \n\...,False,2023-04-26 13:01:41+00:00
6,Jay Scott,@zerohedge #ChatGPT called it 👀 https://t.co/a...,Feed Your Mind 🤯 Nourish Your Soul 🙌 Raise You...,False,2023-04-26 13:01:24+00:00
7,Black Law Firm Marketing Agency,THE #CHATGPT A.I LOCAL SEO DIGITAL PLAYBOOK\nF...,Affordable Legal Marketing Solutions For Black...,False,2023-04-26 13:01:22+00:00
8,Coconut_Bob 🍞,2023-04-26T13:01:04+00:00\nI’m buying some \nD...,"🍁 Hungry to feel.\nAlways a story, sometimes t...",False,2023-04-26 13:01:14+00:00
9,"Benedict Nwachukwu MD, MBA",There is a huge potential for #artificialintel...,Sports Surgeon @HSpecialSurgery | Founder @bes...,False,2023-04-26 13:00:34+00:00


In [151]:
len(user)

374777

In [152]:
sample = user.iloc[:374, :]

In [153]:
sample

Unnamed: 0,user_name,text,user_description,user_verified,date
0,Bilal Haidar,I'm happy to share with you that @draftmateai ...,Building @draftmateai ▪️ Developer & Author ▪️...,False,2023-04-26 13:02:16+00:00
1,Josh Arrington,🤖 - Integrate ChatGPT into your Marketo Smart ...,"Nomadic marketing vigilante, graphic + web des...",False,2023-04-26 13:02:06+00:00
2,Smartling,We’re pleased to announce new patent-pending t...,The leading #translation management platform t...,False,2023-04-26 13:02:05+00:00
3,Richard,Starting to tire of being scolded by ChatGPT f...,"Engineering leader, coach & mentor building re...",False,2023-04-26 13:02:00+00:00
4,ɃlockRoca ⚡️,@SpartaJustice It's notable that wikipedia ent...,"Never stop learning! Research, develop, ฿uild ...",False,2023-04-26 13:01:52+00:00
...,...,...,...,...,...
369,🟩 Dr Martin Hiesboeck,How long before someone puts #ChatGPT in a hum...,Head of Research @upholdinc • Founder @alpineb...,False,2023-04-26 11:31:54+00:00
370,THE DECODER - EVERYTHING AI,👉 #ChatGPT could threaten Google search in the...,News and updates on AI one 👉 at a time\n\nDECO...,False,2023-04-26 11:30:36+00:00
371,Bernard Wainaina,#ChatGPT I don’t think ChatGPT and other langu...,Independent Agribusiness Advisor,False,2023-04-26 11:30:36+00:00
372,"Cybersécurité, IA, Cryptos",AI #Chatbots and the #Humans Who Love Them\nht...,Top 10 Influencers #insurtech / #cybersecurity...,False,2023-04-26 11:30:23+00:00


In [66]:
# to consider a scientists 1, othw 0
label = np.zeros(length)

In [144]:
sample.tail(320)

Unnamed: 0,user_name,text,user_description,user_verified,date
56,Bitone Great,📉#Sell!📉\n🔥RSI over 85!🔥\n[#Bybit]\n👇 Recommen...,#ChatGPT (AI) powered Free Trading Signal! \nL...,False,2023-04-26 12:52:37+00:00
57,Bitone Great,📉#Sell!📉\n🔥RSI over 85!🔥\n[#Bybit]\n👇 Recommen...,#ChatGPT (AI) powered Free Trading Signal! \nL...,False,2023-04-26 12:52:28+00:00
58,Bitone Great,📉#Sell!📉\n🔥RSI over 85!🔥\n[#Bybit]\n👇 Recommen...,#ChatGPT (AI) powered Free Trading Signal! \nL...,False,2023-04-26 12:51:27+00:00
59,Bend.AI,https://t.co/WOcvr56ryv 🎉 Perfect timing to be...,Marketing Science | Attribution | Analytics | ...,False,2023-04-26 12:51:07+00:00
60,Bitone Great,📉#Sell!📉\n🔥RSI over 85!🔥\n[#Bybit]\n👇 Recommen...,#ChatGPT (AI) powered Free Trading Signal! \nL...,False,2023-04-26 12:51:06+00:00
...,...,...,...,...,...
388,🟩 Dr Martin Hiesboeck,How long before someone puts #ChatGPT in a hum...,Head of Research @upholdinc • Founder @alpineb...,False,2023-04-26 11:31:54+00:00
389,THE DECODER - EVERYTHING AI,👉 #ChatGPT could threaten Google search in the...,News and updates on AI one 👉 at a time\n\nDECO...,False,2023-04-26 11:30:36+00:00
390,Bernard Wainaina,#ChatGPT I don’t think ChatGPT and other langu...,Independent Agribusiness Advisor,False,2023-04-26 11:30:36+00:00
391,"Cybersécurité, IA, Cryptos",AI #Chatbots and the #Humans Who Love Them\nht...,Top 10 Influencers #insurtech / #cybersecurity...,False,2023-04-26 11:30:23+00:00


In [145]:
# A REFAIRE
label[4] = 1
label[11] = 1
label[18] = 1
label[25] = 1
label[27] = 1
label[278] = 1
label[186] = 1
label[352] = 1
label[23] = 1
label[274] = 1
label[188] = 1
label[185] = 1
label[110] = 1
label[282] = 1
label[298] = 1
label[72] = 1
label[139] = 1
label[79] = 1
label[364] = 1
label[261] = 1
label[182] = 1
label[241] = 1
label[360] = 1
label[207] = 1
label[231] = 1
label[294] = 1
label[388] = 1
label[364] = 1
label[352] = 1
label[301] = 1
label[298] = 1
label[290] = 1
label[274] = 1
label[244] = 1
label[219] = 1
label[213] = 1
label[146] = 1
label[99] = 1
label[83] = 1
label[79] = 1
label[72] = 1
label[62] = 1
label[63] = 1

IndexError: index 388 is out of bounds for axis 0 with size 374

### Constructing preprocessed dataset of the two last months of launching

In [15]:
temp = df['date'].sample()
temp

270392    2023-02-07 07:52:33+00:00
Name: date, dtype: object

In [16]:
start_date = '2023-02-26 12:30:18+00:00'
end_date = '2023-04-26 12:30:18+00:00'

# Select DataFrame rows between two dates
mask = (df['date'] > start_date) & (df['date'] <= end_date)
df2 = df.loc[mask]

In [17]:
df2

Unnamed: 0,user_name,text,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,hashtags,source
153,TechStory,Fake ChatGpts flood the App Store 🤥\nhttps://t...,United States,"Latest News from Tech, Crypto, EV, Space, AI, ...",2014-08-14 18:49:46+00:00,6947.0,47.0,3439,False,2023-04-26 12:30:18+00:00,"['chatgpt', 'appstore', 'apple', 'chatgptnews'...",Hootsuite Inc.
154,Neoskeptics,"""Republicans release AI-generated attack ad on...",,The Neoskeptics blog discusses public policy b...,2020-04-23 06:03:16+00:00,45.0,1.0,23,False,2023-04-26 12:30:12+00:00,,Hootsuite Inc.
155,Bitone Great,📉#Sell!📉\n🔥RSI over 85!🔥\n[#Bybit]\n👇 Recommen...,Hong Kong,#ChatGPT (AI) powered Free Trading Signal! \nL...,2022-11-21 04:42:18+00:00,1397.0,505.0,64,False,2023-04-26 12:30:07+00:00,"['Sell', 'Bybit', 'Short', 'ZRXUSDT']",rsi1
156,L.Ali,Interesting how #chatGPT kept failing this one...,,CEO/Founder @KeplerCleanNRG \n...,2022-02-27 15:12:08+00:00,11.0,31.0,13,False,2023-04-26 12:30:05+00:00,['chatGPT'],Twitter Web App
157,Joshsanger.eth 🦎🕯️,Favourite #ChatGPT #prompt for my day to day:\...,Ontario,Senior Front End Developer @ Shopify | Web3 cu...,2012-02-28 15:28:18+00:00,505.0,465.0,790,False,2023-04-26 12:30:01+00:00,"['ChatGPT', 'prompt']",Twitter for Android
...,...,...,...,...,...,...,...,...,...,...,...,...
190488,John Fuchs,"AI - The Good, the Bad and the Ugly\nThis is N...",,Yip.,2022-12-20 19:11:43+00:00,6.0,129.0,71,False,2023-02-26 14:16:09+00:00,"['chatgpt', 'Microsoft', 'Bing', 'BillGates']",Twitter Web App
190489,Keith Cook,Check out a new FREE tradingstrategy I made us...,"New Hampshire, USA",The Quadfather. husband and father of 5. i enj...,2020-02-07 15:35:24+00:00,77.0,139.0,382,False,2023-02-26 13:43:47+00:00,['chatGPT'],Twitter for Android
190490,aigirl.tez/eth,I made a new enemy today.😤😬#ChatGPT https://t....,,She combines imagination with AI🏄🏽‍♀️ - \nSFC-...,2022-11-07 07:00:28+00:00,320.0,345.0,5449,False,2023-02-26 13:35:54+00:00,['ChatGPT'],Twitter Web App
190491,ayjay 💫 freetag,what a cutie girl 🌸\n\n@DegenAI2 #altcoins #al...,Asia,not a multi || prev accts in my 📌 || ga accoun...,2023-01-29 01:46:05+00:00,46.0,424.0,872,False,2023-02-26 13:22:54+00:00,"['altcoins', 'altcoin', 'DegenAI', 'Gems', 'im...",Twitter for Android


In [22]:
df2.sample(5)

Unnamed: 0,user_name,text,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,hashtags,source
172880,videopictures,for people with #adhd using #chatgpt can be go...,"New York, NY",i make images photographically and sometimes t...,2010-07-27 10:21:28+00:00,778.0,2503.0,38267,False,2023-03-01 19:43:53+00:00,"['adhd', 'chatgpt']",Twitter for iPhone
139268,KnowledgeCollective,What is this thing called AI?\nCan we control ...,Somewhere in the cosmos,"Expanding consciousness, one thought at a time...",2022-02-01 12:06:48+00:00,50.0,184.0,126,False,2023-03-15 20:12:04+00:00,,Make (formerly Integromat)
58733,ⒹⒺⒶⒻ②⑤⑧ 🦄,Observation: Professors need not worry about #...,"Phoenix, AZ/Washington, DC","My quote: ""When I lived as a hearing person, i...",2007-01-25 02:32:15+00:00,3318.0,4989.0,198261,False,2023-04-11 14:35:56+00:00,['ChatGPT'],Twitter Web App
173738,LeviTrade,"$IEF Awaiting Buy Signal based off 37 trades, ...",,Ultra Bear - 98% Win rate. That which has no s...,2021-03-18 05:26:20+00:00,232.0,1.0,2,False,2023-03-01 18:04:28+00:00,"['trading', 'stocks', 'Options', 'chatgpt']",Zapier.com
146784,ipfconline,Microsoft will launch #ChatGPT 4 with AI video...,"Marseille, France",Digital Transformation Consulting \n#AI #Machi...,2016-03-03 23:46:02+00:00,130931.0,110726.0,171647,False,2023-03-12 21:15:04+00:00,['ChatGPT'],Twittimer


In [18]:
len(df2)

188578

In [19]:
# check if there is any NaN value
df2.isnull().values.any()
df2.isnull().sum()

user_name               1
text                    0
user_location       55543
user_description    14169
user_created            0
user_followers          0
user_friends            0
user_favourites         0
user_verified           0
date                    0
hashtags            47844
source                  0
dtype: int64

In [23]:
# write new csv file containing the preprocessed dataset
df2.to_csv('datasets/chatgpt2last.csv')

In [24]:
df = df2['text']

In [25]:
df.head()

153    Fake ChatGpts flood the App Store 🤥\nhttps://t...
154    "Republicans release AI-generated attack ad on...
155    📉#Sell!📉\n🔥RSI over 85!🔥\n[#Bybit]\n👇 Recommen...
156    Interesting how #chatGPT kept failing this one...
157    Favourite #ChatGPT #prompt for my day to day:\...
Name: text, dtype: object

In [32]:
path = 'datasets/preprocessed_chatgpt2last.csv'
preprocessing(df, path)

Text of tweets transformed to lower cases.
Removed stopwords from the text of tweets.
Cleaned and removed the punctuations.
Cleaned and removed the URLs and hyperlinks.
Cleaned and removed the mentions.
Removed the emojis.
Removed the numbers.
Cleaned and removed the repeating characters.
Removed the short words (length less than 3).
Tokenized the text.
Normalized the text.


In [33]:
df = pd.read_csv('datasets/preprocessed_chatgpt2last.csv')

In [34]:
df.head()

Unnamed: 0.1,Unnamed: 0,text
0,153,fake chatgpts flood app store chatgpt appstore...
1,154,republicans release aigenerated attack preside...
2,155,sell rsi bybit recommendation short ticker zrx...
3,156,interesting chatgpt kept failing test promptge...
4,157,favourite chatgpt prompt day day act end devel...
