# Data preprocessing 

In [5]:
import re
import numpy as np
import pandas as pd

# remove english stopwords with the help of the gensim library 
from gensim.parsing.preprocessing import remove_stopwords

import string

from nltk.stem.porter import PorterStemmer
from nltk.stem import WordNetLemmatizer

## All the methods that will be used to preprocess

In [6]:
english_punctuations = string.punctuation
punctuations_list = english_punctuations

def cleaning_punctuations(text):
    translator = str.maketrans('', '', punctuations_list)
    return text.translate(translator)

In [7]:
# cleaning and removing URLs

def cleaning_URLs(data):
    return re.sub('((www.[^s]+)|(https?://[^s]+))',' ',data)

def remove_hyperlink(word):
    return re.sub(r"http\S+", "", word)

In [8]:
# cleaning and removing mentions 

def remove_mentions(word):
    return re.sub(r"@\S+", "", word)

In [9]:
# removing emojis 
emoji_pattern = re.compile("["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags=re.UNICODE)

In [10]:
# cleaning and removing repeating characters

def cleaning_repeating_char(text):
    return re.sub(r'(.)1+', r'1', text)

In [11]:
stemmer = PorterStemmer()

def stem_words(text):
    return " ".join([stemmer.stem(word) for word in text])

In [12]:
lemmatizer = WordNetLemmatizer()

def lematize_words(text):
    return " ".join([lemmatizer.lemmatize(word) for word in text])

## Defining the function for preprocessing the data

In [9]:
import string

def preprocessing(df):
    # lower cases 
    df = df.str.lower()
    print("Text of tweets transformed to lower cases.")
    
    # remove english stopwords with the help of the gensim library 
    df = df.apply(lambda text: remove_stopwords(text))
    print("Removed stopwords from the text of tweets.")
    
    #cleaning and removing punctuation
    df = df.apply(lambda x: cleaning_punctuations(x))
    print("Cleaned and removed the punctuations.")
    
    # cleaning and removing URLs
    df = df.apply(lambda x: cleaning_URLs(x))
    df = df.apply(lambda x: remove_hyperlink(x))
    print("Cleaned and removed the URLs and hyperlinks.")
    
    # cleaning and removing mentions 
    df = df.apply(lambda x: remove_mentions(x))
    print("Cleaned and removed the mentions.")
    
    # removing emojis 
    df = df.apply(lambda x: emoji_pattern.sub(r'', x))
    print("Removed the emojis.")
    
    # remove numbers 
    df = df.apply(lambda x: re.sub("[^a-zA-Z]", " ", x))
    print("Removed the numbers.")
    
    # cleaning and removing repeating characters
    df = df.apply(lambda x: cleaning_repeating_char(x))
    print("Cleaned and removed the repeating characters.")
    
    # remove short words that are of no use 
    df = df.apply(lambda x: ' '.join([w for w in x.split() if len(w)>2]))
    print("Removed the short words (length less than 3).")
    
    # Text normalization : tokenize the tweets 
    # split tweets of text into tokens (individual words of terms)
    tokenize_tweets = df.apply(lambda x: x.split())
    print("Tokenized the text.")
    
    # Normalization: use of PorterStemmer()
    tokenize_tweets = tokenize_tweets.apply(lambda text: lematize_words(text))
    print("Normalized the text.")
    
    # write new csv file containing the preprocessed dataset
    #df.to_csv(path)
    
    return tokenize_tweets

### Preprocessing of the whole dataset 

In [1]:
import pandas as pd

df = pd.read_csv('datasets/chatgpt.csv', engine='python')

In [3]:
len(df)

400775

In [4]:
# check if there is any NaN value

df.isnull().values.any()
df.isnull().sum()

user_name               13
text                    42
user_location       112641
user_description     25950
user_created            60
user_followers          60
user_friends            60
user_favourites         60
user_verified           67
date                    68
hashtags             90810
source                 106
dtype: int64

In [13]:
df.columns

Index(['user_name', 'text', 'user_location', 'user_description',
       'user_created', 'user_followers', 'user_friends', 'user_favourites',
       'user_verified', 'date', 'hashtags', 'source'],
      dtype='object')

In [14]:
df = df[['user_name', 'text', 'user_description', 'date']]

In [15]:
df.isnull().values.any()
df.isnull().sum()

user_name              13
text                   42
user_description    25950
date                   68
dtype: int64

In [16]:
df = df.dropna()

In [17]:
df.isnull().values.any()
df.isnull().sum()

user_name           0
text                0
user_description    0
date                0
dtype: int64

In [18]:
len(df)

374777

In [18]:
df.columns

Index(['user_name', 'text', 'user_description', 'date'], dtype='object')

In [19]:
user = df['user_name']

In [20]:
description = df['user_description']
text = df['text']

In [21]:
# check if there is any NaN value
description.isnull().values.any()
description.isnull().sum()

0

In [22]:
user.isnull().values.any()
user.isnull().sum()

0

In [23]:
# check if there is any NaN value
text.isnull().values.any()
text.isnull().sum()

0

In [24]:
clean_description = preprocessing(description)
clean_text = preprocessing(text)
clean_user = preprocessing(user)

Text of tweets transformed to lower cases.
Removed stopwords from the text of tweets.
Cleaned and removed the punctuations.
Cleaned and removed the URLs and hyperlinks.
Cleaned and removed the mentions.
Removed the emojis.
Removed the numbers.
Cleaned and removed the repeating characters.
Removed the short words (length less than 3).
Tokenized the text.
Normalized the text.
Text of tweets transformed to lower cases.
Removed stopwords from the text of tweets.
Cleaned and removed the punctuations.
Cleaned and removed the URLs and hyperlinks.
Cleaned and removed the mentions.
Removed the emojis.
Removed the numbers.
Cleaned and removed the repeating characters.
Removed the short words (length less than 3).
Tokenized the text.
Normalized the text.
Text of tweets transformed to lower cases.
Removed stopwords from the text of tweets.
Cleaned and removed the punctuations.
Cleaned and removed the URLs and hyperlinks.
Cleaned and removed the mentions.
Removed the emojis.
Removed the numbers.
Cl

In [25]:
date = df['date']

In [26]:
clean_description

0         building draftmateai developer author chatgpt ...
1         nomadic marketing vigilante graphic web design...
3         leading translation management platform locali...
4         engineering leader coach mentor building remot...
5         stop learning research develop uild future eyo...
                                ...                        
400769    brain meant processing million tweet post vide...
400770    blockchain enthusiast philanthropist slave jav...
400772    mathematician developer amazon previously geob...
400773      passionate nature software developer profession
400774    postdoc gipplab unigoettingen phd unikonstanz ...
Name: user_description, Length: 374777, dtype: object

In [27]:
type(clean_description)

pandas.core.series.Series

In [28]:
clean_description.values

array(['building draftmateai developer author chatgpt starter kit laravel',
       'nomadic marketing vigilante graphic web designer coo kapturall',
       'leading translation management platform localize content device platform',
       ..., 'mathematician developer amazon previously geoblink',
       'passionate nature software developer profession',
       'postdoc gipplab unigoettingen phd unikonstanz previously ucberkeley jouhouken uniwuppertal interest nlp datascience'],
      dtype=object)

In [30]:
date.values

array(['2023-04-26 13:02:16+00:00', '2023-04-26 13:02:06+00:00',
       '2023-04-26 13:02:05+00:00', ..., '2022-12-05 17:09:04+00:00',
       '2022-12-05 17:08:44+00:00', '2022-12-05 17:08:20+00:00'],
      dtype=object)

In [31]:
clean_user

0                bilal haidar
1              josh arrington
3                   smartling
4                     richard
5                    lockroca
                 ...         
400769                   nenu
400770               iamtmoyo
400772    gabriel furstenheim
400773                 devang
400774        norman meuschke
Name: user_name, Length: 374777, dtype: object

In [29]:
text

0         I'm happy to share with you that @draftmateai ...
1         🤖 - Integrate ChatGPT into your Marketo Smart ...
3         We’re pleased to announce new patent-pending t...
4         Starting to tire of being scolded by ChatGPT f...
5         @SpartaJustice It's notable that wikipedia ent...
                                ...                        
400769    ChatGPT is the biggest, smartest brain 🧠 in th...
400770    Levels🙏🙏🙏,so happy for the chatGPT team for co...
400772    Russel vs ChatGPT. It's also funny that it tak...
400773    Was just wondering is there any difference bet...
400774    #ChatGPT and similar #LLM pose a challenge to ...
Name: text, Length: 374777, dtype: object

In [30]:
text.values

array(["I'm happy to share with you that @draftmateai now supports a 5-day free trial period to try out the application before you subscribe!\n\nCheck it here: https://t.co/xzXrnE5ATY. \n\n#ChatGPT #AI #Laravel",
       '🤖 - Integrate ChatGPT into your Marketo Smart Campaigns! Discover the power of AI-driven marketing in our upcoming webinar on May 4th at 4 PM CEST. Save your spot today -\xa0https://t.co/cjCFZ8rH1U\n\n#AI\xa0#Marketo\xa0#ChatGPT\xa0#OpenAI https://t.co/YA91h1K8cq',
       "We’re pleased to announce new patent-pending technology that will bring human-quality machine translation closer to reality.\n\nThis new AI-powered process enables the use of #LLMs (GPT-4, #ChatGPT) to follow companys' style guides and maintain brand voice.\n\nhttps://t.co/gnKqAch4Yy",
       ...,
       "Russel vs ChatGPT. It's also funny that it takes a long time to answer\n#ChatGPT https://t.co/2YkR0fRVCT",
       'Was just wondering is there any difference between Jasper and ChatGPT / GPT3? #Chat

In [31]:
dataframe = {'user_name': clean_user.values, 'user_description': clean_description.values, 'text': text.values, 'clean_text': clean_text.values, 'date': date.values}

In [32]:
result = pd.DataFrame(dataframe)

In [33]:
result.to_csv('datasets/preprocessed_chatgpt.csv')