In [1]:
# importing libraries

import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.corpus import wordnet

import joblib

In [2]:
# importing dataset

dataset = pd.read_csv(r"C:\Users\ANSWEB\Desktop\Major Project\Dataset\TwitterDataset.csv",
    usecols=['OriginalTweet'], encoding='latin-1')

print(dataset)

                                           OriginalTweet
0      @MeNyrbie @Phil_Gahan @Chrisitv https://t.co/i...
1      advice Talk to your neighbours family to excha...
2      Coronavirus Australia: Woolworths to give elde...
3      My food stock is not the only one which is emp...
4      Me, ready to go at supermarket during the #COV...
...                                                  ...
44950  Meanwhile In A Supermarket in Israel -- People...
44951  Did you panic buy a lot of non-perishable item...
44952  Asst Prof of Economics @cconces was on @NBCPhi...
44953  Gov need to do somethings instead of biar je r...
44954  I and @ForestandPaper members are committed to...

[44955 rows x 1 columns]


In [3]:
# defining tokenization function

stop_words = set(stopwords.words('english'))
# punctuations without '@' and '#'
punctuation_pattern = '[!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~]'

def text_clean(x) :
    # remove newline and carriage return
    x = x.replace('\n', ' ').replace('\r', '')
    # remove the links
    x = nltk.re.sub(r'http:\S+|https:\S+|www\S+', '', x)
    # remove punctuation marks and emoticons 
    x = nltk.re.sub(punctuation_pattern, '', x)
    words = x.split(' ')
    temp = []
    
    for word in words :
        word=word.lower()
        if (word not in stop_words) and (not word[1:].isnumeric()) and (not word.startswith('@')):
            if word.startswith('#'):
                temp.append(word[1:])
            else:
                temp.append(word)
                
    pattern = '[!"$%&\'()*+,-./:;<=>?[\\]^_`{|}~@#\$]'
    cleaned_sentence = nltk.re.sub(pattern, '', ' '.join(temp))
    
    return cleaned_sentence

In [4]:
# applying tokenization on dataset

dataset['CleanTweet'] = dataset['OriginalTweet'].apply(lambda x : text_clean(str(x)))

In [5]:
# displaying stop words

print(stop_words)

{'she', "you'll", 'theirs', 'against', 'below', 'on', "hadn't", "mightn't", 'some', 'now', 'y', 'what', 'about', "weren't", 'themselves', 'ours', 'through', 'your', 'i', 'where', 'will', 'how', 'up', 'off', 'you', 'ma', 'who', 'after', 'not', 'have', 'just', 'had', 's', "couldn't", 'same', 'why', 'once', 'ain', 'did', 've', "you've", 'because', 'hasn', 'of', 'shouldn', 'having', 'before', 'in', 'over', 'when', 'didn', 'then', 'by', 'down', 'itself', 'has', 'o', "that'll", 'they', 'their', 'if', "didn't", 'mustn', 'until', 'which', 'few', 'am', "haven't", "shan't", 'as', 'll', 'my', 'more', "mustn't", 'been', 'and', 'them', 'wouldn', 'most', 'wasn', 'haven', 'needn', 'me', 'being', 'those', 'he', 'him', 'mightn', 'do', 'into', 'himself', 'but', "doesn't", 'very', 'such', 'so', 'that', 'above', 'hadn', 'the', 'only', 'aren', 'whom', 'shan', 'weren', 'it', "aren't", 't', 'is', 'are', 'at', 'under', 'both', "isn't", 'again', 'during', 'were', 'further', 'own', 'here', 'can', "it's", 'to', 

In [6]:
# displaying tokenized dataset

dataset['CleanTweet']

0                                                         
1        advice talk neighbours family exchange phone n...
2        coronavirus australia woolworths give elderly ...
3        food stock one empty  please dont panic enough...
4        ready go supermarket covid19 outbreak  im para...
                               ...                        
44950    meanwhile supermarket israel  people dance sin...
44951    panic buy lot nonperishable items echo needs f...
44952    asst prof economics talking recent research co...
44953    gov need somethings instead biar je rakyat ass...
44954    members committed safety employees endusers mo...
Name: CleanTweet, Length: 44955, dtype: object

In [7]:
# defining the lemmatization function

lemmatizer = WordNetLemmatizer()

def lemm(text):

    # tokenizing text into words
    tokens = nltk.word_tokenize(text)

    # lemmatizing each word
    lemmatized_words = []
    for token in tokens:
        # getting the part-of-speech (POS)
        pos_tag = nltk.pos_tag([token])[0][1]
        pos = wordnet.VERB if pos_tag.startswith('V') else wordnet.NOUN if pos_tag.startswith('N') else wordnet.ADJ if pos_tag.startswith('J') else wordnet.ADV
        lemma = lemmatizer.lemmatize(token, pos=pos)
        lemmatized_words.append(lemma)

    # joining lemmatized words back into a sentence
    lemmatized_text = ' '.join(lemmatized_words)

    return lemmatized_text

In [8]:
# applying lemmatization on dataset

dataset['Lemmatized']=dataset.CleanTweet.apply(lambda x : lemm(str(x)))

In [9]:
# displaying lemmatized dataset

dataset['Lemmatized']

0                                                         
1        advice talk neighbour family exchange phone nu...
2        coronavirus australia woolworth give elderly d...
3        food stock one empty please dont panic enough ...
4        ready go supermarket covid19 outbreak im paran...
                               ...                        
44950    meanwhile supermarket israel people dance sing...
44951    panic buy lot nonperishable item echo need foo...
44952    asst prof economics talk recent research coron...
44953    gov need somethings instead biar je rakyat ass...
44954    member commit safety employee endusers monitor...
Name: Lemmatized, Length: 44955, dtype: object

In [10]:
# saving lemmatized dataset

dataset.to_csv('LemmatizedData.csv')

In [11]:
# defining vectorization function 

from sklearn.feature_extraction.text import TfidfVectorizer
vectorizer = TfidfVectorizer(max_features=4999)

In [12]:
# applying vectorization

vectors = vectorizer.fit_transform(dataset["Lemmatized"])
feature_names = vectorizer.get_feature_names_out()
dense = vectors.todense()
denselist = dense.tolist()
TfIdf = pd.DataFrame(denselist, columns=feature_names)

In [13]:
# saving the tfidf model

joblib.dump(vectorizer, 'TFIDF.sav')

In [14]:
# saving vectorized dataset

TfIdf.to_csv('TFIDFData.csv', index=False)