# Importing Libraries

In [260]:
import pandas as pd
import math
import string
import re
import operator

# Importing our dataset and other files

In [261]:
data = pd.read_csv("spam_ham_dataset.csv")
stop_wordsF = pd.read_csv('stop-words.csv')

# DataSet Statistics

In [262]:
data.head()
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 4 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tag        5171 non-null   int64 
 1   label      5171 non-null   object
 2   text       5171 non-null   object
 3   label_num  5171 non-null   int64 
dtypes: int64(2), object(2)
memory usage: 161.7+ KB


We observe that we have 4 columns in our dataset, out of which we can drop tag which is just a unique id assigned, and also we can drop label field since we already have label_num field which represents `SPAM` and `HAM` as **1** and **0** respectively.

In [263]:
data.drop(['tag', 'label'], axis=1, inplace=True)
data.head(5000)

Unnamed: 0,text,label_num
0,Subject: enron methanol ; meter # : 988291\nth...,0
1,"Subject: hpl nom for january 9 , 2001\n( see a...",0
2,"Subject: neon retreat\nho ho ho , we ' re arou...",0
3,"Subject: photoshop , windows , office . cheap ...",1
4,Subject: re : indian springs\nthis deal is to ...,0
...,...,...
4995,"Subject: coca cola , mbna america , nascar par...",1
4996,Subject: software 75 % off downdraft\nwindows ...,1
4997,Subject: get prescri ) ption d ) rugs to your ...,1
4998,Subject: sparkasse security issue\nsehr geehrt...,1


We can now do furthur analysis of our data such as how many emails in our dataset are ham and how many emails are spam which will be helpful in applying furthur NLP techniques and later in splitting our data in test and train parts

In [264]:
ham_count = data["label_num"].value_counts()[0]
spam_count = data["label_num"].value_counts()[1]

ham_percentage = ham_count / (ham_count + spam_count) * 100
spam_percentage = spam_count / (ham_count + spam_count) * 100

print("Ham percentage:", ham_percentage)
print("Spam percentage:", spam_percentage)

Ham percentage: 71.01140978534133
Spam percentage: 28.98859021465867


As we see we have `30:70` ratio in our dataset for SPAM and HAM, which is quite a healthy dataset considering all the emails are random.

In [265]:
def count_non_chars(text):
  count = 0
  for word in text.split():
    if not re.match('[a-zA-Z\s]', word):
      count += 1
  return count

In [266]:
total_count = data["text"].apply(count_non_chars)
print(total_count.describe())

count    5171.000000
mean       79.052021
std       151.653444
min         0.000000
25%        13.000000
50%        34.000000
75%        96.000000
max      5430.000000
Name: text, dtype: float64


As we can see our dataset has a lot of characters that are not a part of english alphabet. We need to take care of this. Effective **NLP techniques** to remove this *noise* needs to be implemented

# Data Cleaning Steps

1. Removing Punctuations


In [267]:
print(string.punctuation)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [268]:
def remove_punctuation(text):
    text = text.replace('\n', ' ') # also removing newline characters while removing punctuations
    new_text = []
    for char in text:
        if char not in string.punctuation:
            new_text.append(char)
    return ''.join(new_text)

In [269]:
data["no_punctuations"] = ""
for i, row in data.iterrows():
    data.at[i, 'no_punctuations'] = remove_punctuation(row['text'])

print("Plain Text:\n", data.text[0])
print("\n")
print("After removing punctuations:\n", data.no_punctuations[0])

Plain Text:
 Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .


After removing punctuations:
 Subject enron methanol  meter   988291 this is a follow up to the note i gave you on monday  4  3  00  preliminary flow data provided by daren   please override pop  s daily volume  presently zero  to reflect daily activity you can obtain from gas control  this change is needed asap for economics purposes 


2. Converting everything to lowercase

In [270]:
def convert_lower_case(text):
    new_text = []
    for char in text:
        new_text.append(char.lower())
    return ''.join(new_text)

In [271]:
data["lower_case"] = ""
for i, row in data.iterrows():
    data.at[i, 'lower_case'] = convert_lower_case(row['no_punctuations'])

print("Plain Text:\n", data.no_punctuations[0])
print("\n")
print("After converting:\n", data.lower_case[0])

Plain Text:
 Subject enron methanol  meter   988291 this is a follow up to the note i gave you on monday  4  3  00  preliminary flow data provided by daren   please override pop  s daily volume  presently zero  to reflect daily activity you can obtain from gas control  this change is needed asap for economics purposes 


After converting:
 subject enron methanol  meter   988291 this is a follow up to the note i gave you on monday  4  3  00  preliminary flow data provided by daren   please override pop  s daily volume  presently zero  to reflect daily activity you can obtain from gas control  this change is needed asap for economics purposes 


3. Removing Numbers

In [272]:
def remove_numbers(text):
    text_without_numbers = re.sub(r'\d', '', text)
    return text_without_numbers

4. Removing Extra White Spaces

In [273]:
def remove_extra_space(text):
    text_without_spaces = re.sub(r'\s{1,}', ' ', text)
    text_without_spaces = text_without_spaces.strip()
    return text_without_spaces

In [274]:
data["new_text"] = ""
for i, row in data.iterrows():
    data.at[i, 'new_text'] = remove_numbers(row['lower_case'])
for i, row in data.iterrows():
    data.at[i, 'new_text'] = remove_extra_space(row['new_text'])
data.drop(['no_punctuations', 'lower_case'], axis=1, inplace=True)
data.head()


Unnamed: 0,text,label_num,new_text
0,Subject: enron methanol ; meter # : 988291\nth...,0,subject enron methanol meter this is a follow ...
1,"Subject: hpl nom for january 9 , 2001\n( see a...",0,subject hpl nom for january see attached file ...
2,"Subject: neon retreat\nho ho ho , we ' re arou...",0,subject neon retreat ho ho ho we re around to ...
3,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...
4,Subject: re : indian springs\nthis deal is to ...,0,subject re indian springs this deal is to book...


In [275]:
print("Plain Text:\n", data.text[0])
print("\n")
print("After converting:\n", data.new_text[0])

Plain Text:
 Subject: enron methanol ; meter # : 988291
this is a follow up to the note i gave you on monday , 4 / 3 / 00 { preliminary
flow data provided by daren } .
please override pop ' s daily volume { presently zero } to reflect daily
activity you can obtain from gas control .
this change is needed asap for economics purposes .


After converting:
 subject enron methanol meter this is a follow up to the note i gave you on monday preliminary flow data provided by daren please override pop s daily volume presently zero to reflect daily activity you can obtain from gas control this change is needed asap for economics purposes


5. Word Tokenizing the Data

In [276]:
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

In [277]:
data["tokens"] = ""
for i, row in data.iterrows():
    data.at[i, 'tokens'] = tokenize(row['new_text'])
data.head()

Unnamed: 0,text,label_num,new_text,tokens
0,Subject: enron methanol ; meter # : 988291\nth...,0,subject enron methanol meter this is a follow ...,"[subject, enron, methanol, meter, this, is, a,..."
1,"Subject: hpl nom for january 9 , 2001\n( see a...",0,subject hpl nom for january see attached file ...,"[subject, hpl, nom, for, january, see, attache..."
2,"Subject: neon retreat\nho ho ho , we ' re arou...",0,subject neon retreat ho ho ho we re around to ...,"[subject, neon, retreat, ho, ho, ho, we, re, a..."
3,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...,"[subject, photoshop, windows, office, cheap, m..."
4,Subject: re : indian springs\nthis deal is to ...,0,subject re indian springs this deal is to book...,"[subject, re, indian, springs, this, deal, is,..."


A new column tokens has a list with all the different tokens of that email extracted from new text

# Stemming

Simple implementation of stemming with custom rules

In [278]:
def stemming(text):
    stem_text = []
    stem_set_suffix = [
        ("sses", "ss"),
        ("ies", "i"),
        ("s", ""),
        ("eed", "ee"),
        ("ed", ""),
        ("ing", ""),
        ("edly", "ed"),
        ("ly", ""),
        ("ation", "ate"),
        ("izations", "ize"),
        ("izer", "ize"),
        ("ational", "ate"),
        ("ator", "ate"),
        ("ment", ""),
        ("ble", ""),
        ("ful", ""),
        ("cious", "ce"),
        ("ness", ""),
        ("i", "y"),
    ]
    stem_set_prefix = [
        ("pre-", ""), # pre-condition, pre-historic
        ("un-", ""),  # un-breakable
        ("re-", ""),  # re-use  
        ("mis-", ""), # mis-understand
    ]
    for word in text:
        for suffix, replace in stem_set_suffix:
            if (word.endswith(suffix)):
                word = word[:-len(suffix)] + replace
        for prefix, replace in stem_set_prefix:
            if (word.startswith(prefix)):
                word = replace + word[len(prefix):]
        stem_text.append(word)
    return stem_text

In [279]:
data['stemmed_words'] = ""
for i, row in data.iterrows():
    data.at[i, 'stemmed_words'] = stemming(row["tokens"])
data.head()

Unnamed: 0,text,label_num,new_text,tokens,stemmed_words
0,Subject: enron methanol ; meter # : 988291\nth...,0,subject enron methanol meter this is a follow ...,"[subject, enron, methanol, meter, this, is, a,...","[subject, enron, methanol, meter, thy, y, a, f..."
1,"Subject: hpl nom for january 9 , 2001\n( see a...",0,subject hpl nom for january see attached file ...,"[subject, hpl, nom, for, january, see, attache...","[subject, hpl, nom, for, january, see, attach,..."
2,"Subject: neon retreat\nho ho ho , we ' re arou...",0,subject neon retreat ho ho ho we re around to ...,"[subject, neon, retreat, ho, ho, ho, we, re, a...","[subject, neon, retreat, ho, ho, ho, we, re, a..."
3,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...,"[subject, photoshop, windows, office, cheap, m...","[subject, photoshop, window, office, cheap, ma..."
4,Subject: re : indian springs\nthis deal is to ...,0,subject re indian springs this deal is to book...,"[subject, re, indian, springs, this, deal, is,...","[subject, re, indian, spr, thy, deal, y, to, b..."


In [280]:
print("Data before applying stemming:", data.tokens[0])
print("\n")
print("Data after applying stemming:", data.stemmed_words[0])

Data before applying stemming: ['subject', 'enron', 'methanol', 'meter', 'this', 'is', 'a', 'follow', 'up', 'to', 'the', 'note', 'i', 'gave', 'you', 'on', 'monday', 'preliminary', 'flow', 'data', 'provided', 'by', 'daren', 'please', 'override', 'pop', 's', 'daily', 'volume', 'presently', 'zero', 'to', 'reflect', 'daily', 'activity', 'you', 'can', 'obtain', 'from', 'gas', 'control', 'this', 'change', 'is', 'needed', 'asap', 'for', 'economics', 'purposes']


Data after applying stemming: ['subject', 'enron', 'methanol', 'meter', 'thy', 'y', 'a', 'follow', 'up', 'to', 'the', 'note', 'y', 'gave', 'you', 'on', 'monday', 'preliminary', 'flow', 'data', 'provid', 'by', 'daren', 'please', 'override', 'pop', '', 'day', 'volume', 'present', 'zero', 'to', 'reflect', 'day', 'activity', 'you', 'can', 'obtain', 'from', 'ga', 'control', 'thy', 'change', 'y', 'need', 'asap', 'for', 'economic', 'purpose']


As we can clearly see, stemming has been successfully applied and words have been reduced to its basic forms. For example in the above scenario,<br>
**daily** -> *day*<br>
**provided** -> *provide*<br>
**presently** -> *present*<br>

# Lemmatization

In [281]:
lemmF = pd.read_csv('lemmatization-en.csv')
lemmF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 41646 entries, 0 to 41645
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   a       41646 non-null  object
 1   an      41646 non-null  object
dtypes: object(2)
memory usage: 650.8+ KB


We create a 2d array to act as a dictionary for lemmatization. We load a publicly available csv with over 50k values.

In [282]:
lemm_d = [] # Imported a local lemmatization list and made a dictionary for it
values = []
i = 0
for index, row in lemmF.iterrows():
    i += 1
    if i % 100 != 0:
        if i % 105 <= 5:  # Take 5 values
            str1 = row['a']
            str2 = row['an']
            found = False
            values.append(str2)
            for item in lemm_d:
                if item[0] == str1:
                    item[1].append(str2)
                    found = True
                    break

            if not found:
                lemm_d.append([str1, [str2]])
    else:
        continue

# lemm_d.append(["economic", ["economics"]])

print(lemm_d)


[['a', ['as']], ['aardvark', ['aardvarks']], ['ab', ['abs']], ['abacus', ['abaci', 'abacuses']], ['abrogate', ['abrogating']], ['abrogation', ['abrogations']], ['abrupt', ['abrupter', 'abruptest']], ['abruption', ['abruptions']], ['abscess', ['abscesses']], ['acclimatisation', ['acclimatisations']], ['acclimatise', ['acclimatised', 'acclimatises', 'acclimatising']], ['acclimatization', ['acclimatizations']], ['acclimatize', ['acclimatized']], ['acquaint', ['acquainted', 'acquainting', 'acquaints']], ['acquaintance', ['acquaintances']], ['acquaintanceship', ['acquaintanceships']], ['acquiesce', ['acquiesced']], ['adduct', ['adducting', 'adducts']], ['adductor', ['adductors']], ['ade', ['ades']], ['adenine', ['adenines']], ['adenocarcinoma', ['adenocarcinomas']], ['adsorb', ['adsorbing', 'adsorbs']], ['adsorbate', ['adsorbates']], ['adsorbent', ['adsorbents']], ['adsorber', ['adsorbers']], ['adsorption', ['adsorptions']], ['afford', ['afforded', 'affording', 'affords']], ['affordance', [

We define a lemmatizing function in which we substitute the words from the second array with the root words using our own dictionary of 2d array.

In [283]:
def lemmatizing(row):
    new_row = []
    for token in row:
        found = False
        for value1, value2 in lemm_d:
            if token in value2:
                new_row.append(value1)
                found = True
                break
        if not found:
            new_row.append(token) 
    return new_row

In [284]:
data['lemm_text'] = ""
for i, row in data.iterrows():
    data.at[i, 'lemm_text'] = lemmatizing(row["stemmed_words"])
print("Data before lemmatization:", data.stemmed_words[0])
print("\n")
print("Data after lemmatization:", data.lemm_text[0])

# some examples
# Row 0 has differences:
# Character 3: Stemmed: gave, Lemmatized: give
# Row 10 has differences:
# Character 498: Stemmed: meant, Lemmatized: mean
# Row 15 has differences:
# Character 701: Stemmed: given, Lemmatized: give
# Row 24 has differences:
# Character 245: Stemmed: feet, Lemmatized: foot
# Character 348: Stemmed: thought, Lemmatized: think
# Row 33 has differences:
# Character 107: Stemmed: thought, Lemmatized: think
# Row 48 has differences:
# Character 24: Stemmed: felt, Lemmatized: feel
# Row 60 has differences:
# Character 660: Stemmed: given, Lemmatized: give

Data before lemmatization: ['subject', 'enron', 'methanol', 'meter', 'thy', 'y', 'a', 'follow', 'up', 'to', 'the', 'note', 'y', 'gave', 'you', 'on', 'monday', 'preliminary', 'flow', 'data', 'provid', 'by', 'daren', 'please', 'override', 'pop', '', 'day', 'volume', 'present', 'zero', 'to', 'reflect', 'day', 'activity', 'you', 'can', 'obtain', 'from', 'ga', 'control', 'thy', 'change', 'y', 'need', 'asap', 'for', 'economic', 'purpose']


Data after lemmatization: ['subject', 'enron', 'methanol', 'meter', 'thy', 'y', 'a', 'follow', 'up', 'to', 'the', 'note', 'y', 'give', 'you', 'on', 'monday', 'preliminary', 'flow', 'data', 'provid', 'by', 'daren', 'please', 'override', 'pop', '', 'day', 'volume', 'present', 'zero', 'to', 'reflect', 'day', 'activity', 'you', 'can', 'obtain', 'from', 'ga', 'control', 'thy', 'change', 'y', 'need', 'asap', 'for', 'economic', 'purpose']


In [285]:
data.head()

Unnamed: 0,text,label_num,new_text,tokens,stemmed_words,lemm_text
0,Subject: enron methanol ; meter # : 988291\nth...,0,subject enron methanol meter this is a follow ...,"[subject, enron, methanol, meter, this, is, a,...","[subject, enron, methanol, meter, thy, y, a, f...","[subject, enron, methanol, meter, thy, y, a, f..."
1,"Subject: hpl nom for january 9 , 2001\n( see a...",0,subject hpl nom for january see attached file ...,"[subject, hpl, nom, for, january, see, attache...","[subject, hpl, nom, for, january, see, attach,...","[subject, hpl, nom, for, january, see, attach,..."
2,"Subject: neon retreat\nho ho ho , we ' re arou...",0,subject neon retreat ho ho ho we re around to ...,"[subject, neon, retreat, ho, ho, ho, we, re, a...","[subject, neon, retreat, ho, ho, ho, we, re, a...","[subject, neon, retreat, ho, ho, ho, we, re, a..."
3,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...,"[subject, photoshop, windows, office, cheap, m...","[subject, photoshop, window, office, cheap, ma...","[subject, photoshop, window, office, cheap, ma..."
4,Subject: re : indian springs\nthis deal is to ...,0,subject re indian springs this deal is to book...,"[subject, re, indian, springs, this, deal, is,...","[subject, re, indian, spr, thy, deal, y, to, b...","[subject, re, indian, spr, thy, deal, y, to, b..."


# Vectorization
- This section includes the vectorization of the text we have processed, removing stopwords from our text and then converting it into a TF-IDF vector to train our model.

### Importing stop words from our local csv file

In [286]:
stop_wordsF.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 179 entries, 0 to 178
Data columns (total 2 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  179 non-null    int64 
 1   0           179 non-null    object
dtypes: int64(1), object(1)
memory usage: 2.9+ KB


In [287]:
stop_words = []
for word in stop_wordsF['0']:
    stop_words.append(word)
print(stop_words)

['i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', "you're", "you've", "you'll", "you'd", 'your', 'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she', "she's", 'her', 'hers', 'herself', 'it', "it's", 'its', 'itself', 'they', 'them', 'their', 'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that', "that'll", 'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being', 'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an', 'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of', 'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through', 'during', 'before', 'after', 'above', 'below', 'to', 'from', 'up', 'down', 'in', 'out', 'on', 'off', 'over', 'under', 'again', 'further', 'then', 'once', 'here', 'there', 'when', 'where', 'why', 'how', 'all', 'any', 'both', 'each', 'few', 'more', 'most', 'other', 'some', 'such', 'no', 'nor', 'not', 'only', 'own', 'same', 'so', 'than', '

- We create a frequency distribution to not only find the maximum ocurring words, but also implement the tf idf model subsequently

In [288]:
freq_map = {}
def countFrequency(row):
    for token in row:
        if token not in freq_map:
            freq_map[token] = 1
        else:
            freq_map[token] += 1
            
for row in data['lemm_text']:
    countFrequency(row)

sorted_list = sorted(freq_map.items(), key = operator.itemgetter(1), reverse=True)

print(sorted_list)

print(stop_words)
print(len(stop_words))

[('the', 25614), ('to', 20332), ('ect', 13908), ('y', 13704), ('a', 13232), ('and', 12815), ('for', 10505), ('of', 10171), ('on', 8260), ('you', 8159), ('subject', 8062), ('in', 7703), ('hou', 7289), ('thy', 7167), ('enron', 6555), ('be', 5469), ('that', 4770), ('we', 4347), ('from', 4191), ('will', 4160), ('have', 4096), ('your', 4075), ('with', 3986), ('com', 3911), ('it', 3861), ('at', 3735), ('deal', 3655), ('', 3592), ('are', 3387), ('please', 3198), ('if', 3137), ('or', 3078), ('ga', 3073), ('by', 3029), ('not', 2999), ('meter', 2721), ('me', 2568), ('am', 2531), ('cc', 2391), ('pm', 2343), ('hpl', 2318), ('thank', 2303), ('day', 2129), ('d', 2129), ('our', 2096), ('can', 2024), ('re', 2015), ('any', 1999), ('e', 1995), ('all', 1928), ('daren', 1901), ('nee', 1850), ('forward', 1832), ('corp', 1776), ('wa', 1729), ('new', 1717), ('volume', 1690), ('ha', 1665), ('know', 1618), ('price', 1518), ('an', 1510), ('do', 1452), ('company', 1429), ('mmbtu', 1408), ('t', 1407), ('may', 140

In [289]:
# count = 0
# for pairs in sorted_list:
#     if count > 20:
#         break
#     count += 1
#     if pairs[0] not in stop_words:
#         stop_words.append(pairs[0])

# print(stop_words)
# print(len(stop_words))

In [290]:
# Removing stop words from the tokens array
# We have made a new column to clearly see the effects
clean_txt = []
for row in data['lemm_text']:
    new_row = []
    for token in row:
        if(not token in stop_words):
            new_row.append(token) 
    clean_txt.append(new_row)

data['clean_text'] = clean_txt
data.head()

Unnamed: 0,text,label_num,new_text,tokens,stemmed_words,lemm_text,clean_text
0,Subject: enron methanol ; meter # : 988291\nth...,0,subject enron methanol meter this is a follow ...,"[subject, enron, methanol, meter, this, is, a,...","[subject, enron, methanol, meter, thy, y, a, f...","[subject, enron, methanol, meter, thy, y, a, f...","[subject, enron, methanol, meter, thy, follow,..."
1,"Subject: hpl nom for january 9 , 2001\n( see a...",0,subject hpl nom for january see attached file ...,"[subject, hpl, nom, for, january, see, attache...","[subject, hpl, nom, for, january, see, attach,...","[subject, hpl, nom, for, january, see, attach,...","[subject, hpl, nom, january, see, attach, file..."
2,"Subject: neon retreat\nho ho ho , we ' re arou...",0,subject neon retreat ho ho ho we re around to ...,"[subject, neon, retreat, ho, ho, ho, we, re, a...","[subject, neon, retreat, ho, ho, ho, we, re, a...","[subject, neon, retreat, ho, ho, ho, we, re, a...","[subject, neon, retreat, ho, ho, ho, around, w..."
3,"Subject: photoshop , windows , office . cheap ...",1,subject photoshop windows office cheap main tr...,"[subject, photoshop, windows, office, cheap, m...","[subject, photoshop, window, office, cheap, ma...","[subject, photoshop, window, office, cheap, ma...","[subject, photoshop, window, office, cheap, ma..."
4,Subject: re : indian springs\nthis deal is to ...,0,subject re indian springs this deal is to book...,"[subject, re, indian, springs, this, deal, is,...","[subject, re, indian, spr, thy, deal, y, to, b...","[subject, re, indian, spr, thy, deal, y, to, b...","[subject, indian, spr, thy, deal, book, teco, ..."


In [291]:
print(len(clean_txt))
print("Length before removing stop words:", len(data.lemm_text[0]))
print("Length after removing stop words:", len(data.clean_text[0]))
data.info()

5171
Length before removing stop words: 49
Length after removing stop words: 34
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Data columns (total 7 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   text           5171 non-null   object
 1   label_num      5171 non-null   int64 
 2   new_text       5171 non-null   object
 3   tokens         5171 non-null   object
 4   stemmed_words  5171 non-null   object
 5   lemm_text      5171 non-null   object
 6   clean_text     5171 non-null   object
dtypes: int64(1), object(6)
memory usage: 282.9+ KB


In [292]:
word_set = {}
for row in data['clean_text']:
    word_set = set(row).union(word_set);
    
word_set.remove('')
print(word_set)
print(len(word_set))

{'homebuyer', 'juanita', 'frank', 'placater', 'chatham', 'zzso', 'woodsmith', 'optic', 'cataclysm', 'edison', 'planner', 'firewater', 'tuberculin', 'stuff', 'anycase', 'mcn', 'fenton', 'gunderson', 'del', 'fundamental', 'ediegore', 'xmgqjxc', 'nobody', 'unify', 'inflexy', 'unwaver', 'neue', 'ldchel', 'minehaul', 'dres', 'awoke', 'gwmvurvtdu', 'paz', 'bqil', 'spekocial', 'schroeder', 'boric', 'epldztd', 'catv', 'discgreet', 'bindle', 'combatant', 'dhc', 'thereupon', 'attacker', 'holdout', 'ouu', 'skjl', 'ommerce', 'monumental', 'dg', 'mace', 'echinoderm', 'otdmbipdtc', 'bilateral', 'rafferty', 'ojhcabr', 'ctr', 'corporeal', 'biny', 'prank', 'pjmmfpecj', 'rx', 'expletivewinemake', 'purport', 'loaneon', 'bermuda', 'brecon', 'ffhm', 'trillion', 'pkjen', 'yeah', 'emblem', 'trichophore', 'luxe', 'appl', 'jin', 'sportswriter', 'wiil', 'houghton', 'apprehend', 'coache', 'jitterbugg', 'davidfinley', 'sastephen', 'kiss', 'asa', 'pronto', 'cadence', 'wayne', 'fvjzp', 'frontpage', 'carr', 'avowal'

In [293]:
idf_table = {}
corpus_freq = 0
temp_set = set()
for word in word_set:
    temp = word
    for row in data['clean_text']:
        if word in row:
            corpus_freq += 1
            continue
    idf = math.log((5171/corpus_freq), 2)
    if idf == 0:
        temp_set.add(word)
        print(word)
    else: 
        idf_table[word] = idf
    corpus_freq = 0
word_set.difference(temp_set)
print(idf_table)

In [None]:
print(len(word_set))

40156


In [None]:
tf = {}

In [None]:
# i = 0
for word in word_set:
    # if i > 4000:
    #     break
    # else:
    #     i += 1
    tf_list = []
    for doc in data['clean_text']:
        count = 0
        for term in doc:
            if term == word:
                count+=1
        term_freq = 0
        if count != 0:
            term_freq = 1 + math.log(count, 2)
        tf_list.append(term_freq)
    tf[word] = tf_list
        
tf_idf = pd.DataFrame(tf)
tf_idf.head()

Unnamed: 0,homebuyer,juanita,frank,placater,chatham,zzso,woodsmith,optic,cataclysm,edison,...,jetbrtive,portfolio,breakwater,presbytery,pressofficer,cscfinancial,sidin,dhm,nihilist,bsnawhrda
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5171 entries, 0 to 5170
Columns: 4001 entries, homebuyer to bsnawhrda
dtypes: float64(4001)
memory usage: 157.8 MB


ValueError: If using all scalar values, you must pass an index