In [1]:
import pandas as pd
import math
import nltk
import string
import re
import numpy as np
import joblib
from tqdm import tqdm
from nltk.stem import PorterStemmer, WordNetLemmatizer
from scipy.sparse import save_npz, load_npz
from scipy import sparse
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Load the labelled corpus; `info()` reports the dtypes and how many `text`
# entries are missing before any cleaning is done.
dataset_path = "./DataSet/combined_data.csv"
data = pd.read_csv(dataset_path)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 105199 entries, 0 to 105198
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   label   105199 non-null  int64 
 1   text    101072 non-null  object
dtypes: int64(1), object(1)
memory usage: 1.6+ MB


In [3]:
# Class balance of the raw data: label 0 is counted as ham, label 1 as spam.
label_counts = data["label"].value_counts()
ham_count = label_counts[0]
spam_count = label_counts[1]

n_labelled = ham_count + spam_count
ham_percentage = ham_count / n_labelled * 100
spam_percentage = spam_count / n_labelled * 100

print("Ham percentage:", ham_percentage)
print("Spam percentage:", spam_percentage)

Ham percentage: 38.98611203528551
Spam percentage: 61.0138879647145


In [4]:
def check_and_remove_nan(df, column_name):
    """Report how many values of ``column_name`` are missing and drop those rows.

    Args:
        df: DataFrame to inspect; it is not modified.
        column_name: Name of the column whose missing values are counted.

    Returns:
        A new DataFrame without the rows where ``column_name`` is NaN.
    """
    missing = df[column_name].isna().sum()
    print(f"Number of NaN values in {column_name}: {missing}")
    return df.dropna(subset=[column_name])

# Messages without any text cannot be cleaned or vectorised, so drop them here.
data = check_and_remove_nan(df=data, column_name='text')

Number of NaN values in text: 4127


In [5]:
def count_non_chars(text):
    """Count whitespace-separated tokens that do not start with an ASCII letter.

    Only the first character of each token is tested (``re.match`` anchors at
    the start), so a token such as ``"a1"`` is not counted while ``"1a"`` is.

    Args:
        text: The string to inspect.

    Returns:
        The number of tokens whose first character is not matched by the
        pattern; 0 for an empty or whitespace-only string.
    """
    # Raw string: in a normal literal '\s' is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    return sum(1 for word in text.split() if not re.match(r'[a-zA-Z\s]', word))

# Distribution, over all messages, of the per-message count returned by
# count_non_chars.
total_count = data["text"].apply(count_non_chars)
count_summary = total_count.describe()
print(count_summary)

count    101072.000000
mean         26.557741
std         121.429828
min           0.000000
25%           0.000000
50%           0.000000
75%          13.000000
max       10444.000000
Name: text, dtype: float64


In [6]:
print(string.punctuation)

# Translation table built once: newline -> space, every character in
# string.punctuation -> deleted.
_PUNCTUATION_TABLE = str.maketrans('\n', ' ', string.punctuation)


def remove_punctuation(text):
    """Replace newlines with spaces and delete ASCII punctuation from ``text``.

    Punctuation is removed without inserting a space, so ``"i've"`` becomes
    ``"ive"``. Only the characters in ``string.punctuation`` are affected.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string.
    """
    return text.translate(_PUNCTUATION_TABLE)

# Apply the cleaner column-wise instead of writing cell by cell in an
# iterrows() loop; the resulting column is the same.
data["no_punctuations"] = data["text"].apply(remove_punctuation)

# Positional access: index label 0 is only present if that row survived the
# earlier NaN removal, whereas .iloc[0] always shows the first remaining row.
print("Plain Text:\n", data["text"].iloc[0])
print("\n")
print("After removing punctuations:\n", data["no_punctuations"].iloc[0])

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~
Plain Text:
 do you feel the pressure to perform and not rising to the occasion try v ia gr a your anxiety will be a thing of the past and you will be back to your old self 


After removing punctuations:
 do you feel the pressure to perform and not rising to the occasion try v ia gr a your anxiety will be a thing of the past and you will be back to your old self 


In [7]:
def convert_lower_case(text):
    """Return ``text`` converted to lower case.

    ``str.lower`` is applied to the whole string rather than to each character
    separately, so context-sensitive rules (for example the Greek final sigma)
    are honoured; for ASCII text the result is unchanged.

    Args:
        text: The string to convert.

    Returns:
        The lower-cased string.
    """
    return text.lower()

# Apply the conversion column-wise instead of writing cell by cell in an
# iterrows() loop; the resulting column is the same.
data["lower_case"] = data["no_punctuations"].apply(convert_lower_case)

# Positional access: index label 0 is only present if that row survived the
# earlier NaN removal, whereas .iloc[0] always shows the first remaining row.
print("Plain Text:\n", data["no_punctuations"].iloc[0])
print("\n")
print("After converting:\n", data["lower_case"].iloc[0])

Plain Text:
 do you feel the pressure to perform and not rising to the occasion try v ia gr a your anxiety will be a thing of the past and you will be back to your old self 


After converting:
 do you feel the pressure to perform and not rising to the occasion try v ia gr a your anxiety will be a thing of the past and you will be back to your old self 


In [8]:
def remove_numbers(text):
    """Strip every digit character from ``text`` and return what is left."""
    digit_pattern = re.compile(r'\d')
    return digit_pattern.sub('', text)

In [9]:
def remove_extra_space(text):
    """Collapse each run of whitespace into one space and trim both ends."""
    collapsed = re.compile(r'\s{1,}').sub(' ', text)
    return collapsed.strip()

In [10]:
# Strip digits, then normalise whitespace, in one column-wise pass each
# (replaces two row-by-row iterrows() loops that produced the same column).
data["new_text"] = (
    data["lower_case"]
    .apply(remove_numbers)
    .apply(remove_extra_space)
)

# The intermediate columns are no longer needed. Rebinding `data` instead of
# dropping in place avoids mutating a frame that earlier cells displayed.
data = data.drop(columns=['no_punctuations', 'lower_case'])
data.head()

Unnamed: 0,label,text,new_text
0,1,do you feel the pressure to perform and not ri...,do you feel the pressure to perform and not ri...
1,0,hi i've just updated from the gulus and i chec...,hi ive just updated from the gulus and i check...
2,1,mega authenticv i a g r a discount pricec i a ...,mega authenticv i a g r a discount pricec i a ...
3,1,hey billy it was really fun going out the othe...,hey billy it was really fun going out the othe...
4,1,system of the home it will have the capabiliti...,system of the home it will have the capabiliti...


In [11]:
def tokenize(text):
    """Split ``text`` into word tokens on runs of non-word characters.

    ``re.split`` yields an empty string when the text is empty or starts/ends
    with a non-word character; those empty strings are dropped so that every
    returned token is a non-empty word.

    Args:
        text: The string to tokenize.

    Returns:
        A list of non-empty tokens in their original order; ``[]`` when the
        text contains no word characters.
    """
    # Raw string: in a normal literal '\W' is an invalid escape sequence, which
    # newer Python versions warn about at compile time.
    return [token for token in re.split(r'\W+', text) if token]

# Tokenize column-wise instead of writing cell by cell in an iterrows() loop;
# each row of `tokens` holds the list returned by tokenize().
data["tokens"] = data["new_text"].apply(tokenize)
data.head()

Unnamed: 0,label,text,new_text,tokens
0,1,do you feel the pressure to perform and not ri...,do you feel the pressure to perform and not ri...,"[do, you, feel, the, pressure, to, perform, an..."
1,0,hi i've just updated from the gulus and i chec...,hi ive just updated from the gulus and i check...,"[hi, ive, just, updated, from, the, gulus, and..."
2,1,mega authenticv i a g r a discount pricec i a ...,mega authenticv i a g r a discount pricec i a ...,"[mega, authenticv, i, a, g, r, a, discount, pr..."
3,1,hey billy it was really fun going out the othe...,hey billy it was really fun going out the othe...,"[hey, billy, it, was, really, fun, going, out,..."
4,1,system of the home it will have the capabiliti...,system of the home it will have the capabiliti...,"[system, of, the, home, it, will, have, the, c..."


In [12]:
def stemming(tokens):
    """Run NLTK's Porter stemmer over ``tokens`` and return the stems in order."""
    stemmer = PorterStemmer()
    stems = []
    for token in tokens:
        stems.append(stemmer.stem(token))
    return stems

# One list of stems per message, parallel to the `tokens` column.
data['stemmed_tokens'] = data['tokens'].apply(stemming)

In [13]:
def lemmatization(tokens):
    """Return the WordNet lemma of every token in ``tokens``, preserving order."""
    wordnet = WordNetLemmatizer()
    lemmas = []
    for token in tokens:
        lemmas.append(wordnet.lemmatize(token))
    return lemmas


# NOTE(review): the input here is the already-stemmed column, not the raw
# tokens; in the sample rows shown later the lemmatized column matches the
# stemmed one - confirm this ordering is intended.
data['lemmatized_tokens'] = data['stemmed_tokens'].apply(lemmatization)

In [14]:
# Stop words are read from column '0' of the CSV.
stop_wordsF = pd.read_csv('./DataSet/stop-words.csv')
stop_words = stop_wordsF['0'].tolist()

# Membership is tested once per token across the whole corpus, so look tokens
# up in a set; `stop_words` itself stays a list for any later use.
stop_word_set = set(stop_words)

# NOTE(review): filtering runs on the stemmed/lemmatized tokens, so a stop word
# whose stem differs from its listed spelling is presumably not removed (the
# sample output keeps 'wa') - confirm whether filtering should come first.
clean_txt = [
    [token for token in row if token not in stop_word_set]
    for row in data['lemmatized_tokens']
]

data['clean_text'] = clean_txt
data.head()

Unnamed: 0,label,text,new_text,tokens,stemmed_tokens,lemmatized_tokens,clean_text
0,1,do you feel the pressure to perform and not ri...,do you feel the pressure to perform and not ri...,"[do, you, feel, the, pressure, to, perform, an...","[do, you, feel, the, pressur, to, perform, and...","[do, you, feel, the, pressur, to, perform, and...","[feel, pressur, perform, rise, occas, tri, v, ..."
1,0,hi i've just updated from the gulus and i chec...,hi ive just updated from the gulus and i check...,"[hi, ive, just, updated, from, the, gulus, and...","[hi, ive, just, updat, from, the, gulu, and, i...","[hi, ive, just, updat, from, the, gulu, and, i...","[hi, ive, updat, gulu, check, mirror, seem, li..."
2,1,mega authenticv i a g r a discount pricec i a ...,mega authenticv i a g r a discount pricec i a ...,"[mega, authenticv, i, a, g, r, a, discount, pr...","[mega, authenticv, i, a, g, r, a, discount, pr...","[mega, authenticv, i, a, g, r, a, discount, pr...","[mega, authenticv, g, r, discount, pricec, l, ..."
3,1,hey billy it was really fun going out the othe...,hey billy it was really fun going out the othe...,"[hey, billy, it, was, really, fun, going, out,...","[hey, billi, it, wa, realli, fun, go, out, the...","[hey, billi, it, wa, realli, fun, go, out, the...","[hey, billi, wa, realli, fun, go, night, talk,..."
4,1,system of the home it will have the capabiliti...,system of the home it will have the capabiliti...,"[system, of, the, home, it, will, have, the, c...","[system, of, the, home, it, will, have, the, c...","[system, of, the, home, it, will, have, the, c...","[system, home, capabl, link, far, know, within..."


In [15]:
# word_set = set(word for text in data['clean_text'] for word in text)

# for word in word_set:
#     doc_count = sum(1 for doc in data['clean_text'] if word in doc)
#     idf_table[word] = math.log(total_docs / (1 + doc_count))  # Adding 1 to avoid division by zero


# tf = {}
# def set_tf():
#     tf = {}
# def calculate_tf(word_set, data):

#     for word in word_set:
#         tf_list = []
#         for doc in data:
#             count = 0
#             for term in doc:
#                 if term == word:
#                     count += 1
#             term_freq = 0
#             if count != 0:
#                 term_freq = 1 + math.log(count, 2)
#             tf_list.append(term_freq)
#         tf[word] = tf_list

#     tf_idf = pd.DataFrame(tf)
#     return tf_idf

# set_tf()
# tf_idf  = calculate_tf(word_set, data['clean_text'])

# def calculate_tf_idf(tf, idf_table):
#     tf_idf = {}
#     for word in tf.keys():
#         tf_idf_list = [tf[word][i] * idf_table[word] for i in range(len(tf[word]))]
#         tf_idf[word] = tf_idf_list

#     tf_idf_table = pd.DataFrame(tf_idf)
#     return tf_idf_table

# tf_idf_table = calculate_tf_idf(tf, idf_table)
# tf_idf_table.head()


# TfidfVectorizer takes one string per document, so join each token list with
# spaces. Values that are already strings are left alone: without this guard,
# re-running the cell would join the *characters* of each string instead.
data['clean_text'] = data['clean_text'].apply(
    lambda tokens: tokens if isinstance(tokens, str) else ' '.join(tokens)
)

vectorizer = TfidfVectorizer()
tfidf_vector = vectorizer.fit_transform(data['clean_text'])

# Persist the fitted vectorizer so the same vocabulary can be reused later.
joblib.dump(vectorizer, './DataSet/vectorizer.joblib')

save_npz('./DataSet/tfidf_vector_sparse.npz', tfidf_vector)

# Load the TF-IDF vector from the sparse matrix file
loaded_tfidf_vector = load_npz('./DataSet/tfidf_vector_sparse.npz')

# Cheap round-trip check: the reloaded matrix must have the shape that was saved.
assert loaded_tfidf_vector.shape == tfidf_vector.shape

In [16]:
# Show the reloaded matrix: each printed line is a (row, column) coordinate
# followed by the value stored there.
print(loaded_tfidf_vector)

  (0, 180967)	0.19095446010736283
  (0, 144352)	0.2245584127143744
  (0, 15490)	0.1905114060151016
  (0, 150732)	0.22980183445266128
  (0, 200655)	0.19189961501837136
  (0, 9521)	0.35078251480416944
  (0, 83502)	0.34316409850396096
  (0, 94002)	0.36898655703683647
  (0, 205222)	0.17170381445999983
  (0, 142620)	0.36152292356380067
  (0, 172785)	0.2698134356113022
  (0, 152479)	0.23325812845822724
  (0, 159383)	0.2817095193615477
  (0, 70746)	0.2084946037738639
  (1, 117529)	0.07159100365184952
  (1, 38585)	0.0410845619171498
  (1, 205647)	0.05984505668076646
  (1, 193184)	0.04019204476757009
  (1, 117475)	0.06958038014176232
  (1, 170636)	0.04936989953205303
  (1, 59083)	0.03787477655884909
  (1, 210842)	0.08859978061281074
  (1, 62499)	0.06149551631921449
  (1, 36378)	0.028113681694890626
  (1, 178295)	0.15265423999183125
  :	:
  (101070, 122875)	0.1330289585897811
  (101070, 196530)	0.21814110711562779
  (101070, 58388)	0.1331430838130512
  (101070, 31787)	0.17427209276530625
  (1010

In [17]:
# Map a column index of the TF-IDF matrix back to the vocabulary term it
# stands for.
term_index = 114617
feature_names = vectorizer.get_feature_names_out()
term = feature_names[term_index]

print(f"The term at index {term_index} is: {term}")

The term at index 114617 is: leasen
