In [6]:
import pandas as pd

In [7]:
# Load the spam dataset from the project-relative Datasets/ folder, decoding the
# file as latin1.
df = pd.read_csv('Datasets/spam.csv', encoding='latin1')
# Preview the first rows. In the rendered output `v1` holds the ham/spam label and
# `v2` the message text; the `Unnamed: N` columns appear empty in this preview.
df.head()

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,


In [None]:
import nltk
from nltk.tokenize import word_tokenize , sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer , PorterStemmer , SnowballStemmer

# Fetch the NLTK resources used in this notebook, in a fixed order:
# the stopword list, WordNet and punkt first, then the tagger, the
# named-entity chunker and its word list.
for _resource in (
    'stopwords',
    'wordnet',
    'punkt',
    'averaged_perceptron_tagger',
    'maxent_ne_chunker',
    'words',
):
    nltk.download(_resource)

# English stopwords held in a set so membership tests are cheap.
stop_words = set(stopwords.words('english'))

# Module-level normaliser instances, created once and reused for every message.
lemmatizer = WordNetLemmatizer()
porterstemmer = PorterStemmer()
snowballstemmer = SnowballStemmer('english')

def preprocess_text(text):
    """Tokenise one message and derive its normalised token variants.

    Args:
        text: The raw message string.

    Returns:
        A dict with the original text, its word tokens, its sentence tokens,
        the word tokens that are not English stopwords (original casing kept),
        and the Porter-stemmed, Snowball-stemmed and WordNet-lemmatised forms
        of those filtered tokens (each lower-cased before normalisation).
    """
    words = word_tokenize(text)
    sentences = sent_tokenize(text)

    # Stopword matching is case-insensitive, but surviving tokens keep their case.
    kept = []
    for token in words:
        if token.lower() in stop_words:
            continue
        kept.append(token)

    # All three normalisers receive the same lower-cased view of the kept tokens.
    lowered = [token.lower() for token in kept]
    porter_stems = list(map(porterstemmer.stem, lowered))
    snowball_stems = list(map(snowballstemmer.stem, lowered))
    lemmas = list(map(lemmatizer.lemmatize, lowered))

    return {
        "original_text": text,
        "word_tokens": words,
        "sent_tokens": sentences,
        "filtered_tokens": kept,
        "porterstemmer_tokens": porter_stems,
        "snowballstemmer_tokens": snowball_stems,
        "lemmatizer_tokens": lemmas,
    }

# Run preprocess_text on every message in column `v2`; each element of `results`
# is the dict that preprocess_text returns for that message.
results = df['v2'].apply(preprocess_text)


[nltk_data] Downloading package stopwords to C:\Users\FAZIL K
[nltk_data]     FAAZI\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:\Users\FAZIL K
[nltk_data]     FAAZI\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to C:\Users\FAZIL K
[nltk_data]     FAAZI\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\FAZIL K FAAZI\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package maxent_ne_chunker to C:\Users\FAZIL K
[nltk_data]     FAAZI\AppData\Roaming\nltk_data...
[nltk_data]   Package maxent_ne_chunker is already up-to-date!
[nltk_data] Downloading package words to C:\Users\FAZIL K
[nltk_data]     FAAZI\AppData\Roaming\nltk_da

In [13]:
# Expand the per-message result dicts into a frame with one column per key.
processed_df = pd.DataFrame(results.tolist())

# Number of leading tokens to tag and chunk; only a prefix of the corpus is used.
POS_SAMPLE_SIZE = 1000

# Flatten the per-message filtered tokens into one stream. Messages whose token
# list is empty explode to NaN, which dropna() removes.
pos_tokens = processed_df['filtered_tokens'].explode().dropna().tolist()
pos_tokens = pos_tokens[:POS_SAMPLE_SIZE]

# Part-of-speech tag the sample, then run the named-entity chunker on the tags.
pos_tag_words = nltk.pos_tag(pos_tokens)
ner_words = nltk.ne_chunk(pos_tag_words)

print(pos_tag_words[:25])
print(ner_words)

[('Go', 'VB'), ('jurong', 'JJ'), ('point', 'NN'), (',', ','), ('crazy', 'JJ'), ('..', 'NNP'), ('Available', 'NNP'), ('bugis', 'NN'), ('n', 'RB'), ('great', 'JJ'), ('world', 'NN'), ('la', 'NN'), ('e', 'FW'), ('buffet', 'NN'), ('...', ':'), ('Cine', 'NNP'), ('got', 'VBD'), ('amore', 'RB'), ('wat', 'JJ'), ('...', ':'), ('Ok', 'NNP'), ('lar', 'NN'), ('...', ':'), ('Joking', 'NNP'), ('wif', 'WRB')]
(S
  Go/VB
  jurong/JJ
  point/NN
  ,/,
  crazy/JJ
  ../NNP
  Available/NNP
  bugis/NN
  n/RB
  great/JJ
  world/NN
  la/NN
  e/FW
  buffet/NN
  .../:
  (PERSON Cine/NNP)
  got/VBD
  amore/RB
  wat/JJ
  .../:
  Ok/NNP
  lar/NN
  .../:
  Joking/NNP
  wif/WRB
  u/JJ
  oni/NN
  .../:
  Free/JJ
  entry/NN
  2/CD
  wkly/JJ
  comp/NN
  win/VBP
  FA/NNP
  Cup/NNP
  final/JJ
  tkts/NN
  21st/CD
  May/NNP
  2005/CD
  ./.
  (PERSON Text/NNP)
  FA/NNP
  87121/CD
  receive/JJ
  entry/NN
  question/NN
  (/(
  std/JJ
  txt/NN
  rate/NN
  )/)
  &/CC
  C/NNP
  's/POS
  apply/VBP
  08452810075over18/CD
  's/POS
 

**Vectorization**

In [22]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import numpy as np

# --- One-hot encoding of the lemma vocabulary ---
# Flatten every message's lemma list into one stream and keep the sorted unique lemmas.
lemmas = processed_df['lemmatizer_tokens'].explode().dropna().tolist()
vocab = sorted(set(lemmas))

# The vocabulary is reshaped to a single column before encoding;
# sparse_output=False makes the encoder return a dense array.
vocab_reshaped = np.array(vocab).reshape(-1, 1)
one_hot = OneHotEncoder(sparse_output=False)
one_hot_matrix = one_hot.fit_transform(vocab_reshaped)

# --- Bag-of-words and TF-IDF over whole messages ---
# Each message becomes one space-joined string of its lemmas.
docs = processed_df['lemmatizer_tokens'].apply(" ".join)

bow = CountVectorizer()
bow_matrix = bow.fit_transform(docs)

tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform(docs)

feature_names = tfidf.get_feature_names_out()
print(f"Feature names:\n{feature_names}")

# Dense nested-list view of the first message's TF-IDF row.
dense_row = tfidf_matrix[0].toarray().tolist()

# NOTE(review): this rebinds `results`, which the preprocessing cell defines as a
# Series of dicts and the POS-tagging cell reads via `results.tolist()`. Re-running
# that cell after this one will fail. Consider a distinct name once downstream
# uses have been checked.
results = dict(
    one_hot=one_hot_matrix,
    bow=bow_matrix,
    tfidf=tfidf_matrix,
)

print(results)


Feature names:
['00' '000' '000pes' ... 'ûïharry' 'ûò' 'ûówell']
{'one_hot': array([[1., 0., 0., ..., 0., 0., 0.],
       [0., 1., 0., ..., 0., 0., 0.],
       [0., 0., 1., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 0.],
       [0., 0., 0., ..., 0., 0., 1.]]), 'bow': <5572x8117 sparse matrix of type '<class 'numpy.int64'>'
	with 47921 stored elements in Compressed Sparse Row format>, 'tfidf': <5572x8117 sparse matrix of type '<class 'numpy.float64'>'
	with 47921 stored elements in Compressed Sparse Row format>}


In [25]:
import gensim
from gensim.models import Word2Vec

# Word2Vec expects an iterable of sentences, each itself a list of string tokens.
# The previous flat list of tokens (explode().tolist()) made gensim treat every
# token string as a "sentence" and iterate over its characters, so the models
# learned embeddings for single characters instead of words.
# Build one token list per message and skip messages with no tokens left.
data = [tokens for tokens in processed_df['lemmatizer_tokens'] if tokens]

# model1: CBOW (sg=0); model2: skip-gram (sg=1). Both keep every word
# (min_count=1) and use a context window of 5 tokens.
model1 = gensim.models.Word2Vec(data, min_count = 1, window = 5, sg=0)
model2 = gensim.models.Word2Vec(data, min_count = 1,  window = 5, sg = 1)