In [39]:
# download needed data from nltk just in case
import nltk
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('omw-1.4')

from sklearn.feature_extraction.text import CountVectorizer

# from text_processing.ipynb
from nltk.tokenize import RegexpTokenizer
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

import pandas as pd

[nltk_data] Downloading package stopwords to /home/vscode/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /home/vscode/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/vscode/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [40]:
# from text_processing.ipynb
tokenizer = RegexpTokenizer(r'\w+')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

# apply stop_words and tokenizer when running count vectorizer
vectorizer = CountVectorizer(stop_words=stop_words,
                             tokenizer=tokenizer.tokenize)

In [41]:
# create function to lemmatize strings
def lemmatize(text):
    # use lemmatization code from text_processing.ipynb
    lemmatized_words = ' '.join([lemmatizer.lemmatize(word=word,pos='v') for word in text.split(' ')])
    return lemmatized_words

In [42]:
csv_file = open("../Data/text_content.csv", mode='r', encoding="utf-8")
df = pd.read_csv(csv_file)
df.head()

Unnamed: 0,Image_File,Image_Text,IsSpam
0,../Data/Processed\Dredze Spam\1000.jpg,CRITICAL INVESTOR ALERT! ESPION INTERNATIONAL...,Spam
1,../Data/Processed\Dredze Spam\1002.jpg,Itis on the move this company is producing rea...,Spam
2,../Data/Processed\Dredze Spam\1003.jpg,ATTENTION ALL DAY TRADERS AND INVESTORS*** IN...,Spam
3,../Data/Processed\Dredze Spam\1007.jpg,YOU GOT IT FIRST! 1 Ready to Rum!!! (RUNNING ...,Spam
4,../Data/Processed\Dredze Spam\1008.jpg,Hot Stock For Your Attention ENERGY FINDERS I...,Spam


In [43]:
matrix = vectorizer.fit_transform(df['Image_Text'].str.lower().apply(lemmatize))
matrix 

<2325x27792 sparse matrix of type '<class 'numpy.int64'>'
	with 141388 stored elements in Compressed Sparse Row format>

In [44]:
counts = pd.DataFrame(matrix.toarray(),columns=vectorizer.get_feature_names_out())
counts

Unnamed: 0,0,00,000,0000,00000000,0005,00055,000m,000ow,0011,...,érew,és,ét,étely,évents,évetel,évidence,éxeiting,éxpansiothy,éxpert
0,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,3,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,1,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,2,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,2,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2320,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,0,0,0
2321,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2322,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2323,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [45]:
# join feature columns with original datafram
df = df.join(counts)

# remove Image_Text column and move IsSpam to end
df = df.drop(columns=['Image_Text'])
df1 = df.pop('IsSpam')
df['IsSpam'] = df1

df.head()

Unnamed: 0,Image_File,0,00,000,0000,00000000,0005,00055,000m,000ow,...,és,ét,étely,évents,évetel,évidence,éxeiting,éxpansiothy,éxpert,IsSpam
0,../Data/Processed\Dredze Spam\1000.jpg,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Spam
1,../Data/Processed\Dredze Spam\1002.jpg,1,3,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Spam
2,../Data/Processed\Dredze Spam\1003.jpg,1,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Spam
3,../Data/Processed\Dredze Spam\1007.jpg,2,1,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Spam
4,../Data/Processed\Dredze Spam\1008.jpg,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,Spam


In [46]:
df.to_csv('../Data/feature-extract.csv')