<h3>Created By Dusit Chunviset [642115017]</h3>

In [54]:
import pandas as pd
import string as str
import re
import nltk
import numpy as np
# Fetch every NLTK resource this notebook relies on, not only "words":
# word_tokenize needs the Punkt tokenizer data ("punkt", or "punkt_tab" on
# newer NLTK releases) and stopwords.words() needs the "stopwords" corpus.
# Without them a fresh environment fails with LookupError further down.
for _nltk_resource in ('words', 'punkt', 'punkt_tab', 'stopwords'):
    nltk.download(_nltk_resource)
from ordered_set import OrderedSet
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize


[nltk_data] Downloading package words to
[nltk_data]     C:\Users\ASUS\AppData\Roaming\nltk_data...
[nltk_data]   Package words is already up-to-date!


In [55]:
def create_stem_cache(cleaned_description):
    """Build a token -> Porter stem lookup table for a corpus.

    Parameters
    ----------
    cleaned_description : iterable of str
        The cleaned documents, e.g. a pandas Series of text.

    Returns
    -------
    dict
        Maps every distinct token produced by ``word_tokenize`` to the stem
        returned by ``PorterStemmer.stem``. Keys are inserted in sorted
        order. An empty corpus yields an empty dict.
    """
    # Collect distinct tokens with a set rather than np.concatenate, which
    # raises ValueError when there are no documents to concatenate.
    unique_tokens = {
        token
        for document in cleaned_description
        for token in word_tokenize(document)
    }
    ps = PorterStemmer()
    # Sorted iteration keeps the key order deterministic.
    return {token: ps.stem(token) for token in sorted(unique_tokens)}
  
def create_custom_preprocessor(stop_dict, stem_cache):
    """Create a text preprocessor for ``CountVectorizer(preprocessor=...)``.

    Parameters
    ----------
    stop_dict : set of str
        Stop words to discard; they are compared against lower-cased tokens.
    stem_cache : dict
        Precomputed token -> stem mapping. Tokens missing from it are stemmed
        on demand with ``PorterStemmer``.

    Returns
    -------
    callable
        ``custom_preprocessor(s)``: takes one document and returns its
        surviving stems joined by single spaces.
    """
    # One stemmer shared by every call instead of a new one per document.
    ps = PorterStemmer()

    def custom_preprocessor(s):
        # Supplying a custom preprocessor replaces CountVectorizer's own
        # lowercasing, so fold case here; otherwise capitalised stop words
        # ("The") survive and case variants are not deduplicated.
        s = s.lower()
        s = re.sub(r'[^A-Za-z]', ' ', s)
        s = re.sub(r'\s+', ' ', s)
        tokens = word_tokenize(s)
        # dict.fromkeys drops repeated tokens while keeping first-seen order,
        # so each distinct token appears at most once per document.
        kept = [
            w for w in dict.fromkeys(tokens)
            if w not in stop_dict and len(w) > 2
        ]
        return ' '.join(
            stem_cache[w] if w in stem_cache else ps.stem(w) for w in kept
        )

    return custom_preprocessor

def sk_vectorize(texts, cleaned_description, stop_dict, stem_cache):
    """Fit a bag-of-words model on a corpus and print how ``texts`` encode.

    The vectorizer is fitted on ``cleaned_description`` with the custom
    preprocessor built from ``stop_dict`` and ``stem_cache``. Two values are
    printed: the transformed ``texts`` and the result of passing that back
    through ``inverse_transform``. Nothing is returned.
    """
    bow_model = CountVectorizer(
        preprocessor=create_custom_preprocessor(stop_dict, stem_cache)
    )
    bow_model.fit(cleaned_description)

    encoded = bow_model.transform(texts)
    print(encoded)

    terms_per_text = bow_model.inverse_transform(encoded)
    print(terms_per_text)

# 1) Use the previous dataset “spam dictation”
# 2) Preprocess text including:
• Remove white space <br/>
• Remove anything that is not English <br/>
• Calculate the word length and add it as a column named “length” <br/>


In [56]:
def clean_text(path="data/spam.csv"):
    """Load the spam dataset and derive the cleaned-text columns.

    Parameters
    ----------
    path : str or path-like, default "data/spam.csv"
        CSV file, read with ISO-8859-1 encoding. It must provide a ``v1``
        column (``'ham'`` / ``'spam'``) and a ``v2`` column (message text).

    Returns
    -------
    pandas.DataFrame
        Columns, in this order:

        * ``label`` - ``v1`` mapped to 0 (ham) or 1 (spam).
        * ``clean_text`` - ``v2`` lower-cased, every non-letter replaced by a
          space, then each whitespace run collapsed to one space. Leading or
          trailing single spaces are not stripped.
        * ``orginal_text`` - the untouched ``v2`` value (the column name is
          kept as spelled because other cells refer to it).
        * ``length`` - character count taken after the non-letter
          replacement but before whitespace is collapsed.
    """
    raw = pd.read_csv(path, encoding='ISO-8859-1')

    # After this step only ASCII letters and spaces remain, so no further
    # "English only" filtering pass is required.
    letters_only = raw["v2"].str.lower().str.replace(r'[^A-Za-z]', ' ', regex=True)

    return pd.DataFrame({
        'label': raw['v1'].map({'ham': 0, 'spam': 1}),
        'clean_text': letters_only.str.replace(r'\s+', ' ', regex=True),
        'orginal_text': raw['v2'],
        'length': letters_only.str.len(),
    })


# 3) Create a new column named “text2”

In [57]:
# Load and clean the corpus, then build the pieces the vectorizers share.
cleaned_description = clean_text()
stem_cache = create_stem_cache(cleaned_description['clean_text'])
# The NLTK stop-word file id is lower-case; 'English' only resolves on
# case-insensitive file systems and fails elsewhere.
stop_dict = set(stopwords.words('english'))
my_custom_preprocessor = create_custom_preprocessor(stop_dict, stem_cache)
# Previously a second, identical preprocessor built under a misspelled name;
# kept as an alias in case other cells still refer to it.
my_custom_processor = my_custom_preprocessor

In [58]:
# Inspect the cleaned frame returned by clean_text():
# label, clean_text, orginal_text and length columns.
cleaned_description

Unnamed: 0,label,clean_text,orginal_text,length
0,0,go until jurong point crazy available only in ...,"Go until jurong point, crazy.. Available only ...",111
1,0,ok lar joking wif u oni,Ok lar... Joking wif u oni...,29
2,1,free entry in a wkly comp to win fa cup final ...,Free entry in 2 a wkly comp to win FA Cup fina...,155
3,0,u dun say so early hor u c already then say,U dun say so early hor... U c already then say...,49
4,0,nah i don t think he goes to usf he lives arou...,"Nah I don't think he goes to usf, he lives aro...",61
...,...,...,...,...
5567,1,this is the nd time we have tried contact u u ...,This is the 2nd time we have tried 2 contact u...,161
5568,0,will b going to esplanade fr home,Will Ì_ b going to esplanade fr home?,37
5569,0,pity was in mood for that so any other suggest...,"Pity, * was in mood for that. So...any other s...",57
5570,0,the guy did some bitching but i acted like i d...,The guy did some bitching but I acted like i'd...,125


#  4. Use the LabelEncoder method to convert the class target

In [59]:

# Learn a vocabulary of unigrams and bigrams from the cleaned messages and
# report how many distinct features it contains.
bigram_vectorizer = CountVectorizer(
    ngram_range=(1, 2),
    preprocessor=my_custom_preprocessor,
)
bigram_vectorizer.fit(cleaned_description["clean_text"])
print(len(bigram_vectorizer.get_feature_names_out()))

32280


# 5. Use CountVectorizer to perform BOW (bag of words)

<h3>Bag Of Ngrams</h3>

In [60]:
# Unigram bag of words: learn the vocabulary from the cleaned messages, then
# encode every message against it as a dense DataFrame (one column per term).
vectorizer = CountVectorizer(preprocessor=my_custom_preprocessor)
vectorizer.fit(cleaned_description["clean_text"])
X = vectorizer.transform(cleaned_description["clean_text"])
N = len(cleaned_description["clean_text"])
X_df = pd.DataFrame(
    data=X.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

# First 50 messages, restricted to the 50 terms with the largest column
# totals; ascending sort puts the largest total in the right-most column.
X_df.loc[:, X_df.sum().sort_values().tail(50).index].head(50)

Unnamed: 0,tri,miss,well,wait,messag,min,meet,week,msg,ask,...,free,want,got,time,like,know,day,come,get,call
0,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
9,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,1


#  6. List the top 5 and bottom 5 rows of the transformed sample to show the results, and submit your work to MS Teams

In [61]:
# Put the bag-of-words columns (ordered by ascending column total) next to
# the cleaned frame.
result = pd.concat(
    [cleaned_description, X_df.loc[:, X_df.sum().sort_values().index]],
    axis=1,
)
# Swap the earlier "length" for the length of the collapsed clean_text;
# re-adding it places the column last.
# NOTE(review): if a vocabulary term is itself named "length", this drop
# removes that term column too - confirm whether that can happen here.
result = result.drop(columns=["length"])
result["length"] = result["clean_text"].map(len)
result

Unnamed: 0,label,clean_text,orginal_text,zyada,ileav,sorta,ilol,im,imat,imf,...,want,got,time,like,know,day,come,get,call,length
0,0,go until jurong point crazy available only in ...,"Go until jurong point, crazy.. Available only ...",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,103
1,0,ok lar joking wif u oni,Ok lar... Joking wif u oni...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
2,1,free entry in a wkly comp to win fa cup final ...,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,126
3,0,u dun say so early hor u c already then say,U dun say so early hor... U c already then say...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,44
4,0,nah i don t think he goes to usf he lives arou...,"Nah I don't think he goes to usf, he lives aro...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
5567,1,this is the nd time we have tried contact u u ...,This is the 2nd time we have tried 2 contact u...,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,129
5568,0,will b going to esplanade fr home,Will Ì_ b going to esplanade fr home?,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
5569,0,pity was in mood for that so any other suggest...,"Pity, * was in mood for that. So...any other s...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,51
5570,0,the guy did some bitching but i acted like i d...,The guy did some bitching but I acted like i'd...,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,125


<h3>Top 5</h3>

In [62]:
# First five rows of the combined frame.
result.head(n=5)

Unnamed: 0,label,clean_text,orginal_text,zyada,ileav,sorta,ilol,im,imat,imf,...,want,got,time,like,know,day,come,get,call,length
0,0,go until jurong point crazy available only in ...,"Go until jurong point, crazy.. Available only ...",0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,103
1,0,ok lar joking wif u oni,Ok lar... Joking wif u oni...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,24
2,1,free entry in a wkly comp to win fa cup final ...,Free entry in 2 a wkly comp to win FA Cup fina...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,126
3,0,u dun say so early hor u c already then say,U dun say so early hor... U c already then say...,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,44
4,0,nah i don t think he goes to usf he lives arou...,"Nah I don't think he goes to usf, he lives aro...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,60


<h3>Bottom 5</h3>

In [63]:
# Last five rows of the combined frame.
result.tail(n=5)

Unnamed: 0,label,clean_text,orginal_text,zyada,ileav,sorta,ilol,im,imat,imf,...,want,got,time,like,know,day,come,get,call,length
5567,1,this is the nd time we have tried contact u u ...,This is the 2nd time we have tried 2 contact u...,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,1,129
5568,0,will b going to esplanade fr home,Will Ì_ b going to esplanade fr home?,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,34
5569,0,pity was in mood for that so any other suggest...,"Pity, * was in mood for that. So...any other s...",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,51
5570,0,the guy did some bitching but i acted like i d...,The guy did some bitching but I acted like i'd...,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,125
5571,0,rofl its true to its name,Rofl. Its true to its name,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,25
