### Bag of words - One Hot Encoded

In [6]:
sentence="The Eiffel Tower was built from 1887 to 1889 by French engineer Gustave Eiffel, whose company specialized in building metal frameworks and structures."

In [1]:
import pandas as pd
messages=pd.read_csv('dataset/SMSSpamCollection',
                    sep='\t',names=["label","message"])

In [2]:
messages

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will ü b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [6]:
# Data cleaning and pre-processing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
ps=PorterStemmer()

In [8]:
corpus=[]
for i in range(0,len(messages)):
    # any character, apart from "a" to "z", is replaced with space
    review=re.sub('[^a-z]',' ', messages['message'][i]) 
    review=review.lower()
    review=review.split()
    review=[ps.stem(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [9]:
corpus

['jurong point crazi vailabl bugi n great world la e buffet ine got amor wat',
 'k lar oke wif u oni',
 'ree entri wkli comp win final tkt st ay ext receiv entri question std txt rate appli',
 'dun say earli hor c alreadi say',
 'ah think goe usf live around though',
 'ree sg ey darl week word back like fun still b ok x std chg send rcv',
 'ven brother like speak hey treat like aid patent',
 'per request ell ell ru innaminungint urungu ettam set callertun aller ress copi friend allertun',
 'valu network custom select receivea prize reward claim call laim code alid hour',
 'ad mobil month entitl pdate latest colour mobil camera ree obil pdate',
 'gonna home soon want talk stuff anymor tonight k cri enough today',
 'chanc win rom pound txt send ost p day day sand appli epli info',
 'ou week membership rize ackpot xt word www dbuk net',
 'search right word thank breather promis wont take help grant fulfil promis ou wonder bless time',
 '',
 'obil ovi lub use credit click link next txt mes

In [16]:
# create the bag of words
from sklearn.feature_extraction.text import CountVectorizer
cv=CountVectorizer(max_features=2500, binary=True) # Bag of words of max 2500
cv.fit_transform(corpus).toarray()

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [17]:
X=cv.fit_transform(corpus).toarray()

In [19]:
X.shape

(5572, 2500)

### Bag of words - One Hot Encoded

In [20]:
cv.vocabulary_

{'point': np.int64(1671),
 'crazi': np.int64(439),
 'vailabl': np.int64(2340),
 'bugi': np.int64(293),
 'great': np.int64(890),
 'world': np.int64(2447),
 'la': np.int64(1166),
 'ine': np.int64(1071),
 'got': np.int64(878),
 'wat': np.int64(2384),
 'lar': np.int64(1178),
 'oke': np.int64(1483),
 'wif': np.int64(2418),
 'oni': np.int64(1511),
 'ree': np.int64(1830),
 'entri': np.int64(625),
 'wkli': np.int64(2437),
 'comp': np.int64(403),
 'win': np.int64(2422),
 'final': np.int64(741),
 'tkt': np.int64(2212),
 'st': np.int64(2080),
 'ay': np.int64(195),
 'ext': np.int64(686),
 'receiv': np.int64(1819),
 'question': np.int64(1762),
 'std': np.int64(2088),
 'txt': np.int64(2261),
 'rate': np.int64(1788),
 'appli': np.int64(122),
 'dun': np.int64(549),
 'say': np.int64(1939),
 'earli': np.int64(556),
 'alreadi': np.int64(78),
 'ah': np.int64(42),
 'think': np.int64(2190),
 'goe': np.int64(867),
 'usf': np.int64(2325),
 'live': np.int64(1229),
 'around': np.int64(142),
 'though': np.int64(

In [22]:
# create the Bag of words with N-Grams
from sklearn.feature_extraction.text import CountVectorizer

# ngram_range=(1,1) --> unigram
cv=CountVectorizer(max_features=100, binary=True, ngram_range=(1,1))
X=cv.fit_transform(corpus).toarray()

In [23]:
cv.vocabulary_

{'great': np.int64(31),
 'got': np.int64(30),
 'wat': np.int64(94),
 'txt': np.int64(88),
 'say': np.int64(73),
 'alreadi': np.int64(0),
 'think': np.int64(82),
 'ey': np.int64(21),
 'week': np.int64(96),
 'back': np.int64(4),
 'like': np.int64(42),
 'still': np.int64(76),
 'ok': np.int64(59),
 'send': np.int64(75),
 'ell': np.int64(16),
 'friend': np.int64(25),
 'prize': np.int64(69),
 'claim': np.int64(7),
 'call': np.int64(5),
 'mobil': np.int64(51),
 'home': np.int64(35),
 'want': np.int64(93),
 'today': np.int64(84),
 'day': np.int64(13),
 'ou': np.int64(64),
 'xt': np.int64(99),
 'www': np.int64(98),
 'right': np.int64(71),
 'take': np.int64(78),
 'time': np.int64(83),
 'messag': np.int64(48),
 'com': np.int64(9),
 'es': np.int64(18),
 'make': np.int64(46),
 'way': np.int64(95),
 'feel': np.int64(22),
 'hat': np.int64(33),
 'dont': np.int64(14),
 'miss': np.int64(50),
 'ur': np.int64(89),
 'ox': np.int64(66),
 'go': np.int64(28),
 'tri': np.int64(86),
 'hen': np.int64(34),
 'da':

In [25]:
# ngram_range=(1,2) --> bigram, including unigram
cv=CountVectorizer(max_features=500, binary=True, ngram_range=(1,2))
X=cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'great': np.int64(61),
 'got': np.int64(60),
 'wat': np.int64(185),
 'ree': np.int64(140),
 'win': np.int64(191),
 'st': np.int64(156),
 'ay': np.int64(12),
 'ext': np.int64(46),
 'txt': np.int64(174),
 'say': np.int64(144),
 'alreadi': np.int64(3),
 'ah': np.int64(1),
 'think': np.int64(166),
 'around': np.int64(8),
 'ey': np.int64(47),
 'week': np.int64(188),
 'word': np.int64(193),
 'back': np.int64(14),
 'like': np.int64(86),
 'still': np.int64(158),
 'ok': np.int64(115),
 'send': np.int64(146),
 'per': np.int64(127),
 'ell': np.int64(39),
 'friend': np.int64(54),
 'custom': np.int64(31),
 'prize': np.int64(137),
 'claim': np.int64(23),
 'call': np.int64(16),
 'hour': np.int64(72),
 'ad': np.int64(0),
 'mobil': np.int64(102),
 'month': np.int64(104),
 'obil': np.int64(114),
 'gonna': np.int64(58),
 'home': np.int64(70),
 'soon': np.int64(154),
 'want': np.int64(184),
 'talk': np.int64(162),
 'tonight': np.int64(171),
 'today': np.int64(168),
 'day': np.int64(33),
 'epli': np.int64

In [26]:
# ngram_range=(1,2) --> bigram ignoring unigram
cv=CountVectorizer(max_features=500, binary=True, ngram_range=(2,2))
X=cv.fit_transform(corpus).toarray()
cv.vocabulary_

{'ree entri': np.int64(365),
 'entri wkli': np.int64(116),
 'std txt': np.int64(400),
 'txt rate': np.int64(431),
 'rate appli': np.int64(358),
 'ree sg': np.int64(367),
 'per request': np.int64(338),
 'set callertun': np.int64(388),
 'callertun aller': np.int64(54),
 'aller ress': np.int64(7),
 'ress copi': np.int64(371),
 'copi friend': np.int64(87),
 'friend allertun': np.int64(138),
 'claim call': np.int64(68),
 'code alid': np.int64(72),
 'ad mobil': np.int64(1),
 'latest colour': np.int64(210),
 'ree obil': np.int64(366),
 'obil pdate': np.int64(279),
 'chanc win': np.int64(66),
 'xt word': np.int64(499),
 'word www': np.int64(492),
 'txt ox': np.int64(430),
 'let know': np.int64(215),
 'feel like': np.int64(128),
 'mobil charg': np.int64(253),
 'go home': np.int64(151),
 'nyth lor': np.int64(275),
 'call repli': np.int64(50),
 'leas call': np.int64(212),
 'deliveri tomorrow': np.int64(96),
 'lt gt': np.int64(231),
 'miss call': np.int64(251),
 'want go': np.int64(470),
 'like lt

In [27]:
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

### Bag of words - One Hot Encoded

In [29]:
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
wordNetLemmatizer=WordNetLemmatizer()

In [30]:
corpus=[]
for i in range(0,len(messages)):
    # substitute any non text char (special characters)
    review=re.sub('[^a-zA-z]',' ',messages['message'][i])
    review=review.lower()
    review=review.split()
    review=[wordNetLemmatizer.lemmatize(word) for word in review if not word in stopwords.words('english')]
    review=' '.join(review)
    corpus.append(review)

In [31]:
corpus

['go jurong point crazy available bugis n great world la e buffet cine got amore wat',
 'ok lar joking wif u oni',
 'free entry wkly comp win fa cup final tkts st may text fa receive entry question std txt rate c apply',
 'u dun say early hor u c already say',
 'nah think go usf life around though',
 'freemsg hey darling week word back like fun still tb ok xxx std chgs send rcv',
 'even brother like speak treat like aid patent',
 'per request melle melle oru minnaminunginte nurungu vettam set callertune caller press copy friend callertune',
 'winner valued network customer selected receivea prize reward claim call claim code kl valid hour',
 'mobile month u r entitled update latest colour mobile camera free call mobile update co free',
 'gonna home soon want talk stuff anymore tonight k cried enough today',
 'six chance win cash pound txt csh send cost p day day tsandcs apply reply hl info',
 'urgent week free membership prize jackpot txt word claim c www dbuk net lccltd pobox ldnw rw'

In [32]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf=TfidfVectorizer(max_features=100)

In [34]:
X=tfidf.fit_transform(corpus).toarray()

In [36]:
import numpy as np
np.set_printoptions(edgeitems=30,linewidth=100000,
                    formatter=dict(float=lambda x:"%.3g" % x))

<Token var=<ContextVar name='format_options' default={'edgeitems': 3, 'threshold': 1000, 'floatmode': 'maxprec', 'precision': 8, 'suppress': False, 'linewidth': 75, 'nanstr': 'nan', 'infstr': 'inf', 'sign': '-', 'formatter': None, 'legacy': 9223372036854775807, 'override_repr': None} at 0x7f7990b6d850> at 0x7f7945dd9e00>

In [37]:
X

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.434, 0, 0, 0.461, 0.544, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.55, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.456, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0.473, 0, 0, 0, 0, 0, 0, 0, 0.492, 0, 0, 0, 0, 0, 0, 0, 0.571, 0, 0, 0, 0, 0, 0],
       [0.465, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.486, 0, 0, 0, 0, 0, 0, 0, ..., 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.574, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 