### Bag of Words

In [11]:
import pandas as pd


In [12]:
# Load the SMS dataset, discard the three 'Unnamed' columns, and give the
# two remaining columns (v1, v2) descriptive names.
messages = (
    pd.read_csv('datasets/spam.csv', encoding='latin-1')
    .drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1)
    .rename({'v1': 'label', 'v2': 'message'}, axis='columns')
)

messages.head()


Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [15]:
### Data cleaning and preprocessing
import regex as re      # for regular expression operations
import nltk            # natural language toolkit
from nltk.corpus import stopwords   # stopwords corpus
from nltk.stem import PorterStemmer # stemming with Porter Stemmer
from nltk.stem import WordNetLemmatizer # lemmatization with WordNet Lemmatizer

In [17]:
corpus = []
ps = PorterStemmer()  # not used in this cell; retained in case another cell references `ps`
lemmatizer = WordNetLemmatizer()

# Build the stopword set once. Previously the list and set were rebuilt for
# every single word of every message, inside the innermost loop.
stop_words = set(stopwords.words('english'))

# Iterate over the column values directly rather than `messages['message'][i]`,
# so the loop does not depend on the DataFrame having a default 0..n-1 index.
for message in messages['message']:
    # Replace every character that is not an ASCII letter with a space
    review = re.sub(r'[^a-zA-Z]', ' ', str(message))
    # Convert to lowercase
    review = review.lower()

    # Split on whitespace, drop stopwords, and lemmatize the remaining words
    # (this cell lemmatizes; it does not stem)
    review = review.split()
    review = [lemmatizer.lemmatize(word) for word in review if word not in stop_words]
    review = ' '.join(review)

    corpus.append(review)

In [20]:
## Creating the Bag of Words model
from sklearn.feature_extraction.text import CountVectorizer

# Keep at most 2500 vocabulary terms; binary=True records presence/absence
# of a term in a message rather than its count.
cv = CountVectorizer(max_features=2500, binary=True)
X = cv.fit_transform(corpus).toarray()

# Encode the target explicitly as integers: spam=1, ham=0.
# The previous pd.get_dummies(...).iloc[:, 1] relied on the position of the
# 'spam' column and, on pandas >= 2.0, produces booleans instead of 0/1.
y = (messages['label'] == 'spam').astype(int).to_numpy()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5572, 2500))

## N-GRAMs

In [30]:
## Creating the Bag of Words model with n-grams
# Keep at most 20 features drawn from bigrams and trigrams (ngram_range=(2, 3));
# binary=True records presence/absence rather than counts.
cv = CountVectorizer(max_features=20, binary=True, ngram_range=(2, 3))
X = cv.fit_transform(corpus).toarray()

# Encode the target explicitly as integers: spam=1, ham=0.
# The previous pd.get_dummies(...).iloc[:, 1] relied on the position of the
# 'spam' column and, on pandas >= 2.0, produces booleans instead of 0/1.
y = (messages['label'] == 'spam').astype(int).to_numpy()
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5572, 20))

In [31]:
# Mapping from each retained n-gram to its column index in X
cv.vocabulary_

{'free call': np.int64(4),
 'let know': np.int64(7),
 'please call': np.int64(13),
 'lt gt': np.int64(10),
 'sorry call': np.int64(17),
 'call later': np.int64(1),
 'sorry call later': np.int64(18),
 'customer service': np.int64(3),
 'prize guaranteed': np.int64(16),
 'guaranteed call': np.int64(6),
 'call landline': np.int64(0),
 'new year': np.int64(11),
 'co uk': np.int64(2),
 'lt decimal': np.int64(8),
 'lt decimal gt': np.int64(9),
 'good morning': np.int64(5),
 'po box': np.int64(15),
 'pls send': np.int64(14),
 'take care': np.int64(19),
 'ok lor': np.int64(12)}

In [32]:
# Display the n-gram document-term matrix again (one row per message, one column per n-gram)
X

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]], shape=(5572, 20))

### Create TF-IDF using TfidfVectorizer

In [34]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [35]:
# TF-IDF weighted document-term matrix over a vocabulary capped at 100 terms
tfidf = TfidfVectorizer(max_features=100)
X = tfidf.fit_transform(corpus)
X = X.toarray()  # densify the sparse result

In [36]:
import numpy as np

# Array display settings: three decimals for floats, up to 30 items shown at
# each edge before eliding with '...', and a very wide line so rows do not wrap.
np.set_printoptions(
    edgeitems=30,
    linewidth=100000,
    formatter={"float": lambda value: "%.3f" % value},
)

In [37]:
# Display the TF-IDF matrix (floats render with three decimals via the print options set earlier)
X

array([[0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.435, 0.000, 0.000, 0.461, 0.544, 0.000, 0.000, 0.000, 0.000, ..., 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.550, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000],
       [0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, ..., 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000],
       [0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.000, 0.456

##### With N-Grams

In [38]:
# TF-IDF over bigrams only (ngram_range=(2, 2)), vocabulary capped at 100 features
tfidf = TfidfVectorizer(ngram_range=(2, 2), max_features=100)
X = tfidf.fit_transform(corpus)
X = X.toarray()  # densify the sparse result

In [39]:
# Mapping from each retained bigram to its column index in X
tfidf.vocabulary_

{'free entry': np.int64(31),
 'claim call': np.int64(15),
 'call claim': np.int64(3),
 'free call': np.int64(30),
 'chance win': np.int64(14),
 'txt word': np.int64(88),
 'let know': np.int64(51),
 'please call': np.int64(65),
 'lt gt': np.int64(55),
 'want go': np.int64(97),
 'like lt': np.int64(52),
 'sorry call': np.int64(79),
 'call later': np.int64(8),
 'ur awarded': np.int64(90),
 'hi hi': np.int64(44),
 'call customer': np.int64(4),
 'customer service': np.int64(22),
 'cash prize': np.int64(13),
 'trying contact': np.int64(85),
 'draw show': np.int64(27),
 'show prize': np.int64(77),
 'prize guaranteed': np.int64(71),
 'guaranteed call': np.int64(40),
 'valid hr': np.int64(95),
 'selected receive': np.int64(74),
 'private account': np.int64(69),
 'account statement': np.int64(0),
 'statement show': np.int64(80),
 'call identifier': np.int64(5),
 'identifier code': np.int64(47),
 'code expires': np.int64(19),
 'urgent mobile': np.int64(94),
 'call landline': np.int64(7),
 'wat ti

### Word2Vec with Gensim

In [None]:
import gensim
from gensim.models import Word2Vec, KeyedVectors
import gensim.downloader as api

# Load the pretrained Google News word2vec vectors through gensim's downloader.
# NOTE(review): the first call presumably downloads a large model file and caches it
# locally — confirm size and cache location before running on a constrained machine.
wv = api.load('word2vec-google-news-300')  # load the google word2vec model



NameError: name 'Wv' is not defined

In [42]:
# Index the loaded vectors by word to get that word's embedding (a NumPy array)
vec_king = wv['king']  # get the vector for the word 'king'
vec_king

array([0.126, 0.030, 0.009, 0.140, -0.026, -0.036, 0.112, -0.198, 0.051, 0.363, -0.242, -0.303, -0.178, -0.025, -0.168, -0.170, 0.035, 0.005, 0.046, 0.129, 0.137, 0.113, 0.060, 0.137, 0.101, -0.177, -0.252, 0.060, 0.342, -0.031, 0.104, 0.062, 0.125, 0.400, -0.322, 0.084, 0.039, 0.006, 0.070, 0.173, 0.139, -0.231, 0.283, 0.143, 0.342, -0.024, -0.110, 0.033, -0.055, 0.015, -0.162, 0.158, -0.260, 0.020, -0.163, 0.001, -0.145, -0.057, 0.043, -0.025, 0.186, 0.447, 0.010, 0.132, 0.099, -0.186, -0.100, -0.134, -0.125, 0.283, 0.123, 0.053, -0.178, 0.086, -0.022, 0.021, -0.140, 0.025, 0.139, -0.105, 0.139, 0.089, -0.075, -0.021, 0.173, 0.046, -0.266, 0.009, 0.149, 0.038, 0.238, -0.125, -0.218, -0.182, 0.030, 0.057, -0.029, 0.012, 0.097, -0.231, 0.058, 0.067, 0.071, -0.309, -0.215, 0.146, -0.428, -0.009, 0.154, -0.077, 0.289, 0.277, -0.000, -0.137, 0.324, -0.246, -0.003, -0.212, 0.125, 0.270, 0.204, 0.083, -0.201, -0.160, -0.038, -0.120, 0.115, -0.041, -0.040, -0.090, 0.006, 0.203, 0.187, 0.273,

In [43]:
# Dimensionality of a single word vector
vec_king.shape

(300,)

In [44]:
# Nearest neighbours of "cricket" in the embedding space, as (word, similarity score) pairs
wv.most_similar("cricket")

[('cricketing', 0.8372225761413574),
 ('cricketers', 0.8165745735168457),
 ('Test_cricket', 0.8094819784164429),
 ('Twenty##_cricket', 0.8068488240242004),
 ('Twenty##', 0.762426495552063),
 ('Cricket', 0.7541396617889404),
 ('cricketer', 0.7372579574584961),
 ('twenty##', 0.7316356301307678),
 ('T##_cricket', 0.7304614186286926),
 ('West_Indies_cricket', 0.698798656463623)]

In [45]:
# Nearest neighbours of "happy", as (word, similarity score) pairs.
# Note that the result can include opposites (e.g. 'disappointed'): closeness here
# presumably reflects words used in similar contexts, not only synonyms.
wv.most_similar("happy")

[('glad', 0.7408890724182129),
 ('pleased', 0.6632170677185059),
 ('ecstatic', 0.6626912355422974),
 ('overjoyed', 0.6599286794662476),
 ('thrilled', 0.6514049172401428),
 ('satisfied', 0.6437948942184448),
 ('proud', 0.636042058467865),
 ('delighted', 0.627237856388092),
 ('disappointed', 0.6269949674606323),
 ('excited', 0.6247665882110596)]

In [46]:
# Similarity score between the vectors of two words (a single float)
wv.similarity("hockey", "sports")

np.float32(0.53541523)

In [47]:
# Classic analogy arithmetic: king - man + woman, computed element-wise on the word vectors
vec = wv['king'] - wv['man'] + wv['woman']
vec

array([0.043, -0.178, -0.129, 0.115, 0.003, -0.102, 0.196, -0.180, 0.020, 0.410, -0.368, -0.396, -0.157, 0.001, -0.093, -0.116, -0.055, -0.108, 0.079, 0.199, 0.239, 0.063, -0.022, 0.000, 0.047, -0.218, -0.345, 0.064, 0.316, -0.198, 0.086, -0.081, -0.037, 0.316, -0.342, -0.047, 0.098, 0.084, -0.097, 0.052, -0.050, -0.221, 0.229, 0.126, 0.249, 0.021, -0.110, 0.058, -0.034, 0.130, 0.024, 0.035, -0.260, 0.242, -0.322, 0.015, -0.159, -0.084, 0.165, 0.002, 0.310, 0.316, 0.007, 0.241, 0.049, -0.099, 0.029, 0.149, -0.048, 0.236, 0.221, 0.125, -0.139, 0.154, 0.072, 0.130, -0.106, 0.060, 0.315, 0.110, 0.085, 0.077, -0.022, 0.061, -0.190, 0.208, -0.163, 0.114, 0.201, 0.061, 0.128, -0.311, -0.280, -0.156, 0.042, 0.099, 0.170, -0.035, 0.208, -0.099, 0.004, -0.073, -0.042, -0.409, -0.276, 0.164, -0.558, -0.202, 0.212, -0.098, 0.231, 0.276, 0.168, -0.045, 0.172, -0.377, -0.004, -0.302, 0.174, 0.330, 0.201, 0.118, -0.138, -0.107, 0.086, 0.106, 0.145, 0.003, 0.018, 0.037, 0.007, 0.133, 0.096, 0.336, 0.

In [49]:
# Words whose vectors are closest to the analogy vector.
# NOTE(review): querying with a raw vector does not filter out the input words, which is
# presumably why 'king' itself ranks first; most_similar(positive=['king', 'woman'],
# negative=['man']) is the gensim form that excludes them — confirm against the gensim docs.
wv.most_similar(vec)

[('king', 0.8449392318725586),
 ('queen', 0.7300516366958618),
 ('monarch', 0.645466148853302),
 ('princess', 0.6156251430511475),
 ('crown_prince', 0.5818676948547363),
 ('prince', 0.5777117609977722),
 ('kings', 0.5613663792610168),
 ('sultan', 0.5376776456832886),
 ('Queen_Consort', 0.5344247221946716),
 ('queens', 0.5289887189865112)]