In [1]:
import pandas as pd
import numpy as np

In [3]:
# Read the SMS corpus (decoded as latin-1) and keep every column up to and
# including "Text".
spam_data = (
    pd.read_csv(r"spam.csv", encoding="latin-1")
    .loc[:, :"Text"]
)

In [4]:
# Peek at the first rows to confirm which columns were kept.
spam_data.head()

Unnamed: 0,Target,Text
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [5]:
# Plain Python lists holding the text and label of the first five rows.
spam_data_small_text = spam_data["Text"].head().tolist()
spam_data_small_target = spam_data["Target"].head().tolist()

In [6]:
# Show the five sample messages in full (the frame preview truncates them).
spam_data_small_text

['Go until jurong point, crazy.. Available only in bugis n great world la e buffet... Cine there got amore wat...',
 'Ok lar... Joking wif u oni...',
 "Free entry in 2 a wkly comp to win FA Cup final tkts 21st May 2005. Text FA to 87121 to receive entry question(std txt rate)T&C's apply 08452810075over18's",
 'U dun say so early hor... U c already then say...',
 "Nah I don't think he goes to usf, he lives around here though"]

## Bag of Words Approach

In [7]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

In [8]:
# Both vectorizers share the same settings: English stop words are removed and
# a term must appear in at least 0.5% of the messages (min_df=0.005).
# NOTE(review): they are fitted on the full corpus, before the train/test split
# further down, so the vocabulary and IDF weights also see the held-out rows.

# The count vectorizer is only fitted; its transformed matrix was never used
# because `output` was immediately overwritten by the TF-IDF matrix.
cv = CountVectorizer(stop_words='english', min_df=0.005)
cv.fit(spam_data["Text"])

# `output` is the sparse TF-IDF document-term matrix used as model features.
tfidf = TfidfVectorizer(stop_words='english', min_df=0.005)
output = tfidf.fit_transform(spam_data["Text"])

In [9]:
# Densify the TF-IDF matrix so it can back a DataFrame.
output = output.toarray()

# Label the columns with the vocabulary of the vectorizer that produced
# `output` (tfidf). get_feature_names() was removed in scikit-learn 1.2, so
# prefer get_feature_names_out() and fall back only on releases that lack it.
if hasattr(tfidf, "get_feature_names_out"):
    columns = tfidf.get_feature_names_out()
else:
    columns = tfidf.get_feature_names()

# Feature frame (one column per vocabulary term) and the label series.
X = pd.DataFrame(output, columns=columns)
y = spam_data["Target"]

### Create New Features
- Number of digit characters in each message

In [10]:
def fn_count_digits(msg):
    """Return how many characters of ``msg`` are digits.

    Example: "This is the 2nd time we have tried 2 contact u" -> 2
    """
    digit_total = 0
    for symbol in msg:
        if symbol.isdigit():
            digit_total += 1
    return digit_total

# One digit count per message, computed with fn_count_digits.
count_digits = spam_data["Text"].apply(fn_count_digits)

In [11]:
# Add the digit counts as an extra feature column next to the vectorizer features.
X["count_digits"] = count_digits

### Train Test Split

In [12]:
from sklearn.model_selection import train_test_split

In [13]:
# Hold out 33% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.33,
    random_state=42,
)

### Model Building

In [14]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, confusion_matrix

In [15]:
# Seed the forest so repeated runs grow the same trees and report the same
# metrics; 42 matches the seed already used for the train/test split.
clf = RandomForestClassifier(random_state=42)
clf.fit(X_train, y_train)

RandomForestClassifier()

In [16]:
# Predict a label for every held-out row.
preds = clf.predict(X_test)

In [17]:
# Per-class precision, recall and F1 on the held-out rows.
print(classification_report(y_test, preds))

              precision    recall  f1-score   support

         ham       0.99      1.00      0.99      1587
        spam       0.97      0.92      0.95       252

    accuracy                           0.99      1839
   macro avg       0.98      0.96      0.97      1839
weighted avg       0.99      0.99      0.99      1839



### Testing on sample examples

In [18]:
# Hand-written messages used to sanity-check the trained model.
test_samples = ["WINNER! Congratulations on winning a free gift. Send message to 58585 to redeem your gift",
                "Hey gimme a call me on 78996797 when you are free. Need to talk something important"]

1. Transform the samples with the fitted TF-IDF vectorizer and convert the result to an array
2. Use the model and generate predictions

In [19]:
# Echo the sample messages before they are vectorized.
test_samples

['WINNER! Congratulations on winning a free gift. Send message to 58585 to redeem your gift',
 'Hey gimme a call me on 78996797 when you are free. Need to talk something important']

In [20]:
# Build the sample features the same way as the training frame:
# TF-IDF columns first, then the digit count.
test_samples_output = tfidf.transform(test_samples)

# Take the column labels from the vectorizer that did the transform (tfidf).
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on releases that lack it.
if hasattr(tfidf, "get_feature_names_out"):
    feature_names = tfidf.get_feature_names_out()
else:
    feature_names = tfidf.get_feature_names()

test_samples_output = pd.DataFrame(test_samples_output.toarray(), columns=feature_names)
test_samples_output['count_digits'] = [fn_count_digits(msg) for msg in test_samples]

In [21]:
# Shape check: one row per sample; the column count should match the training frame X.
test_samples_output.shape

(2, 288)

In [22]:
# Hard labels plus class probabilities for each sample; the predict_proba
# columns follow the order of clf.classes_.
test_samples_preds = clf.predict(test_samples_output)
test_samples_probs = clf.predict_proba(test_samples_output)

In [23]:
# Show the predicted labels, then the per-class probabilities behind them.
print(test_samples_preds)
print(test_samples_probs)

['spam' 'ham']
[[0.28 0.72]
 [0.5  0.5 ]]


## Word Embeddings

https://github.com/RaRe-Technologies/gensim-data

In [24]:
import gensim
import gensim.downloader
from sklearn.metrics.pairwise import cosine_similarity

In [25]:
# List the names of the pretrained models available through the gensim downloader.
gensim.downloader.info()["models"].keys()

dict_keys(['fasttext-wiki-news-subwords-300', 'conceptnet-numberbatch-17-06-300', 'word2vec-ruscorpora-300', 'word2vec-google-news-300', 'glove-wiki-gigaword-50', 'glove-wiki-gigaword-100', 'glove-wiki-gigaword-200', 'glove-wiki-gigaword-300', 'glove-twitter-25', 'glove-twitter-50', 'glove-twitter-100', 'glove-twitter-200', '__testing_word2vec-matrix-synopsis'])

In [26]:
# Load the pretrained Google News word2vec vectors through the gensim downloader.
# The progress output of this cell reports a 1662.8MB download, so expect it to
# be slow whenever the model has to be fetched.
word2vec_300 = gensim.downloader.load("word2vec-google-news-300")
# fasttext_300 = gensim.downloader.load("fasttext-wiki-news-subwords-300")

[--------------------------------------------------] 1.4% 23.4/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=-------------------------------------------------] 3.3% 55.0/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[==------------------------------------------------] 5.2% 86.3/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[===-----------------------------------------------] 7.7% 128.7/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



[=====---------------------------------------------] 10.7% 177.5/1662.8MB downloaded

IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)





IOPub message rate exceeded.
The notebook server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--NotebookApp.iopub_msg_rate_limit`.

Current values:
NotebookApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
NotebookApp.rate_limit_window=3.0 (secs)



In [27]:
# Look up the embedding of each word; the keys are used exactly as written.
vec1, vec2, vec3 = (
    word2vec_300.get_vector(word)
    for word in ("Superman", "Spiderman", "Superhero")
)

In [28]:
# "Superman" vs "Spiderman". cosine_similarity works on 2-D inputs, so each
# vector is reshaped into a single row.
cosine_similarity(vec1.reshape(1,-1), vec2.reshape(1,-1))

array([[0.5477701]], dtype=float32)

In [29]:
# "Superman" vs "Superhero", compared the same way.
cosine_similarity(vec1.reshape(1,-1), vec3.reshape(1,-1))

array([[0.5475111]], dtype=float32)

In [30]:
# Nearest neighbours of the capitalised key "Capital".
word2vec_300.most_similar("Capital")

[('Captial', 0.6502077579498291),
 ('Capital_Partners', 0.6405684351921082),
 ('Financial_Cp_COF', 0.581904411315918),
 ('Ltd_ACAS', 0.562832772731781),
 ('Edinburgh_Inspiring', 0.5587600469589233),
 ('LLC_Currency_Currents', 0.5564382076263428),
 ('www.internetcapital.com', 0.5522413849830627),
 ('Mitchelle_Stephenson', 0.5467348098754883),
 ('Development_Fund_UNCDF', 0.5438245534896851),
 ('LONG_BEACH_Mffais.com_Munder', 0.5372943878173828)]

In [31]:
# Lowercase query: keys are case-sensitive, so the neighbours differ from those of "Capital".
word2vec_300.most_similar("capital")

[('captial', 0.6443068981170654),
 ('worth_##mln_rub', 0.5211092829704285),
 ('worth_#.###bn_rub', 0.5162901282310486),
 ('worth_##.###bn_rub', 0.5028226971626282),
 ('Lima_Peruvians', 0.49432554841041565),
 ('thecapital', 0.4910687506198883),
 ('Bishkek_Otunbayeva', 0.4839523136615753),
 ('Andreessen_Horowitz_venture', 0.47973471879959106),
 ('EQUITY_Issued', 0.4725496470928192),
 ('liquidity', 0.4684329628944397)]

In [32]:
# Nearest neighbours of "King".
word2vec_300.most_similar("King")

[('Jackson', 0.5326348543167114),
 ('Prince', 0.5306329727172852),
 ('Tupou_V.', 0.5292826294898987),
 ('KIng', 0.5227501392364502),
 ('e_mail_robert.king_@', 0.5173623561859131),
 ('king', 0.5158917903900146),
 ('Queen', 0.5157250165939331),
 ('Geoffrey_Rush_Exit', 0.49920955300331116),
 ('prosecutor_Dan_Satterberg', 0.49850785732269287),
 ('NECN_Alison', 0.49128594994544983)]

In [33]:
# Multi-word entities are looked up as underscore-joined keys.
word2vec_300.most_similar("Narendra_Modi")

[('Modi', 0.8249751329421997),
 ('Minister_Narendra_Modi', 0.8121265172958374),
 ('Nitish_Kumar', 0.7806440591812134),
 ('minister_Narendra_Modi', 0.7666792869567871),
 ('Advani', 0.756213366985321),
 ('BJP', 0.7535826563835144),
 ('Lalu_Yadav', 0.7422481179237366),
 ('Mayawati', 0.7403608560562134),
 ('Mulayam_Singh', 0.7345786690711975),
 ('Nitish', 0.7312207818031311)]

In [34]:
# Two differently worded sentences to compare at sentence level.
# NOTE(review): "ThePresident" looks like a missing space ("The President").
# As a single token it may not be in the model vocabulary - confirm.
sent1 = "ThePresident is speaking to the media"
sent2 = "Barack Obama is addressing the press"

# Sentence embedding = mean of the word vectors of the whitespace-split tokens.
# These assignments overwrite the vec1/vec2 word vectors defined earlier.
vec1 = word2vec_300.get_mean_vector(sent1.split())
vec2 = word2vec_300.get_mean_vector(sent2.split())

In [35]:
# Similarity between the two mean sentence vectors.
cosine_similarity(vec1.reshape(1,-1), vec2.reshape(1,-1))

array([[0.70237154]], dtype=float32)

In [36]:
# Record the gensim version this notebook was run with.
gensim.__version__

'4.3.1'