In [1]:
# Toy corpus: four short, lower-case documents of whitespace-separated words.
corpus = [
    "python is awesome",
    "hello world",
    "we are learning python",
    "we will meet tomorrow"
]

In [2]:
# Collect each distinct word once, in the order it is first seen.
# dict.fromkeys keeps insertion order and de-duplicates in a single pass,
# instead of rescanning a list with `in` for every word.
vocab = list(dict.fromkeys(
    word for document in corpus for word in document.split()
))

In [3]:
vocab

['python',
 'is',
 'awesome',
 'hello',
 'world',
 'we',
 'are',
 'learning',
 'will',
 'meet',
 'tomorrow']

    word to vector: look up each word's index (word -> index)

    vector to word: look up the word stored at an index (index -> word)

    {'python': 0, 'hello':1}

    {0:'python', 1:'hello'}

In [4]:
# Distinct words in sorted order, so that every word gets a stable position.
# A set de-duplicates without rescanning a list for every word.
vocab = sorted({word for document in corpus for word in document.split()})
vocab

['are',
 'awesome',
 'hello',
 'is',
 'learning',
 'meet',
 'python',
 'tomorrow',
 'we',
 'will',
 'world']

In [5]:
# enumerate() pairs every word with its position; each pair prints as a tuple.
for indexed_word in enumerate(vocab):
    print(indexed_word)

(0, 'are')
(1, 'awesome')
(2, 'hello')
(3, 'is')
(4, 'learning')
(5, 'meet')
(6, 'python')
(7, 'tomorrow')
(8, 'we')
(9, 'will')
(10, 'world')


In [20]:
def vocab(corpus):
    """Build a word -> index vocabulary from a corpus.

    Parameters
    ----------
    corpus : iterable of str
        Documents; each one is tokenised with ``str.split()``, i.e. on
        runs of whitespace.

    Returns
    -------
    dict
        Maps every distinct word to its position in the sorted list of
        distinct words (Python string ordering, indices starting at 0).
        An empty corpus yields an empty dict.
    """
    # A set de-duplicates in constant time per word, where a list-based
    # `in` check would rescan the whole list for every word.
    unique_words = {word for document in corpus for word in document.split()}
    return {word: index for index, word in enumerate(sorted(unique_words))}

In [21]:
vocabulary = vocab(corpus)
vocabulary

{'are': 0,
 'awesome': 1,
 'hello': 2,
 'is': 3,
 'learning': 4,
 'meet': 5,
 'python': 6,
 'tomorrow': 7,
 'we': 8,
 'will': 9,
 'world': 10}

In [22]:
corpus

['python is awesome',
 'hello world',
 'we are learning python',
 'we will meet tomorrow']

In [23]:
def bag_of_word(corpus, vocabulary):
    """Turn each document into a list of word counts.

    Parameters
    ----------
    corpus : iterable of str
        Documents, tokenised with ``str.split()``.
    vocabulary : mapping of str to int
        Word -> position in the output vector.

    Returns
    -------
    list of list of int
        One vector per document, ``len(vocabulary)`` entries long; entry
        ``vocabulary[w]`` is how many times ``w`` occurs in the document.
        Words missing from ``vocabulary`` are ignored.
    """
    def encode(document):
        counts = [0] * len(vocabulary)
        for token in document.split():
            if token not in vocabulary:
                continue  # out-of-vocabulary word: no slot to count it in
            counts[vocabulary[token]] += 1
        return counts

    return [encode(document) for document in corpus]

In [24]:
bag_of_word(corpus, vocabulary)

[[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0]]

In [11]:
[0] * 10

[0, 0, 0, 0, 0, 0, 0, 0, 0, 0]

In [25]:
from sklearn.base import BaseEstimator, TransformerMixin

In [34]:
class Vectorization(BaseEstimator, TransformerMixin):
    """Bag-of-words vectoriser over whitespace-separated words.

    ``fit`` learns a word -> column-index vocabulary from a corpus;
    ``transform`` turns each document into a list of per-word counts over
    that vocabulary.

    Attributes
    ----------
    vocab_ : dict or None
        Word -> column index, assigned by ``fit``; ``None`` before fitting.
    vocab_size_ : int or None
        Number of entries in ``vocab_``; ``None`` before fitting.
    """

    def __init__(self):
        # Learned state; both stay None until fit() has been called.
        self.vocab_ = None
        self.vocab_size_ = None

    def fit(self, corpus, y=None):
        """Learn the vocabulary from ``corpus``.

        Parameters
        ----------
        corpus : iterable of str
            Documents, tokenised with ``str.split()``.
        y : ignored
            Accepted only so the signature follows the scikit-learn
            ``fit(X, y)`` convention.

        Returns
        -------
        Vectorization
            The fitted instance itself.
        """
        # A set de-duplicates without rescanning a list for every word.
        unique_words = {word for document in corpus for word in document.split()}
        self.vocab_ = {word: index for index, word in enumerate(sorted(unique_words))}
        self.vocab_size_ = len(self.vocab_)
        # Returning self allows fit(...).transform(...) chaining; the
        # inherited fit_transform calls transform on fit's return value.
        return self

    def transform(self, corpus):
        """Convert documents to count vectors over the fitted vocabulary.

        Parameters
        ----------
        corpus : iterable of str
            Documents, tokenised with ``str.split()``.

        Returns
        -------
        list of list of int
            One vector of length ``vocab_size_`` per document; words that
            are not in ``vocab_`` are ignored.
        """
        vectors = []
        for document in corpus:
            vector = [0] * self.vocab_size_
            for word in document.split():
                # Words not seen during fit have no column and are skipped.
                if word in self.vocab_:
                    vector[self.vocab_[word]] += 1
            vectors.append(vector)
        return vectors

In [35]:
vectors = Vectorization()

In [36]:
vectors.fit(corpus)

In [39]:
vectors.transform(corpus)

[[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0]]

In [40]:
# Same corpus with "awesome" repeated in the first document. The vectoriser
# is not re-fitted; the next cell only calls transform(), to show that the
# output holds occurrence counts rather than 0/1 presence flags.
corpus = [
    "python is awesome awesome awesome",
    "hello world",
    "we are learning python",
    "we will meet tomorrow"
]

In [41]:
vectors.transform(corpus)

[[0, 3, 0, 1, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0]]

In [42]:
vectors.vocab_

{'are': 0,
 'awesome': 1,
 'hello': 2,
 'is': 3,
 'learning': 4,
 'meet': 5,
 'python': 6,
 'tomorrow': 7,
 'we': 8,
 'will': 9,
 'world': 10}

In [43]:
# Adds a fifth document made only of words that were not in the corpus the
# vectoriser was fitted on; the next cell shows it becomes an all-zero vector.
corpus = [
    "python is awesome",
    "hello world",
    "we are learning python",
    "we will meet tomorrow",
    "I am going to jaipur"
]

In [44]:
vectors.transform(corpus)

[[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
 [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
 [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0],
 [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]]

In [45]:
from sklearn.feature_extraction.text import CountVectorizer

In [46]:
v = CountVectorizer()

In [47]:
v

In [48]:
corpus = [
    "python is awesome",
    "hello world",
    "we are learning python",
    "we will meet tomorrow"
]

In [49]:
v.fit(corpus)

In [51]:
# transform() returns a sparse matrix (see the repr below); the following
# cell calls .toarray() to view it as a dense array.
sp_mx = v.transform(corpus)
sp_mx

<4x11 sparse matrix of type '<class 'numpy.int64'>'
	with 13 stored elements in Compressed Sparse Row format>

In [52]:
sp_mx.toarray()

array([[0, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1],
       [1, 0, 0, 0, 1, 0, 1, 0, 1, 0, 0],
       [0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 0]], dtype=int64)

    Data Preprocessing: 

                1. Removing stop words
                2. Removing special characters
                3. Stemming
                4. TF-IDF weighting

In [53]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

In [57]:
# Read only the first two columns of the CSV and give them descriptive
# names in the same expression, without mutating the frame in place.
data = (
    pd.read_csv('spam.csv', encoding='latin', usecols=[0, 1])
    .rename(columns={'v1': 'label', 'v2': 'message'})
)

In [58]:
data

Unnamed: 0,label,message
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [59]:
X = data['message']

In [62]:
# Lower-case every message; this replaces the raw column that the previous
# cell assigned to X.
X = data['message'].str.lower()
X

0       go until jurong point, crazy.. available only ...
1                           ok lar... joking wif u oni...
2       free entry in 2 a wkly comp to win fa cup fina...
3       u dun say so early hor... u c already then say...
4       nah i don't think he goes to usf, he lives aro...
                              ...                        
5567    this is the 2nd time we have tried 2 contact u...
5568                will ì_ b going to esplanade fr home?
5569    pity, * was in mood for that. so...any other s...
5570    the guy did some bitching but i acted like i'd...
5571                           rofl. its true to its name
Name: message, Length: 5572, dtype: object

In [64]:
# Target values: the 'label' column of the frame.
y = data['label']
y

0        ham
1        ham
2       spam
3        ham
4        ham
        ... 
5567    spam
5568     ham
5569     ham
5570     ham
5571     ham
Name: label, Length: 5572, dtype: object