In [1]:
corpus = [
    "hello python is awesome",
    "python is modern language",
    "hello how are you",
    "i am awesome",
    "this is best time",
]

In [2]:
def build_voca(corpus):
    vocab = []
    for document in corpus:
        for word in document.split():
            if word.lower() not in vocab:
                vocab.append(word)
    vocab.sort()
    w2v = { word:vector for vector,word in enumerate(vocab)}
    v2w = { vector:word for vector, word in enumerate(vocab)}
    return w2v, v2w

In [3]:
build_voca(corpus)

({'am': 0,
  'are': 1,
  'awesome': 2,
  'best': 3,
  'hello': 4,
  'how': 5,
  'i': 6,
  'is': 7,
  'language': 8,
  'modern': 9,
  'python': 10,
  'this': 11,
  'time': 12,
  'you': 13},
 {0: 'am',
  1: 'are',
  2: 'awesome',
  3: 'best',
  4: 'hello',
  5: 'how',
  6: 'i',
  7: 'is',
  8: 'language',
  9: 'modern',
  10: 'python',
  11: 'this',
  12: 'time',
  13: 'you'})

In [4]:
from sklearn.base import BaseEstimator,TransformerMixin

In [11]:
class Vectorizer(BaseEstimator,TransformerMixin):
    def __init__(self):
        self.vocab_ = []
        self.vocab_size = 0
    def fit(self, corpus):
        vocab = []
        for document in corpus:
            for word in document.split():
                if word.lower() not in vocab:
                    vocab.append(word)
        vocab.sort()
        self.vocab_size = len(vocab)
        self.vocab_ = {w:v for v ,w in enumerate(vocab)}
    def transform(self, corpus):
        vectors = []
        for document in corpus:
            vector = [ 0 ] * self.vocab_size
            # [ 0, 0, 0, 0, 0, 0, 0, 0 ]
            for word in document.split():
                word = word.strip().lower()
                if word in self.vocab_:
                    i = self.vocab_[word]
                    vector[i] += 1
            vectors.append(vector)
        return vectors

In [12]:
vector = Vectorizer()

In [13]:
vector.fit(corpus)

In [14]:
vc = vector.transform(corpus)

In [15]:
vc

[[0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0],
 [0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0],
 [0, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 0, 1],
 [1, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
 [0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0]]

In [16]:
from sklearn.feature_extraction.text import CountVectorizer

In [19]:
model = CountVectorizer()
model.fit(corpus)
tr = model.transform(corpus).toarray()

print(*tr ,sep='\n')
print(*corpus, sep='\n') # m x n 

[0 0 1 0 1 0 1 0 0 1 0 0 0]
[0 0 0 0 0 0 1 1 1 1 0 0 0]
[0 1 0 0 1 1 0 0 0 0 0 0 1]
[1 0 1 0 0 0 0 0 0 0 0 0 0]
[0 0 0 1 0 0 1 0 0 0 1 1 0]
hello python is awesome
python is modern language
hello how are you
i am awesome
this is best time


In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [23]:
!dir

 Volume in drive D has no label.
 Volume Serial Number is AAE8-B91D

 Directory of D:\Python_grass_solution\MachineLearning\Notebooks\my

07-10-2021  21:31    <DIR>          .
07-10-2021  21:31    <DIR>          ..
07-10-2021  18:37    <DIR>          .ipynb_checkpoints
03-10-2021  13:34           122,744 classification metrices.ipynb
08-09-2021  01:12           173,568 crossvalidation hyper tunining.ipynb
13-09-2021  15:10           920,800 FuelConsumptionCo2.ipynb
05-09-2021  15:05           281,304 gradient descent.ipynb
12-09-2021  21:10           348,208 lasso.ipynb
22-08-2021  21:08           155,235 linear regression.ipynb
04-10-2021  19:46            61,045 MNIST.ipynb
03-09-2021  00:03            40,470 pipeline2.ipynb
31-08-2021  01:26            41,517 pipline.ipynb
05-09-2021  21:21           636,247 polynomial regression.ipynb
30-08-2021  23:52           109,520 scaler.ipynb
07-10-2021  21:31             8,048 spam Ham.ipynb
18-09-2021  19:28           503,663 spam.csv
30-0

In [25]:
df = pd.read_csv('spam.csv',encoding='latin')

In [26]:
df

Unnamed: 0,v1,v2,Unnamed: 2,Unnamed: 3,Unnamed: 4
0,ham,"Go until jurong point, crazy.. Available only ...",,,
1,ham,Ok lar... Joking wif u oni...,,,
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...,,,
3,ham,U dun say so early hor... U c already then say...,,,
4,ham,"Nah I don't think he goes to usf, he lives aro...",,,
...,...,...,...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...,,,
5568,ham,Will Ì_ b going to esplanade fr home?,,,
5569,ham,"Pity, * was in mood for that. So...any other s...",,,
5570,ham,The guy did some bitching but I acted like i'd...,,,


In [27]:
df.isna().sum() / len(df)*100

v1             0.000000
v2             0.000000
Unnamed: 2    99.102656
Unnamed: 3    99.784637
Unnamed: 4    99.892319
dtype: float64

In [28]:
df = df[['v1','v2']]

In [30]:
df

Unnamed: 0,v1,v2
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."
...,...,...
5567,spam,This is the 2nd time we have tried 2 contact u...
5568,ham,Will Ì_ b going to esplanade fr home?
5569,ham,"Pity, * was in mood for that. So...any other s..."
5570,ham,The guy did some bitching but I acted like i'd...


In [31]:
df.rename({'v1': 'label', 'v2': 'message'}, axis=1, inplace=True)

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(


In [32]:
df['message'] = df.message.str.lower()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._set_item(key, value)


In [33]:
df.head()

Unnamed: 0,label,message
0,ham,"go until jurong point, crazy.. available only ..."
1,ham,ok lar... joking wif u oni...
2,spam,free entry in 2 a wkly comp to win fa cup fina...
3,ham,u dun say so early hor... u c already then say...
4,ham,"nah i don't think he goes to usf, he lives aro..."


In [34]:
X = df['message']
y = df['label']

In [35]:
y.value_counts() / len(y) # imbalance

ham     0.865937
spam    0.134063
Name: label, dtype: float64

In [37]:
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split

def evaluate(y, y_hat):
    print("Confusion Matrix")
    print(confusion_matrix(y, y_hat))
    print("\nClassification Report")
    print(classification_report(y, y_hat))
    print("\nAccuracy Score")
    print(accuracy_score(y, y_hat))

In [38]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                   random_state=42)

In [39]:
from sklearn.naive_bayes import MultinomialNB

In [40]:
model = Pipeline([
    ('vectorizer', CountVectorizer()),
    ('naive bays model', MultinomialNB())
])

In [42]:
model.fit(X_train,y_train)

Pipeline(steps=[('vectorizer', CountVectorizer()),
                ('naive bays model', MultinomialNB())])

In [45]:
y_hat_train = model.predict(X_train)


In [46]:
evaluate(y_train,y_hat_train)

Confusion Matrix
[[3851    9]
 [  16  581]]

Classification Report
              precision    recall  f1-score   support

         ham       1.00      1.00      1.00      3860
        spam       0.98      0.97      0.98       597

    accuracy                           0.99      4457
   macro avg       0.99      0.99      0.99      4457
weighted avg       0.99      0.99      0.99      4457


Accuracy Score
0.9943908458604442


In [47]:
y_hat_test = model.predict(X_test)
evaluate(y_test,y_hat_test)

Confusion Matrix
[[963   2]
 [ 16 134]]

Classification Report
              precision    recall  f1-score   support

         ham       0.98      1.00      0.99       965
        spam       0.99      0.89      0.94       150

    accuracy                           0.98      1115
   macro avg       0.98      0.95      0.96      1115
weighted avg       0.98      0.98      0.98      1115


Accuracy Score
0.9838565022421525
