In [1]:
from sklearn.feature_extraction.text import CountVectorizer
import pandas as pd

# Guide to using "CountVectorizer"

In [2]:
# Vectorizer with default settings; the cells below show that it ignores case
# and only counts tokens of two or more characters.
cvec = CountVectorizer()

In [3]:
L = ["Today is a beautiful day, also today rainny day."]

# The result is a 1x6 sparse matrix: one row for the single document and one
# column per distinct token. Only tokens with at least two characters are
# counted, so "a" does not get a column.
cvec.fit_transform(L)

<1x6 sparse matrix of type '<class 'numpy.int64'>'
	with 6 stored elements in Compressed Sparse Row format>

In [4]:
# Dense view of the counts. Matching ignores case ("Today" and "today" are the
# same token) and the one-letter word "a" is not counted.
# The columns follow the alphabetical order of the tokens, which is how the
# counts line up with the printed array:
#   "also"      - 1
#   "beautiful" - 1
#   "day"       - 2
#   "is"        - 1
#   "rainny"    - 1
#   "today"     - 2
cvec.fit_transform(L).toarray()

array([[1, 1, 2, 1, 1, 2]], dtype=int64)

In [5]:
# Three documents this time: each one becomes a row of the count matrix, and
# the columns cover every distinct token found across all three documents.
L = ["Today is a beautiful day, also today rainny day.", 
     "Have a nice day", 
     "we have beautiful country, but now a day it is bad"]

cvec.fit_transform(L).toarray()

array([[1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 1, 2, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1]], dtype=int64)

In [6]:
# The documents can also be supplied as a pandas Series instead of a list.
s1 = pd.Series(L)
s1

0     Today is a beautiful day, also today rainny day.
1                                      Have a nice day
2    we have beautiful country, but now a day it is...
dtype: object

In [7]:
# Fitting on the Series gives the same count matrix as fitting on the list.
cvec.fit_transform(s1).toarray()

array([[1, 0, 1, 0, 0, 2, 0, 1, 0, 0, 0, 1, 2, 0],
       [0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 0, 0],
       [0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 1]], dtype=int64)

# Classify ham & spam

In [8]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split, GridSearchCV, KFold
from sklearn.svm import SVC
from sklearn.metrics import accuracy_score, classification_report
from imblearn.over_sampling import SMOTE

In [9]:
# Load the labelled messages; the columns used later are "Label" and "EmailText".
# NOTE(review): the "Unnamed: 0" column in the preview looks like a row index
# that was saved into the CSV — index_col=0 would drop it; confirm against the file.
data = pd.read_csv("spam.csv")
data.head()

Unnamed: 0,Label,EmailText
0,ham,"Go until jurong point, crazy.. Available only ..."
1,ham,Ok lar... Joking wif u oni...
2,spam,Free entry in 2 a wkly comp to win FA Cup fina...
3,ham,U dun say so early hor... U c already then say...
4,ham,"Nah I don't think he goes to usf, he lives aro..."


In [10]:
# "ham" marks a legitimate (wanted) email
# "spam" marks an unsolicited (unwanted) email

In [11]:
# Class distribution of the labels: ham heavily outnumbers spam.
data["Label"].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [12]:
# Features are the raw message texts (a pandas Series); targets are the labels.
x, y = data["EmailText"], data["Label"]

## Creating a matrix with frequencies of email texts

In [13]:
# Learn the vocabulary from every message and count token occurrences.
# cx is a sparse matrix with one row per message and one column per token.
cvec = CountVectorizer()
cx = cvec.fit_transform(x)
cx

<5572x8679 sparse matrix of type '<class 'numpy.int64'>'
	with 73767 stored elements in Compressed Sparse Row format>

# Using SMOTE for balancing the response data

In [14]:
# Label counts again, shown here to make the class imbalance explicit
# before resampling.
data["Label"].value_counts()

ham     4825
spam     747
Name: Label, dtype: int64

In [15]:
# Normally the majority of emails are ham,
# so this is a class-imbalance problem (there are several balancing techniques; here we use "SMOTE")

In [None]:
# Oversample the minority class with SMOTE so that both labels are equally
# represented. random_state is fixed so that "Restart & Run All" reproduces the
# same synthetic samples, and therefore the same split and scores downstream.
# NOTE(review): resampling is done before the train/test split further down,
# so synthetic points can be interpolated from messages that end up in the
# test set and the reported scores may be optimistic. Consider splitting first
# and resampling only the training portion.
smt = SMOTE(random_state=0)
x_sm, y_sm = smt.fit_resample(cx, y)

In [None]:
# Label counts after resampling; the two classes should now be balanced.
y_sm.value_counts()

In [None]:
# (rows, columns) of the resampled feature matrix.
x_sm.shape

## Splitting training & testing data

In [None]:
# Hold out 20% of the resampled data for testing; the seed keeps the split
# identical between runs.
X_train, X_test, y_train, y_test = train_test_split(
    x_sm,
    y_sm,
    test_size=0.2,
    random_state=0,
)

## Grid Search for identifying best hyperparameters

In [None]:
# Candidate hyperparameters: two kernels crossed with four regularization strengths.
params = {
    "kernel": ["rbf", "linear"],
    "C": [0.1, 0.2, 0.5, 1],
}

# Base estimator and 5-fold cross-validation scheme used to score each candidate.
model = SVC()
cval = KFold(n_splits=5)

# Exhaustively evaluate every combination on the training data.
gsearch = GridSearchCV(estimator=model, param_grid=params, cv=cval)
result = gsearch.fit(X_train, y_train)

# Best-scoring parameter combination.
result.best_params_

In [None]:
# Build the final SVC from the hyperparameters selected by the grid search
# instead of hard-coding them, so this cell cannot drift out of sync with the
# search result when the data or the parameter grid changes.
model = SVC(**gsearch.best_params_)
model.fit(X_train, y_train)

# Predicted labels for the held-out test messages.
y_pred = model.predict(X_test)
y_pred

In [None]:
# Fraction of test messages classified correctly.
# scikit-learn metrics take the ground truth first: (y_true, y_pred).
accuracy_score(y_test, y_pred)

In [None]:
# Per-class precision, recall and F1 on the test set.
# The ground truth must be the first argument: passing the predictions first
# swaps precision with recall and reports support for the predicted labels.
print(classification_report(y_test, y_pred))

# Predicting labels for new emails

In [None]:
# Two hand-written messages to classify; the second one is meant to contain
# words that are new to the vectorizer.
emails = [
    "Hey....You have won a car!!!!!!!!!. congratzzzzzz",
    "Dear sir, your CV has been recieved ",
]

In [None]:
# Vectorize with transform (not fit_transform) so the vocabulary learned earlier
# is reused; words that are not in that vocabulary are simply ignored.
model.predict(cvec.transform(emails))