In [1]:
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


## How to vectorize inputs

In [3]:
corpus = [
    "Este es el primer documento",
    "Este es el segundo segundo documento",
    "Y el tercero",
    "Es este el primer documento?"
]
x = vectorizer.fit_transform(corpus)
print(x.shape)
print(x)

(4, 7)
  (0, 3)	1
  (0, 2)	1
  (0, 1)	1
  (0, 4)	1
  (0, 0)	1
  (1, 3)	1
  (1, 2)	1
  (1, 1)	1
  (1, 0)	1
  (1, 5)	2
  (2, 1)	1
  (2, 6)	1
  (3, 3)	1
  (3, 2)	1
  (3, 1)	1
  (3, 4)	1
  (3, 0)	1


In [4]:
print(vectorizer.get_feature_names_out())

['documento' 'el' 'es' 'este' 'primer' 'segundo' 'tercero']


In [5]:
print(x.toarray())

[[1 1 1 1 1 0 0]
 [1 1 1 1 0 2 0]
 [0 1 0 0 0 0 1]
 [1 1 1 1 1 0 0]]


In [6]:
print(vectorizer.vocabulary_)

{'este': 3, 'es': 2, 'el': 1, 'primer': 4, 'documento': 0, 'segundo': 5, 'tercero': 6}


In [7]:
print(vectorizer.transform(["Algo completamente nuevo"]).toarray())

[[0 0 0 0 0 0 0]]


In [8]:
bigram_vectorizer = CountVectorizer(ngram_range=(1, 2), min_df=1)
x_2 = bigram_vectorizer.fit_transform(corpus).toarray()
print(x_2.shape)

(4, 17)


In [9]:
print(bigram_vectorizer.get_feature_names_out())
print(bigram_vectorizer.vocabulary_)


['documento' 'el' 'el primer' 'el segundo' 'el tercero' 'es' 'es el'
 'es este' 'este' 'este el' 'este es' 'primer' 'primer documento'
 'segundo' 'segundo documento' 'segundo segundo' 'tercero']
{'este': 8, 'es': 5, 'el': 1, 'primer': 11, 'documento': 0, 'este es': 10, 'es el': 6, 'el primer': 2, 'primer documento': 12, 'segundo': 13, 'el segundo': 3, 'segundo segundo': 15, 'segundo documento': 14, 'tercero': 16, 'el tercero': 4, 'es este': 7, 'este el': 9}


In [10]:
print(x_2)

[[1 1 1 0 0 1 1 0 1 0 1 1 1 0 0 0 0]
 [1 1 0 1 0 1 1 0 1 0 1 0 0 2 1 1 0]
 [0 1 0 0 1 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 1 0 0 1 0 1 1 1 0 1 1 0 0 0 0]]


In [13]:
tfidf_vect = TfidfVectorizer()
x_train_tfidf = tfidf_vect.fit_transform(corpus)
print(x_train_tfidf.shape)
print(tfidf_vect.get_feature_names_out())
print(x_train_tfidf.toarray())

(4, 7)
['documento' 'el' 'es' 'este' 'primer' 'segundo' 'tercero']
[[0.43877674 0.35872874 0.43877674 0.43877674 0.54197657 0.
  0.        ]
 [0.27230147 0.22262429 0.27230147 0.27230147 0.         0.85322574
  0.        ]
 [0.         0.46263733 0.         0.         0.         0.
  0.88654763]
 [0.43877674 0.35872874 0.43877674 0.43877674 0.54197657 0.
  0.        ]]


In [14]:
tfidf_vect = TfidfVectorizer(ngram_range=(1, 4), min_df=1)
x_train_tfidf = tfidf_vect.fit_transform(corpus)
print(x_train_tfidf.shape)
print(tfidf_vect.get_feature_names_out())
print(x_train_tfidf.toarray())

(4, 32)
['documento' 'el' 'el primer' 'el primer documento' 'el segundo'
 'el segundo segundo' 'el segundo segundo documento' 'el tercero' 'es'
 'es el' 'es el primer' 'es el primer documento' 'es el segundo'
 'es el segundo segundo' 'es este' 'es este el' 'es este el primer' 'este'
 'este el' 'este el primer' 'este el primer documento' 'este es'
 'este es el' 'este es el primer' 'este es el segundo' 'primer'
 'primer documento' 'segundo' 'segundo documento' 'segundo segundo'
 'segundo segundo documento' 'tercero']
[[0.21461001 0.17545774 0.26508605 0.26508605 0.         0.
  0.         0.         0.21461001 0.26508605 0.33622805 0.33622805
  0.         0.         0.         0.         0.         0.21461001
  0.         0.         0.         0.26508605 0.26508605 0.33622805
  0.         0.26508605 0.26508605 0.         0.         0.
  0.         0.        ]
 [0.15780956 0.12901965 0.         0.         0.24723917 0.24723917
  0.24723917 0.         0.15780956 0.1949262  0.         0.
  

In [15]:
vect = CountVectorizer(ngram_range=(1, 4), min_df=1)
x_train = vect.fit_transform(corpus)
print(x_train.shape)
print(vect.get_feature_names_out())
print(x_train.toarray())

(4, 32)
['documento' 'el' 'el primer' 'el primer documento' 'el segundo'
 'el segundo segundo' 'el segundo segundo documento' 'el tercero' 'es'
 'es el' 'es el primer' 'es el primer documento' 'es el segundo'
 'es el segundo segundo' 'es este' 'es este el' 'es este el primer' 'este'
 'este el' 'este el primer' 'este el primer documento' 'este es'
 'este es el' 'este es el primer' 'este es el segundo' 'primer'
 'primer documento' 'segundo' 'segundo documento' 'segundo segundo'
 'segundo segundo documento' 'tercero']
[[1 1 1 1 0 0 0 0 1 1 1 1 0 0 0 0 0 1 0 0 0 1 1 1 0 1 1 0 0 0 0 0]
 [1 1 0 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1 0 0 0 1 1 0 1 0 0 2 1 1 1 0]
 [0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1]
 [1 1 1 1 0 0 0 0 1 0 0 0 0 0 1 1 1 1 1 1 1 0 0 0 0 1 1 0 0 0 0 0]]


## Testing Models

In [16]:
directorio = "/content/drive/MyDrive/PLN_UADY/"

In [17]:
import pandas as pd
import numpy as np
spam_data = pd.read_csv(directorio + "spam.csv", encoding="ISO-8859-1")
spam_data['target'] = np.where(spam_data['target']=='spam',1,0)
spam_data.head(10)

Unnamed: 0,text,target
0,"Go until jurong point, crazy.. Available only ...",0
1,Ok lar... Joking wif u oni...,0
2,Free entry in 2 a wkly comp to win FA Cup fina...,1
3,U dun say so early hor... U c already then say...,0
4,"Nah I don't think he goes to usf, he lives aro...",0
5,FreeMsg Hey there darling it's been 3 week's n...,1
6,Even my brother is not like to speak with me. ...,0
7,As per your request 'Melle Melle (Oru Minnamin...,0
8,WINNER!! As a valued network customer you have...,1
9,Had your mobile 11 months or more? U R entitle...,1


In [18]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(spam_data['text'],
                                                    spam_data['target'],
                                                    random_state=0)

In [19]:
spam = spam_data.loc[spam_data['target'] == 1]
print("Spam:",len(spam), "Corpus Total:", len(spam_data), "Porcentaje de Spam:", len(spam)/len(spam_data)*100)

Spam: 747 Corpus Total: 5572 Porcentaje de Spam: 13.406317300789663


In [20]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [21]:
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
clf = MultinomialNB()
clf.fit(X_train_vectorized, y_train)
X_test_vectorized = vect.transform(X_test)
y_pred = clf.predict(X_test_vectorized)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

Accuracy:  0.9863603732950467
Precision:  0.9836956521739131
Recall:  0.9187817258883249
F1:  0.9501312335958005
[[1193    3]
 [  16  181]]


Tarea: Investigar como en vez de hacer un count vectorizer, llamar al tfdf, minimo de document frecuence de 3

In [22]:
tfidf_vect = TfidfVectorizer(min_df=3)
x_train_tfidf = tfidf_vect.fit_transform(X_train)
print(x_train_tfidf.shape)
clf.fit(x_train_tfidf, y_train)
X_test_vectorized = tfidf_vect.transform(X_test)
y_pred = clf.predict(X_test_vectorized)
print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(4179, 2294)
Accuracy:  0.9698492462311558
Precision:  1.0
Recall:  0.7868020304568528
F1:  0.8806818181818182
[[1196    0]
 [  42  155]]


## Funcion para agregar otras features al modelo

In [23]:
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

def add_features(X, features):
    return hstack([X, csr_matrix(features).T],'csr')

In [24]:
vect = CountVectorizer().fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()

print(X_train_vectorized.shape)
X_train_vectorized = add_features(X_train_vectorized, num_chars)
print(X_train_vectorized.shape)

num_chars_test = X_test.str.len()

clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
y_pred = clf.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(4179, 7359)
(4179, 7360)
Accuracy:  0.9913854989231874
Precision:  0.9946524064171123
Recall:  0.9441624365482234
F1:  0.96875
[[1195    1]
 [  11  186]]


In [25]:
vect = TfidfVectorizer(min_df=3).fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()

print(X_train_vectorized.shape)
X_train_vectorized = add_features(X_train_vectorized, num_chars)
print(X_train_vectorized.shape)

num_chars_test = X_test.str.len()

clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
y_pred = clf.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(4179, 2294)
(4179, 2295)
Accuracy:  0.9856424982053122
Precision:  1.0
Recall:  0.8984771573604061
F1:  0.946524064171123
[[1196    0]
 [  20  177]]


In [26]:
vect = CountVectorizer(ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()

print(X_train_vectorized.shape)
X_train_vectorized = add_features(X_train_vectorized, num_chars)
print(X_train_vectorized.shape)

num_chars_test = X_test.str.len()

clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
y_pred = clf.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(4179, 82511)
(4179, 82512)
Accuracy:  0.9856424982053122
Precision:  0.988950276243094
Recall:  0.9086294416243654
F1:  0.9470899470899471
[[1194    2]
 [  18  179]]


In [27]:
vect = CountVectorizer(ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()
num_digits = X_train.str.count(r'\d')

print(X_train_vectorized.shape)
X_train_vectorized = add_features(X_train_vectorized, num_chars)
X_train_vectorized = add_features(X_train_vectorized, num_digits)
print(X_train_vectorized.shape)

num_chars_test = X_test.str.len()
num_digits_test = X_test.str.count(r'\d')

clf = MultinomialNB(alpha=0.1)
clf.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
X_test_vectorized = add_features(X_test_vectorized, num_digits_test)
y_pred = clf.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(4179, 82511)
(4179, 82513)
Accuracy:  0.9899497487437185
Precision:  0.9790575916230366
Recall:  0.949238578680203
F1:  0.9639175257731959
[[1192    4]
 [  10  187]]


In [28]:
suma = 0
for text in spam_data["text"]:
  suma+=len(text)
print(suma/spam_data["text"].shape[0])

80.33309404163676


In [29]:
spam_data["text"].shape

(5572,)

In [30]:
print(suma)

447616


In [31]:
len(spam_data["text"])

5572

## Logistic Regression y SVMs

In [34]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC

vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()
num_digits = X_train.str.count(r'\d')

X_train_vectorized = add_features(X_train_vectorized, num_chars)
X_train_vectorized = add_features(X_train_vectorized, num_digits)

model = LogisticRegression(C=100, max_iter=1000)
model.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
num_chars_test = X_test.str.len()
num_digits_test = X_test.str.count(r'\d')

X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
X_test_vectorized = add_features(X_test_vectorized, num_digits_test)

y_pred = model.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))

Accuracy:  0.9892318736539842
Precision:  0.9789473684210527
Recall:  0.9441624365482234
F1:  0.9612403100775194


In [35]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import QuantileTransformer

vect = TfidfVectorizer(min_df=5, ngram_range=(1,3)).fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()
num_digits = X_train.str.count(r'\d')

X_train_vectorized = add_features(X_train_vectorized, num_chars)
X_train_vectorized = add_features(X_train_vectorized, num_digits)

model = LogisticRegression(C=100, max_iter=1000)
quantile_transformer = QuantileTransformer(random_state=0)
X_train_vectorized = quantile_transformer.fit_transform(X_train_vectorized)
model.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
num_chars_test = X_test.str.len()
num_digits_test = X_test.str.count(r'\d')

X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
X_test_vectorized = add_features(X_test_vectorized, num_digits_test)
X_test_vectorized = quantile_transformer.transform(X_test_vectorized)

y_pred = model.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))

Accuracy:  0.9892318736539842
Precision:  0.9840425531914894
Recall:  0.9390862944162437
F1:  0.961038961038961


In [38]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.preprocessing import QuantileTransformer

vect = TfidfVectorizer(min_df=5, ngram_range=(2,5), analyzer="char_wb").fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()
num_digits = X_train.str.count(r'\d')

print(X_train_vectorized.shape)
X_train_vectorized = add_features(X_train_vectorized, num_chars)
X_train_vectorized = add_features(X_train_vectorized, num_digits)
print(X_train_vectorized.shape)

model = LogisticRegression(C=100)
quantile_transformer = QuantileTransformer(random_state=0)
X_train_vectorized = quantile_transformer.fit_transform(X_train_vectorized)
model.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
num_chars_test = X_test.str.len()
num_digits_test = X_test.str.count(r'\d')

X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
X_test_vectorized = add_features(X_test_vectorized, num_digits_test)
X_test_vectorized = quantile_transformer.transform(X_test_vectorized)

y_pred = model.predict(X_test_vectorized)

print("Accuracy: ", accuracy_score(y_test, y_pred))
print("Precision: ", precision_score(y_test, y_pred))
print("Recall: ", recall_score(y_test, y_pred))
print("F1: ", f1_score(y_test, y_pred))

(4179, 16370)
(4179, 16372)
Accuracy:  0.9913854989231874
Precision:  1.0
Recall:  0.9390862944162437
F1:  0.9685863874345549


In [40]:
from sklearn.svm import LinearSVC, SVC
from sklearn.preprocessing import QuantileTransformer

vect = TfidfVectorizer(min_df=5).fit(X_train)
X_train_vectorized = vect.transform(X_train)
num_chars = X_train.str.len()
num_digits = X_train.str.count(r'\d')

X_train_vectorized = add_features(X_train_vectorized, num_chars)
X_train_vectorized = add_features(X_train_vectorized, num_digits)

model = SVC(C=10000, random_state = 0)
model_linear_SVC = LinearSVC(C=10000, random_state = 0, max_iter=10500)
model.fit(X_train_vectorized, y_train)
model_linear_SVC.fit(X_train_vectorized, y_train)

X_test_vectorized = vect.transform(X_test)
num_chars_test = X_test.str.len()
num_digits_test = X_test.str.count(r'\d')

X_test_vectorized = add_features(X_test_vectorized, num_chars_test)
X_test_vectorized = add_features(X_test_vectorized, num_digits_test)

y_pred = model.predict(X_test_vectorized)
y_pred_linear_SVC = model_linear_SVC.predict(X_test_vectorized)

print(f"Accuracy: SVC: {accuracy_score(y_test, y_pred)} \t LinearSVC: {accuracy_score(y_test, y_pred_linear_SVC)}")
print(f"Precision: SVC: {precision_score(y_test, y_pred)} \t LinearSVC: {precision_score(y_test, y_pred_linear_SVC)}")
print(f"Recall: SVC: {recall_score(y_test, y_pred)} \t LinearSVC: {recall_score(y_test, y_pred_linear_SVC)}")
print(f"F1: SVC: {f1_score(y_test, y_pred)} \t LinearSVC: {f1_score(y_test, y_pred_linear_SVC)}")

Accuracy: SVC: 0.9877961234745154 	 LinearSVC: 0.9885139985642498
Precision: SVC: 0.9787234042553191 	 LinearSVC: 0.9641025641025641
Recall: SVC: 0.934010152284264 	 LinearSVC: 0.9543147208121827
F1: SVC: 0.9558441558441558 	 LinearSVC: 0.9591836734693877


In [43]:
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import KFold

x = spam_data["text"]
y = spam_data["target"]

vect = TfidfVectorizer(min_df=3, ngram_range=(1,3)).fit(X_train)
X_vectorized = vect.transform(x)

k=10
kf = KFold(n_splits=k, random_state=None)
clf = MultinomialNB(alpha=0.1)
scores = cross_val_score(clf, X_vectorized, y, cv=kf, scoring="f1")
print(scores)
print("Promedio: {}".format(scores.mean()))

[0.95597484 0.94186047 0.94029851 0.97101449 0.90909091 0.9516129
 0.93430657 0.94267516 0.94366197 0.94964029]
Promedio: 0.9440136108596103
