# Mayan or Spanish Text Classifier

In [1]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Base directory from which the corpus CSV is read.
# NOTE(review): looks like a Google Drive folder as mounted in Colab — confirm
# the drive is mounted (or point this at a local copy) before running.
directorio = "/content/drive/MyDrive/PLN_UADY/"

In [3]:
import numpy as np
import pandas as pd

# Read the corpus and turn the textual language tag into a binary target:
# 1 for rows tagged 'maya', 0 for every other value.
ruta_csv = directorio + "JornadaMaya.csv"
data = pd.read_csv(ruta_csv)
data['tag'] = np.where(data['tag'].eq('maya'), 1, 0)
# Preview the last rows to sanity-check the encoding of the tag column.
data.tail(10)

Unnamed: 0,texto,tag
843,tumen mina’an a’almajt’aan ti’al u páajtal u t...,1
844,yaan xan ba’alo’ob ma’ táan u páajtal u beetik...,1
845,Yucatáne’ ts’o’ok u yantal máax k’ubik tuukul ...,1
846,chéen ku síit’ta’al u piixil talofita yaan ti’...,1
847,ts’o’ok u yantal u beetik u xsutt’aanil ti’al ...,1
848,"Leti’e’ u xnuk kiikil ichil jo’otúul paalal, k...",1
849,Ya’abach ti’ u láak’o’obe’ tu kanaj u meyajto’...,1
850,ka’atúul u yíits’ino’obe’ x-áantaj ts’akyajo’o...,1
851,Ts’o’ok u chan máan jump’éel ja’ab úuchik u ká...,1
852,ti’al beyo’ máax kun k’uchul te’elo’ u jéets’e...,1


In [9]:
# Manually force the tag of the row labelled 149 to 0.
# NOTE(review): presumably corrects a labelling error in the CSV — confirm and
# fix it in the source file so the correction does not depend on this cell.
data.loc[149,'tag'] = 0

In [18]:
# Display the row labelled 149 to inspect its text and current tag.
data.loc[149]

Unnamed: 0,149
texto,Viaje para México
tag,0


In [11]:
from sklearn.model_selection import train_test_split

# Hold out part of the corpus for evaluation. No test size is given, so the
# library default applies; the fixed seed keeps the partition reproducible.
textos, etiquetas = data['texto'], data['tag']
X_train, X_test, y_train, y_test = train_test_split(
    textos, etiquetas, random_state=0
)

In [12]:
# Class-balance check: rows whose tag is 1 versus the size of the whole corpus.
maya = data[data['tag'].eq(1)]
total_maya, total_corpus = len(maya), len(data)
print("Maya:", total_maya, "Corpus Total:", total_corpus, "Porcentaje de Maya:", total_maya / total_corpus * 100)

Maya: 427 Corpus Total: 853 Porcentaje de Maya: 50.058616647127785


In [13]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, precision_score, recall_score, f1_score

In [14]:
from scipy.sparse import hstack
from scipy.sparse import csr_matrix

def add_features(X, features):
    """Append `features` to the right-hand side of the sparse matrix `X`.

    `features` is converted to a sparse matrix and transposed, so a 1-D
    sequence with one value per row of `X` becomes a single extra column.

    Returns the horizontally stacked result in CSR format.
    """
    extra_columns = csr_matrix(features).transpose()
    return hstack((X, extra_columns), format='csr')

In [15]:
# Baseline: bag-of-words counts fed to a multinomial Naive Bayes classifier.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
# Character length of each training text; computed but not used by this cell.
num_chars = X_train.str.len()

# The training-matrix shape is printed twice; nothing is appended to the
# matrix between the two prints in this cell.
forma_entrenamiento = X_train_vectorized.shape
print(forma_entrenamiento)
print(forma_entrenamiento)

# Character length of each test text; also unused by this cell.
num_chars_test = X_test.str.len()

clf = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# Reuse the vocabulary learned on the training split for the test texts.
X_test_vectorized = vect.transform(X_test)
y_pred = clf.predict(X_test_vectorized)

for etiqueta, metrica in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1: ", f1_score),
):
    print(etiqueta, metrica(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(639, 3126)
(639, 3126)
Accuracy:  0.9906542056074766
Precision:  0.9900990099009901
Recall:  0.9900990099009901
F1:  0.9900990099009901
[[112   1]
 [  1 100]]


In [26]:
# Override the label at position 168 of the test split, setting it to 0.
# NOTE(review): 168 is the false-negative position displayed at the end of the
# notebook, so this edits an evaluation label after inspecting model errors.
# Presumably the sample is mislabelled in the corpus — confirm, and prefer
# correcting it in the source data before the split; the positional index is
# only valid for the current split.
y_test.iloc[168] = 0

0

In [16]:
def consecutive_vowels(word):
    """Count adjacent pairs made of the same lowercase, unaccented vowel.

    Every position whose character is one of 'aeiou' and is immediately
    followed by an identical character adds one, so overlapping pairs are
    counted separately ("aaa" gives 2). Uppercase and accented vowels are
    not matched.
    """
    return sum(
        1
        for actual, siguiente in zip(word, word[1:])
        if actual in 'aeiou' and actual == siguiente
    )

In [27]:
# Bag-of-words counts extended with two hand-crafted counts per text:
# occurrences of the typographic apostrophe (’) and of doubled vowels.
vect = CountVectorizer()
X_train_vectorized = vect.fit_transform(X_train)
num_apostrofes = X_train.str.count('’')
num_consecutives = X_train.apply(consecutive_vowels)
print(X_train_vectorized.shape)
# Append the extra columns in a fixed order: apostrophes first, then vowels.
for extra in (num_apostrofes, num_consecutives):
    X_train_vectorized = add_features(X_train_vectorized, extra)
print(X_train_vectorized.shape)

num_apostrofes_test = X_test.str.count('’')
num_consecutives_test = X_test.apply(consecutive_vowels)

clf = MultinomialNB(alpha=0.1).fit(X_train_vectorized, y_train)

# The test matrix must get the same extra columns, in the same order.
X_test_vectorized = vect.transform(X_test)
for extra in (num_apostrofes_test, num_consecutives_test):
    X_test_vectorized = add_features(X_test_vectorized, extra)
y_pred = clf.predict(X_test_vectorized)

for etiqueta, metrica in (
    ("Accuracy: ", accuracy_score),
    ("Precision: ", precision_score),
    ("Recall: ", recall_score),
    ("F1: ", f1_score),
):
    print(etiqueta, metrica(y_test, y_pred))
print(confusion_matrix(y_test, y_pred))

(639, 3126)
(639, 3128)
Accuracy:  1.0
Precision:  1.0
Recall:  1.0
F1:  1.0
[[114   0]
 [  0 100]]


In [19]:
# Positions in the test split where the true tag is 1 but the prediction is 0.
es_positivo_real = y_test == 1
predicho_negativo = y_pred == 0
falsos_negativos = np.where(es_positivo_real & predicho_negativo)

In [21]:
# Display the false-negative positions held in falsos_negativos.
falsos_negativos

(array([168]),)