In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found.
for folder, _subdirs, files in os.walk('/kaggle/input'):
    for file_name in files:
        print(os.path.join(folder, file_name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
# Load the train / validation / test splits from the competition's HDF5 files
df_train = pd.read_hdf("/kaggle/input/sesgos-en-el-dataset-de-snli/train_data.hdf5")
df_valid = pd.read_hdf("/kaggle/input/sesgos-en-el-dataset-de-snli/valid_data.hdf5")
df_test = pd.read_hdf("/kaggle/input/sesgos-en-el-dataset-de-snli/test_data.hdf5")

In [None]:
# Number of documents (rows) in the training split
train_doc_num = len(df_train)
train_doc_num

In [None]:
df_train.head()

In [None]:
df_valid.head()

In [None]:
df_test.head()

In [None]:
# Load the sample submission file, using its "pairID" column as the index.
df_submission = pd.read_csv("/kaggle/input/sesgos-en-el-dataset-de-snli/submission_sample.csv", index_col="pairID")

In [None]:
df_submission

In [None]:
# Pull the "text" and "gold_label" columns out of each split as plain Python lists.
# Only the text is read from the test split; no labels are taken from it here.
text_train = df_train["text"].tolist()
labels_train = df_train["gold_label"].tolist()
text_val = df_valid["text"].tolist()
labels_val = df_valid["gold_label"].tolist()
text_test = df_test["text"].tolist()

In [None]:
# Check the class balance: count how many training examples carry each label
from collections import Counter
Counter(labels_train)

# Clases de este dataset
+ Contradiction
+ Entailment
+ Neutral

# Pre-procesamiento de Texto
+ NLTK (Natural Language Toolkit)
  + Tokenization: separa el texto en tokens (palabras y signos de puntuación)
  + Lemmatization: reduce cada palabra a su lema o forma de diccionario (ej., quita la conjugación verbal)
  + Stop Words: quita palabras muy frecuentes y con poco contenido (artículos, preposiciones, pronombres, etc.)
  + Stemming: reduce las palabras a su raíz
  + Filtrado de no palabras

In [None]:
import nltk
# nltk.downloader.Downloader(server_index_url = "https://raw.githubusercontent.com/nltk/nltk_data/gh-pages/index.xml", download_dir = "/kaggle/input")

In [None]:
# Tokenization (every other preprocessing combination builds on the tokens produced here)
from nltk.tokenize import word_tokenize
nltk.download('punkt')

# Lemmatizer; it relies on the WordNet corpus downloaded here
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
lemmatizer = WordNetLemmatizer()

# Stop-word lists
from nltk.corpus import stopwords
nltk.download('stopwords')

# Porter stemmer
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

In [None]:
# Preprocess the training texts with every step enabled
# (lemmatize, remove stop words, stem, keep alphabetic tokens only).
# NOTE(review): text_filter is defined in a later cell of this notebook; that cell has to be
# executed first, otherwise this line raises NameError on a fresh kernel.
text_train_filter = text_filter(df_train, True, True, True, True)

In [None]:
# Preprocess the validation texts with the same steps enabled as for train
# (lemmatize, remove stop words, stem, keep alphabetic tokens only).
# NOTE(review): text_filter is defined in a later cell of this notebook; that cell has to be
# executed first, otherwise this line raises NameError on a fresh kernel.
text_val_filter = text_filter(df_valid, True, True, True, True)

In [None]:
# Peek at the first few preprocessed training texts.
# (`df_text_filter` is not defined anywhere in this notebook, so displaying it raised NameError;
# the preprocessed data that exists is text_train_filter / text_val_filter.)
text_train_filter[:5]

## Backup de los datos procesados

In [None]:
import pickle

In [None]:
# Back up the preprocessed texts to disk.
# `df_text_filter` is not defined anywhere in this notebook (NameError); the preprocessed
# data that exists is text_train_filter / text_val_filter, so both are saved, keyed by split.
with open('./text_filt.pck', 'wb') as fp:
    pickle.dump({"train": text_train_filter, "valid": text_val_filter}, fp)

In [None]:
# Read the backup back from disk.
# NOTE: pickle.load can execute arbitrary code; only load files written by this notebook.
with open('./text_filt.pck', 'rb') as fp:
    saved_texts = pickle.load(fp)

# Armo los CountVectorizer (CV) para train y valid

In [None]:
#Importo los vectorizadores
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

### Parámetros CV

In [None]:
# Settings passed to CountVectorizer below as max_df / min_df / ngram_range.
# max_df as a float is a document proportion: drop terms present in more than 99% of documents
df_max = 0.99
# min_df as a float is a document proportion: drop terms present in fewer than 1% of documents
df_min = 0.01
# (1, 1) means unigrams only
n_range = (1,1)

In [None]:
# Configure the CountVectorizer with the parameters defined above.
# n-gram: a run of n consecutive words treated as one vocabulary term
cv = CountVectorizer(max_df = df_max, min_df= df_min, ngram_range = n_range)

# Disabled alternative: TF-IDF vectorizer
#cv = TfidfVectorizer(min_df=1)

In [None]:
# Learn the vocabulary from the preprocessed training texts and build their document-term matrix
cv_train = cv.fit_transform(text_train_filter)

In [None]:
# Encode the preprocessed validation texts with the vocabulary learned on train (no refit)
cv_valid = cv.transform(text_val_filter)

In [None]:
cv_train.shape

In [None]:
# Take a look at the learned vocabulary.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2; use its
# replacement when available and fall back only on older installations.
(cv.get_feature_names_out() if hasattr(cv, "get_feature_names_out")
 else cv.get_feature_names())

# Multinomial NB

In [None]:
from sklearn.naive_bayes import MultinomialNB

In [None]:
# Fit a Multinomial Naive Bayes classifier on the training count matrix.
# alpha is the additive smoothing parameter; 1e-10 means practically no smoothing.
clf = MultinomialNB(alpha=1e-10)
clf.fit(cv_train, labels_train)

In [None]:
# Per-feature log-probabilities for the first class (clf.classes_[0]).
# MultinomialNB.coef_ was deprecated in scikit-learn 0.24 and later removed; for a
# problem with more than two classes it was just a view of feature_log_prob_, which
# is available in every version.
clf.feature_log_prob_[0]

In [None]:
# See how the classifier does on train (score() returns mean accuracy)
clf.score(cv_train, labels_train)

In [None]:
# See how the classifier does on validation (score() returns mean accuracy)
clf.score(cv_valid, labels_val)

In [None]:
# The vectorizer was fitted on text_filter output (text_train_filter), so the test text has to
# go through the same preprocessing; raw text keeps unstemmed tokens that do not match the
# learned vocabulary.
# The frame is rebuilt from text_test because df_test is reassigned further down the notebook.
text_test_filter = text_filter(pd.DataFrame({"text": text_test}), True, True, True, True)
cv_test = cv.transform(text_test_filter)
test_labels = clf.predict(cv_test)

# For Submission

In [None]:
# Build the table for submission.csv: a single "pred_labels" column with the predicted labels.
# NOTE(review): this rebinds df_test, discarding the test split loaded earlier, and the new
# frame gets a default 0..n-1 index -- confirm that matches the pairID values expected.
df_test = pd.DataFrame(data=test_labels, columns=["pred_labels"],)

In [None]:
df_test.head()

In [None]:
# Name the index "pairID" so that it is written under that header in the CSV.
df_test.index.names = ["pairID"]

In [None]:
df_test

In [None]:
# Write the predictions (index included) to submission.csv in the current working directory.
df_test.to_csv("submission.csv")

# Conclusión/Resumen

In [None]:
def text_filter(dataset, do_lemm, do_stop, do_stem, do_alpha):
    """Preprocess every document in ``dataset.text`` and return the cleaned strings.

    Each text is tokenized with ``word_tokenize`` and then run through the
    enabled steps, always in this order: verb lemmatization, stop-word removal,
    stemming, and removal of tokens that are not purely alphabetic. The
    surviving tokens are re-joined with single spaces.

    Parameters
    ----------
    dataset : object exposing a ``text`` attribute (e.g. a pandas DataFrame)
        ``dataset.text`` is iterated in order; each item is one document.
    do_lemm : bool
        Lemmatize each token as a verb (``pos='v'``) with the global ``lemmatizer``.
    do_stop : bool
        Drop English stop words. Matching is exact, hence case-sensitive.
    do_stem : bool
        Stem each token with the global ``stemmer``.
    do_alpha : bool
        Keep only tokens for which ``str.isalpha()`` is true.

    Returns
    -------
    list of str
        One preprocessed string per input document, in input order.
    """
    # Build the stop-word lookup once, as a set: stopwords.words() rebuilds the whole
    # list on every call, which is far too slow to do for each token.
    stop_words = set(stopwords.words('english')) if do_stop else None

    texts_filtrados = []
    # Iterate positionally: dataset.text[idx] is a label lookup on a pandas Series and
    # fails (or returns the wrong row) when the index is not 0..n-1.
    for idx, em in enumerate(dataset.text):
        if idx % 100 == 0:
            # Progress counter, rewritten in place thanks to the leading "\r".
            print("\r Procesados: {}".format(idx), end="")
        tokens = word_tokenize(em)
        if do_lemm:
            tokens = [lemmatizer.lemmatize(x, pos='v') for x in tokens]
        if do_stop:
            tokens = [x for x in tokens if x not in stop_words]
        if do_stem:
            tokens = [stemmer.stem(x) for x in tokens]
        if do_alpha:
            tokens = [x for x in tokens if x.isalpha()]
        texts_filtrados.append(" ".join(tokens))
    return texts_filtrados

In [None]:
def cv_filtrado(data, df_max, df_min, n_range):
    """Preprocess ``data`` with ``text_filter`` and return its count matrix.

    Parameters
    ----------
    data : dataset accepted by ``text_filter`` (exposes a ``text`` attribute)
    df_max, df_min : forwarded to ``CountVectorizer`` as ``max_df`` / ``min_df``
    n_range : tuple forwarded to ``CountVectorizer`` as ``ngram_range``

    Returns
    -------
    The document-term matrix returned by ``CountVectorizer.fit_transform``
    (sparse; call ``.toarray()`` on it if a dense array is needed).

    Notes
    -----
    A new vectorizer is fitted on every call and is not returned, so the
    columns of matrices from different calls do not share a vocabulary.
    """
    texts_filtrados = text_filter(data, True, True, True, True)
    count_vect = CountVectorizer(max_df=df_max, min_df=df_min, ngram_range=n_range)
    # `count_vect.fit_transform` (method call), not the undefined name
    # `count_vect_fit_transform`. No `.toarray()` here: its dense result would be
    # thrown away, costing memory without changing the returned value.
    return count_vect.fit_transform(texts_filtrados)

In [None]:
def clasifNBM(X,y,X_valid,Y_valid, a, average="macro"):
    """Fit a Multinomial Naive Bayes model and evaluate it on a validation set.

    Parameters
    ----------
    X, y : training features and labels, passed to ``MultinomialNB.fit``
    X_valid, Y_valid : validation features and true labels
    a : value passed to ``MultinomialNB`` as ``alpha``
    average : str, default "macro"
        Averaging mode for precision, recall and F1. scikit-learn's own default
        ("binary") raises an error when the target has more than two classes.

    Returns
    -------
    tuple
        ``(fitted classifier, confusion matrix, precision, recall, f1, accuracy)``
        computed on the validation set.
    """
    # Local import: the notebook only does `from sklearn.<module> import ...`,
    # so the bare name `sklearn` is never bound at module level.
    from sklearn import metrics

    clasif_MNB = MultinomialNB(alpha=a)
    clasif_MNB.fit(X, y)
    y_check = clasif_MNB.predict(X_valid)

    # The parameter is spelled `Y_valid` (capital Y); use that exact name throughout.
    m_conf = metrics.confusion_matrix(Y_valid, y_check)
    precision = metrics.precision_score(Y_valid, y_check, average=average)
    recall = metrics.recall_score(Y_valid, y_check, average=average)
    f1 = metrics.f1_score(Y_valid, y_check, average=average)
    acc = metrics.accuracy_score(Y_valid, y_check)

    return clasif_MNB, m_conf, precision, recall, f1, acc