In [18]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)


# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory and print the full path of every file found
for root, _subdirs, files in os.walk('/kaggle/input'):
    for name in files:
        full_path = os.path.join(root, name)
        print(full_path)

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/sesgos-en-el-dataset-de-snli/train_data.hdf5
/kaggle/input/sesgos-en-el-dataset-de-snli/valid_data.hdf5
/kaggle/input/sesgos-en-el-dataset-de-snli/submission_sample.csv
/kaggle/input/sesgos-en-el-dataset-de-snli/test_data.hdf5


In [49]:
#Importo los vectorizadores
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

In [20]:
# Load the train, validation and test splits from their HDF5 files
df_train = pd.read_hdf("/kaggle/input/sesgos-en-el-dataset-de-snli/train_data.hdf5")
df_valid = pd.read_hdf("/kaggle/input/sesgos-en-el-dataset-de-snli/valid_data.hdf5")
df_test = pd.read_hdf("/kaggle/input/sesgos-en-el-dataset-de-snli/test_data.hdf5")

In [21]:
# Number of documents (rows) in the training split
len(df_train)

549367

In [22]:
# Preview the first rows of the training split
df_train.head()

Unnamed: 0_level_0,gold_label,text
pairID,Unnamed: 1_level_1,Unnamed: 2_level_1
1000092795.jpg#0r1c,contradiction,they are inside of a house
1000092795.jpg#0r1e,entailment,two guys are in a yard
1000092795.jpg#0r1n,neutral,They are doing yardwork
1000092795.jpg#1r1c,contradiction,A man is swimming.
1000092795.jpg#1r1e,entailment,Two young white men are near some bushes.


In [23]:
# Preview the first rows of the validation split
df_valid.head()

Unnamed: 0_level_0,gold_label,text
pairID,Unnamed: 1_level_1,Unnamed: 2_level_1
100197432.jpg#4r1c,contradiction,The women are seated in a bistro eating lunch.
100197432.jpg#4r1e,entailment,There are muliple women outdoors.
100197432.jpg#4r1n,neutral,The women are friends.
1026792563.jpg#1r1c,contradiction,A group of women are fishing
1026792563.jpg#1r1e,entailment,A group of women are selling their wares


In [24]:
# Preview the first rows of the test split
df_test.head()

Unnamed: 0_level_0,text
pairID,Unnamed: 1_level_1
0,The church has cracks in the ceiling.
1,The church is filled with song.
2,A choir singing at a baseball game.
3,The woman is young.
4,The woman is very happy.


In [25]:
# Read the sample submission file, using its pairID column as the index
df_submission = pd.read_csv("/kaggle/input/sesgos-en-el-dataset-de-snli/submission_sample.csv", index_col="pairID")

In [None]:
# Display the sample submission to see the expected output format
df_submission

In [27]:
# Pull the sentences and their gold labels out of each frame as plain Python lists.
# The test split has no gold_label column, so only its text is extracted.
text_train, labels_train = (df_train[col].tolist() for col in ("text", "gold_label"))
text_val, labels_val = (df_valid[col].tolist() for col in ("text", "gold_label"))
text_test = df_test["text"].tolist()

In [28]:
# Check the class balance: count how many training examples carry each label
from collections import Counter
Counter(labels_train)

Counter({'contradiction': 183187, 'entailment': 183416, 'neutral': 182764})

# Clases de este dataset
+ Contradiction
+ Entailment
+ Neutral

# Pre-procesamiento de Texto
+ NLTK (Natural Language Toolkit)
  + Tokenization: separa el texto en palabras (tokens) según un criterio dado
  + Lemmatization: reduce cada palabra a su lema o forma canónica (ej.: quita la conjugación verbal)
  + Stop Words: quita palabras muy frecuentes con poco contenido semántico (artículos, preposiciones, pronombres, etc.)
  + Stemming: reduce las palabras a su raíz
  + Filtrado de palabras: descarta los tokens que no son puramente alfabéticos

In [None]:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer

In [None]:
def data_filter(dataset, tokenizer=None, lemmatizer=None, stemmer=None, stop_words=None):
    """Normalise every sentence in ``dataset.text`` and return the cleaned strings.

    Each sentence goes through, in order: tokenisation, verb lemmatisation
    (``pos='v'``), stop-word removal, stemming, and finally removal of every
    token that is not purely alphabetic. The surviving tokens are re-joined
    with single spaces.

    Stop-word matching is exact (case-sensitive) and happens before stemming,
    so a token is only removed if it appears verbatim in the stop-word set.

    Parameters
    ----------
    dataset : object with a ``text`` attribute
        Typically a DataFrame with a ``text`` column. Only iteration over
        ``dataset.text`` is required, so the index type does not matter.
    tokenizer : callable, optional
        Maps a sentence to a list of tokens. Defaults to NLTK ``word_tokenize``.
    lemmatizer : object, optional
        Must expose ``lemmatize(word, pos=...)``. Defaults to a new
        ``WordNetLemmatizer()``.
    stemmer : object, optional
        Must expose ``stem(word)``. Defaults to a new ``PorterStemmer()``.
    stop_words : iterable of str, optional
        Tokens to discard. Defaults to ``stopwords.words('english')``.

    Returns
    -------
    list of str
        One cleaned sentence per input sentence, in the same order.

    Notes
    -----
    The NLTK defaults need their data packages to be available
    (e.g. installed through ``nltk.download``).
    """
    # Resolve the defaults here so the function does not depend on globals
    # (`lemmatizer` / `stemmer`) having been created in some other cell.
    if tokenizer is None:
        tokenizer = word_tokenize
    if lemmatizer is None:
        lemmatizer = WordNetLemmatizer()
    if stemmer is None:
        stemmer = PorterStemmer()
    # Load the stop-word list once instead of once per token, and keep it in a
    # set so each membership test is O(1).
    if stop_words is None:
        stop_words = stopwords.words('english')
    stop_set = frozenset(stop_words)

    texts_filtrados = list()
    # Iterate over the values directly: integer look-ups such as
    # dataset.text[idx] are label-based and break on a string (pairID) index.
    for idx, em in enumerate(dataset.text):
        if idx % 100 == 0:
            print("\r Procesados: {}".format(idx), end="")
        tok = tokenizer(em)
        lem = [lemmatizer.lemmatize(x, pos='v') for x in tok]
        stop = [x for x in lem if x not in stop_set]
        stem = [stemmer.stem(x) for x in stop]
        alpha = [x for x in stem if x.isalpha()]
        texts_filtrados.append(" ".join(alpha))
    return texts_filtrados

In [None]:
# 

# Armamos los CV (CountVectorizer) para train y valid

In [29]:
# Bag-of-words vectorizer. min_df=10 drops terms that appear in fewer than 10 documents.
cv = CountVectorizer(min_df=10, ngram_range = (1,2)) 
# Alternative kept for reference: cv = TfidfVectorizer(min_df=1)
# ngram_range=(1, 2) makes the vocabulary include single words and pairs of consecutive words (bigrams).

In [30]:
# Learn the vocabulary from the training sentences and build their document-term count matrix
cv_train = cv.fit_transform(text_train)


In [31]:
# Vectorize the validation sentences with the vocabulary already fitted on train (no refitting)
cv_valid = cv.transform(text_val)

In [32]:
# (number of training documents, vocabulary size)
cv_train.shape

(549367, 36451)

In [33]:
# Look at the last entries of the fitted vocabulary.
# CountVectorizer.get_feature_names() was deprecated in scikit-learn 1.0 and
# removed in 1.2; prefer get_feature_names_out() and fall back only on versions
# that do not have it yet.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
# Convert to plain str so the cell displays a list of strings on every version.
[str(name) for name in feature_names[-19:]]

['youth are',
 'youth group',
 'youth is',
 'youthful',
 'youths',
 'youths are',
 'youtube',
 'youtube video',
 'zebra',
 'zebra is',
 'zero',
 'zip',
 'zip line',
 'zombie',
 'zombies',
 'zombies are',
 'zone',
 'zoo',
 'zooms']

In [34]:
from sklearn.naive_bayes import MultinomialNB

In [35]:
# Multinomial Naive Bayes trained on the n-gram counts.
# alpha is the additive smoothing parameter; 1e-10 amounts to essentially no smoothing.
clf = MultinomialNB(alpha=1e-10)
clf.fit(cv_train, labels_train)

MultinomialNB(alpha=1e-10)

In [36]:
# Per-feature log-probabilities for the first class (clf.classes_[0]).
# feature_log_prob_ is the supported attribute: MultinomialNB.coef_ only mirrored
# it, was deprecated in scikit-learn 0.24 and removed in 1.1.
clf.feature_log_prob_[0]

array([-12.56211639, -10.84446489, -13.40941425, ..., -11.41698409,
        -9.40208107, -13.40941425])

In [37]:
# Accuracy of the classifier on the training set
clf.score(cv_train, labels_train)

0.6493109342206576

In [38]:
# Accuracy of the classifier on the validation set
clf.score(cv_valid, labels_val)

0.638996138996139

In [39]:
# Vectorize the test sentences with the fitted vocabulary and predict a label for each one
cv_test = cv.transform(text_test)
test_labels = clf.predict(cv_test)

In [40]:
# Build the submission frame: one predicted label per test row.
# Reuse the test set's own index so every prediction stays attached to its real
# pairID, instead of a fresh 0..n-1 RangeIndex that only matches by position.
df_test = pd.DataFrame(data=test_labels, columns=["pred_labels"], index=df_test.index)

In [41]:
# Preview the first predicted labels
df_test.head()

Unnamed: 0,pred_labels
0,contradiction
1,neutral
2,neutral
3,entailment
4,neutral


In [42]:
# Label the index so it is written to the CSV under the "pairID" header
df_test.index.name = "pairID"

In [43]:
# Display the frame after naming its index
df_test

Unnamed: 0_level_0,pred_labels
pairID,Unnamed: 1_level_1
0,contradiction
1,neutral
2,neutral
3,entailment
4,neutral
...,...
9819,contradiction
9820,entailment
9821,contradiction
9822,entailment


In [44]:
# Write the predictions to submission.csv; the pairID index is written as the first column
df_test.to_csv("submission.csv")