#Librerías

---
Carga de librerías


In [None]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from collections import Counter
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.neural_network import MLPClassifier
from sklearn.linear_model import SGDClassifier
from sklearn.svm import LinearSVC
from sklearn.model_selection import RandomizedSearchCV, ShuffleSplit
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
import nltk
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.corpus import stopwords
from string import punctuation
import pickle

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Unzipping taggers/averaged_perceptron_tagger.zip.


In [None]:
# Load the Drive helper and mount
from google.colab import drive

# This will prompt for authorization.
drive.mount('/content/drive', force_remount = True)

Mounted at /content/drive


In [None]:

%cd /content/drive/My Drive/Redes Neuronales

!ls

#Carga de datos

---
Esta sección comprende la carga de datos train_data, valid_data y test_data. Se analizó la proporción de los 3 subsets, se decidió unir el subset de train y el de validación, para luego aumentar la proporción de datos para validación a un 20 %.


In [None]:
df_train = pd.read_hdf("train_data.hdf5")
df_valid = pd.read_hdf("valid_data.hdf5")
df_test = pd.read_hdf("test_data.hdf5")

In [None]:
#Proporción de los 3 subsets
n_train = len(df_train)
n_val = len(df_valid)
n_test=len(df_test)

print("El ", 100*n_train/(n_train+n_val+n_test), "% de los datos del dataset conforman el subset TRAIN.")
print("El ", 100*n_val/(n_train+n_val+n_test), "% de los datos del dataset conforman el subset VALIDATION.")
print("El ", 100*n_test/(n_train+n_val+n_test), "% de los datos del dataset conforman el subset TEST.")

El  96.54396142227252 % de los datos del dataset conforman el subset TRAIN.
El  1.7296009194545834 % de los datos del dataset conforman el subset VALIDATION.
El  1.7264376582728946 % de los datos del dataset conforman el subset TEST.


In [None]:
df_train.reset_index(drop=True, inplace=True)
df_valid.reset_index(drop=True, inplace = True)
df_test.reset_index(drop=True, inplace = True)
df_train

Unnamed: 0,gold_label,text
0,contradiction,they are inside of a house
1,entailment,two guys are in a yard
2,neutral,They are doing yardwork
3,contradiction,A man is swimming.
4,entailment,Two young white men are near some bushes.
...,...,...
549362,entailment,A family gathers around to play a video game
549363,neutral,A family enjoys the Christmas as they gather t...
549364,contradiction,Video games tear families apart.
549365,entailment,Video games bring families together.


In [None]:
X_train = df_train.text
y_train = df_train.gold_label

X_valid = df_valid.text
y_valid = df_valid.gold_label

#Balance de clases

---
Observamos que el dataset se encuentra balanceado en cuanto a proporción de clases, por lo que esperaríamos clasificar cada clase con una precisión de aproximadamente 0.33, si suponemos que el dataset no está sesgado.


In [None]:
#Balance de clases
from collections import Counter
prop=Counter(y_train)
total=prop['contradiction']+prop['entailment']+prop['neutral']

print("La cantidad de 'contradiction' es ", prop['contradiction'] , "y representa el ", 100*prop['contradiction']/total, "% del total del dataset de train")
print("La cantidad de 'entailment' es ", prop['entailment'] , "y representa el ", 100*prop['entailment']/total, "% del total del dataset de train")
print("La cantidad de 'neutral' es ", prop['neutral'] , "y representa el ", 100*prop['neutral']/total, "% del total del dataset de train")




#contradiction, entailment, neutral

La cantidad de 'contradiction' es  183187 y representa el  33.34510445658367 % del total del dataset de train
La cantidad de 'entailment' es  183416 y representa el  33.38678879510418 % del total del dataset de train
La cantidad de 'neutral' es  182764 y representa el  33.26810674831215 % del total del dataset de train


# Búsqueda de hiperparámetros 


## Clase Preprocesamiento

In [None]:
#lowercase
#lematización
#puntuación
#stop words
#stemización

class Preprocessor(object):
    def __init__(self, lower= True, stop= True, lem=True, punc= True, stem=True):
        
        self.lower = lower

        self.stop = stop
        if self.stop:
          self.stop_words = set(stopwords.words('english'))

        self.lem = lem
        if self.lem:
            self.lemmatizer = WordNetLemmatizer()
        
        self.punc = punc
                        
        self.stem = stem
        if self.stem:
            self.stemmer = PorterStemmer()           

    
    def __call__(self, line):
        
        if self.lower:
            line = line.lower()
            
        tokens = word_tokenize(line)

        if self.stop:
          tokens = [x for x in tokens if x not in self.stop_words]        
            
        if self.lem:
            tokens = [self.lemmatizer.lemmatize(x, pos='v') for x in tokens]             
            
        if self.punc:
            tokens = [x for x in tokens if x not in punctuation]
        
        if self.stem:
            tokens = [self.stemmer.stem(x) for x in tokens]
            
        preprocessed = []
        preprocessed.append(" ".join(tokens))
        return preprocessed[0]

## LinearSVC
---
SVM con kernel lineal

### Primera prueba
2000 iteraciones en SVM

#### Definimos el pipeline y los hiperparámetros

*   Count Vectorizer + Preprocesamiento
*   SVM lineal



In [None]:
from pprint import pprint
from time import time
import logging

In [None]:
# Display progress logs on stdout
logging.basicConfig(level=logging.INFO,
                    format='%(asctime)s %(levelname)s %(message)s')

In [None]:
# Definimos un a pipeline combinando el extractor de características y el clasificador
#CV + TFIDF + LinearSVM
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='hinge', penalty='l2', random_state=42,
                    max_iter=2000, tol=0.0001, verbose = True))
])

parameters = {
    'vect__preprocessor': (Preprocessor(),
                           Preprocessor(stem=False),
                           Preprocessor(lem=False),
                           Preprocessor(stop = False),
                           Preprocessor(lower= False, stop = False, lem=False, punc= False, stem=False)),
    'vect__min_df': (1, 10, 100),
    'vect__max_df': (0.6, 0.8),
    'vect__ngram_range': ((1, 2),(1,3), (1,4)),
    'tfidf__use_idf': (True, False),
    
    'clf__C': (1.0,5.0, 10.0),
    'clf__fit_intercept': (True, False)
    
}

#### Randomized search
10 modelos

In [None]:
rand_search1 = RandomizedSearchCV(pipeline, parameters, verbose=1, n_iter = 10, cv = ShuffleSplit(n_splits=1, random_state=42), return_train_score = True)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
rand_search1.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % rand_search1.best_score_)
print("Best parameters set:")
best_parameters = rand_search1.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': (1.0, 5.0, 10.0),
 'clf__fit_intercept': (True, False),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.6, 0.8),
 'vect__min_df': (1, 10, 100),
 'vect__ngram_range': ((1, 2), (1, 3), (1, 4)),
 'vect__preprocessor': (<__main__.Preprocessor object at 0x7f2b81053950>,
                        <__main__.Preprocessor object at 0x7f2b73796590>,
                        <__main__.Preprocessor object at 0x7f2b73796610>,
                        <__main__.Preprocessor object at 0x7f2b73782210>,
                        <__main__.Preprocessor object at 0x7f2b73775e10>)}
Fitting 1 folds for each of 10 candidates, totalling 10 fits
[LibLinear]



[LibLinear]



[LibLinear][LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]done in 3510.431s

Best score: 0.653
Best parameters set:
	clf__C: 5.0
	clf__fit_intercept: True
	tfidf__use_idf: False
	vect__max_df: 0.8
	vect__min_df: 10
	vect__ngram_range: (1, 3)
	vect__preprocessor: <__main__.Preprocessor object at 0x7f2b75c35c50>




#### Resultados

In [None]:
with open('rand_search1.pck', 'wb') as fp:
    pickle.dump(rand_search1, fp)

In [None]:
print(best_parameters['vect__preprocessor'].lem)
print(best_parameters['vect__preprocessor'].stem)
print(best_parameters['vect__preprocessor'].stop)
print(best_parameters['vect__preprocessor'].punc)
print(best_parameters['vect__preprocessor'].lower)

False
False
False
False
False


In [None]:
results_df1 = pd.DataFrame(rand_search1.cv_results_)
results_df1 = results_df1.sort_values(by=['rank_test_score'])
results_df1 = (
    results_df1
    .set_index(results_df1["params"].apply(
        lambda x: "_".join(str(val) for val in x.values()))
    )
    .rename_axis('kernel')
)
results_df1[
    ['rank_test_score', 'mean_test_score', 'mean_train_score', 'std_test_score']
]

Unnamed: 0_level_0,rank_test_score,mean_test_score,mean_train_score,std_test_score
kernel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"<__main__.Preprocessor object at 0x7f2b73775e10>_(1, 3)_10_0.8_False_True_5.0",1,0.653185,0.724183,0.0
"<__main__.Preprocessor object at 0x7f2b73782210>_(1, 4)_100_0.6_False_False_5.0",2,0.634381,0.647898,0.0
"<__main__.Preprocessor object at 0x7f2b73782210>_(1, 2)_100_0.8_True_False_10.0",3,0.630759,0.641361,0.0
"<__main__.Preprocessor object at 0x7f2b73796610>_(1, 3)_1_0.6_False_False_1.0",4,0.628119,0.784582,0.0
"<__main__.Preprocessor object at 0x7f2b73775e10>_(1, 3)_100_0.6_False_False_10.0",5,0.626117,0.638598,0.0
"<__main__.Preprocessor object at 0x7f2b73775e10>_(1, 4)_100_0.6_False_True_1.0",6,0.623896,0.635232,0.0
"<__main__.Preprocessor object at 0x7f2b73796610>_(1, 4)_10_0.8_False_False_10.0",7,0.622586,0.669462,0.0
"<__main__.Preprocessor object at 0x7f2b73796590>_(1, 4)_10_0.8_True_False_5.0",8,0.620893,0.667561,0.0
"<__main__.Preprocessor object at 0x7f2b73796590>_(1, 3)_100_0.6_False_False_1.0",9,0.588074,0.59715,0.0
"<__main__.Preprocessor object at 0x7f2b81053950>_(1, 4)_100_0.8_False_True_5.0",10,0.577826,0.586305,0.0


In [None]:
for i in range(0,10):
  print(i)
  print(rand_search1.cv_results_['mean_test_score'][i])
  print('lem: ',rand_search1.cv_results_['param_vect__preprocessor'][i].lem)
  print('stem: ',rand_search1.cv_results_['param_vect__preprocessor'][i].stem)
  print('stop: ',rand_search1.cv_results_['param_vect__preprocessor'][i].stop)
  print('punc: ',rand_search1.cv_results_['param_vect__preprocessor'][i].punc)
  print('lower: ',rand_search1.cv_results_['param_vect__preprocessor'][i].lower)

0
0.6343812002839616
lem:  True
stem:  True
stop:  False
punc:  True
lower:  True
1
0.6261171887798751
lem:  False
stem:  False
stop:  False
punc:  False
lower:  False
2
0.6281194823161076
lem:  False
stem:  True
stop:  True
punc:  True
lower:  True
3
0.5778255092196516
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True
4
0.653184556856035
lem:  False
stem:  False
stop:  False
punc:  False
lower:  False
5
0.6208930229171596
lem:  True
stem:  False
stop:  True
punc:  True
lower:  True
6
0.6307588692502321
lem:  True
stem:  True
stop:  False
punc:  True
lower:  True
7
0.6238964632215083
lem:  False
stem:  False
stop:  False
punc:  False
lower:  False
8
0.6225858710887016
lem:  False
stem:  True
stop:  True
punc:  True
lower:  True
9
0.5880736115914593
lem:  True
stem:  False
stop:  True
punc:  True
lower:  True


Probamos el modelo con el set de validación y calculamos métricas

* Precisión o valor predictivo positivo: $\frac{VP}{VP+FP}$
* Recall o sensibilidad: $\frac{VP}{VP+FN}$
* f1-score: $2\cdot\frac{precision \cdot recall}{precision + recall}$
* support: total de clasficaciones para esa clase 

In [None]:
y_true, y_pred = y_valid, rand_search1.predict(X_valid)
print(classification_report(y_true, y_pred, digits = 4))
print()

               precision    recall  f1-score   support

contradiction     0.6587    0.6501    0.6544      3278
   entailment     0.6581    0.6984    0.6776      3329
      neutral     0.6727    0.6393    0.6556      3235

     accuracy                         0.6629      9842
    macro avg     0.6632    0.6626    0.6625      9842
 weighted avg     0.6631    0.6629    0.6626      9842




In [None]:
pd.DataFrame(confusion_matrix(y_true,y_pred), index = rand_search1.classes_, columns = rand_search1.classes_)

Unnamed: 0,contradiction,entailment,neutral
contradiction,2131,602,545
entailment,543,2325,461
neutral,561,606,2068


#### Hacemos la predicción para test con el mejor modelo de esta Randomized Search
**SUBMISSION**: 0.67284

In [None]:
with open ('rand_search1.pck', 'rb') as fp:
    rand_search1 = pickle.load(fp)

In [None]:
y_pred = rand_search1.predict(df_test.text)

In [None]:
df_submit=pd.DataFrame(y_pred).rename(columns={0:"pred_labels"})

In [None]:
df_submit = df_submit.rename_axis(index="pairID")

In [None]:
df_submit

Unnamed: 0_level_0,pred_labels
pairID,Unnamed: 1_level_1
0,neutral
1,neutral
2,contradiction
3,contradiction
4,neutral
...,...
9819,contradiction
9820,entailment
9821,contradiction
9822,entailment


In [None]:
df_submit.to_csv("submission4.csv")

### Segunda prueba
10000 iteraciones en SVM

#### Pipeline e hiperparámetros

In [None]:
# Definimos un a pipeline combinando el extractor de características y el clasificador
#CV + TFIDF + LinearSVM
pipeline = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='hinge', penalty='l2', random_state=42,
                    max_iter=10000, tol=0.0001, verbose = True))
])

parameters = {
    'vect__preprocessor': (Preprocessor(),
                           Preprocessor(stem=False),
                           Preprocessor(lem=False),
                           Preprocessor(stop = False, lem = False),
                           Preprocessor(stop = False),
                           Preprocessor(lower= False, stop = False, lem=False, punc= False, stem=False)),
    'vect__min_df': (1, 10, 100),
    'vect__max_df': (0.6, 0.8),
    'vect__ngram_range': ((1, 2),(1,3), (1,4)),
    'tfidf__use_idf': (True, False),
    
    'clf__C': (1.0,5.0, 10.0),
    'clf__fit_intercept': (True, False)
    
}

#### Randomized search
10 modelos

In [None]:
rand_search1 = RandomizedSearchCV(pipeline, parameters, verbose=1, n_iter = 10, cv = ShuffleSplit(n_splits=1, random_state=42), return_train_score = True)

print("Performing grid search...")
print("pipeline:", [name for name, _ in pipeline.steps])
print("parameters:")
pprint(parameters)
t0 = time()
rand_search1.fit(X_train, y_train)
print("done in %0.3fs" % (time() - t0))
print()

print("Best score: %0.3f" % rand_search1.best_score_)
print("Best parameters set:")
best_parameters = rand_search1.best_estimator_.get_params()
for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))

Performing grid search...
pipeline: ['vect', 'tfidf', 'clf']
parameters:
{'clf__C': (1.0, 5.0, 10.0),
 'clf__fit_intercept': (True, False),
 'tfidf__use_idf': (True, False),
 'vect__max_df': (0.6, 0.8),
 'vect__min_df': (1, 10, 100),
 'vect__ngram_range': ((1, 2), (1, 3), (1, 4)),
 'vect__preprocessor': (<__main__.Preprocessor object at 0x7f2b7280fb50>,
                        <__main__.Preprocessor object at 0x7f2b83a0bb90>,
                        <__main__.Preprocessor object at 0x7f2b83a0bfd0>,
                        <__main__.Preprocessor object at 0x7f2b736f4150>,
                        <__main__.Preprocessor object at 0x7f2b736e9d10>,
                        <__main__.Preprocessor object at 0x7f2b736e9dd0>)}
Fitting 1 folds for each of 10 candidates, totalling 10 fits
[LibLinear]



[LibLinear][LibLinear]



[LibLinear]



[LibLinear]



[LibLinear][LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]



[LibLinear]done in 6799.998s

Best score: 0.658
Best parameters set:
	clf__C: 1.0
	clf__fit_intercept: True
	tfidf__use_idf: False
	vect__max_df: 0.8
	vect__min_df: 10
	vect__ngram_range: (1, 4)
	vect__preprocessor: <__main__.Preprocessor object at 0x7f2b6e63cad0>


#### Resultados

In [None]:
print(best_parameters['vect__preprocessor'].lem)
print(best_parameters['vect__preprocessor'].stem)
print(best_parameters['vect__preprocessor'].stop)
print(best_parameters['vect__preprocessor'].punc)
print(best_parameters['vect__preprocessor'].lower)

False
False
False
False
False


In [None]:
results_df1 = pd.DataFrame(rand_search1.cv_results_)
results_df1 = results_df1.sort_values(by=['rank_test_score'])
results_df1 = (
    results_df1
    .set_index(results_df1["params"].apply(
        lambda x: "_".join(str(val) for val in x.values()))
    )
    .rename_axis('kernel')
)
results_df1[
    ['rank_test_score', 'mean_test_score', 'mean_train_score', 'std_test_score']
]

Unnamed: 0_level_0,rank_test_score,mean_test_score,mean_train_score,std_test_score
kernel,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
"<__main__.Preprocessor object at 0x7f2b736e9dd0>_(1, 4)_10_0.8_False_True_1.0",1,0.658099,0.704793,0.0
"<__main__.Preprocessor object at 0x7f2b736e9dd0>_(1, 2)_1_0.6_True_False_5.0",2,0.635437,0.84197,0.0
"<__main__.Preprocessor object at 0x7f2b7280fb50>_(1, 4)_1_0.6_False_True_1.0",3,0.626481,0.807603,0.0
"<__main__.Preprocessor object at 0x7f2b7280fb50>_(1, 2)_10_0.6_False_True_1.0",4,0.62406,0.652076,0.0
"<__main__.Preprocessor object at 0x7f2b7280fb50>_(1, 3)_10_0.6_False_True_10.0",5,0.622841,0.668936,0.0
"<__main__.Preprocessor object at 0x7f2b83a0bb90>_(1, 4)_10_0.8_True_True_5.0",6,0.621912,0.66771,0.0
"<__main__.Preprocessor object at 0x7f2b7280fb50>_(1, 3)_10_0.8_True_True_10.0",7,0.621184,0.669169,0.0
"<__main__.Preprocessor object at 0x7f2b83a0bfd0>_(1, 3)_100_0.8_False_False_10.0",8,0.590986,0.599826,0.0
"<__main__.Preprocessor object at 0x7f2b7280fb50>_(1, 3)_100_0.6_True_False_1.0",9,0.590094,0.599292,0.0
"<__main__.Preprocessor object at 0x7f2b7280fb50>_(1, 2)_100_0.6_True_False_5.0",10,0.589548,0.598685,0.0


In [None]:
for i in range(0,10):
  print(i)
  print(rand_search1.cv_results_['mean_test_score'][i])
  print('lem: ',rand_search1.cv_results_['param_vect__preprocessor'][i].lem)
  print('stem: ',rand_search1.cv_results_['param_vect__preprocessor'][i].stem)
  print('stop: ',rand_search1.cv_results_['param_vect__preprocessor'][i].stop)
  print('punc: ',rand_search1.cv_results_['param_vect__preprocessor'][i].punc)
  print('lower: ',rand_search1.cv_results_['param_vect__preprocessor'][i].lower)

0
0.6228407084478584
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True
1
0.6264812421500991
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True
2
0.6354369550576114
lem:  False
stem:  False
stop:  False
punc:  False
lower:  False
3
0.621184265613339
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True
4
0.6240602872381091
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True
5
0.6580992773540602
lem:  False
stem:  False
stop:  False
punc:  False
lower:  False
6
0.621912372353787
lem:  True
stem:  False
stop:  True
punc:  True
lower:  True
7
0.5909860385532519
lem:  False
stem:  True
stop:  True
punc:  True
lower:  True
8
0.590094107796203
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True
9
0.5895480277408668
lem:  True
stem:  True
stop:  True
punc:  True
lower:  True


Probamos el modelo con el set de validación y calculamos métricas

* Precisión o valor predictivo positivo: $\frac{VP}{VP+FP}$
* Recall o sensibilidad: $\frac{VP}{VP+FN}$
* f1-score: $2\cdot\frac{precision \cdot recall}{precision + recall}$
* support: total de clasficaciones para esa clase 

In [None]:
y_true, y_pred = y_valid, rand_search1.predict(X_valid)
print(classification_report(y_true, y_pred, digits = 4))
print()

               precision    recall  f1-score   support

contradiction     0.6671    0.6443    0.6555      3278
   entailment     0.6573    0.7143    0.6846      3329
      neutral     0.6769    0.6399    0.6579      3235

     accuracy                         0.6665      9842
    macro avg     0.6671    0.6662    0.6660      9842
 weighted avg     0.6670    0.6665    0.6661      9842




In [None]:
pd.DataFrame(confusion_matrix(y_true,y_pred), index = rand_search1.classes_, columns = rand_search1.classes_)

Unnamed: 0,contradiction,entailment,neutral
contradiction,2112,615,551
entailment,514,2378,437
neutral,540,625,2070


#### Hacemos la predicción para test con el mejor modelo de esta Randomized Search
**SUBMISSION**: 0.67935

In [None]:
y_pred = rand_search1.predict(df_test.text)

In [None]:
df_submit=pd.DataFrame(y_pred).rename(columns={0:"pred_labels"})

In [None]:
df_submit = df_submit.rename_axis(index="pairID")

In [None]:
df_submit

Unnamed: 0_level_0,pred_labels
pairID,Unnamed: 1_level_1
0,neutral
1,neutral
2,contradiction
3,entailment
4,neutral
...,...
9819,contradiction
9820,entailment
9821,contradiction
9822,entailment


In [None]:
df_submit.to_csv("submission.csv")

In [None]:
with open('randsearch2.pck', 'wb') as fp:
    pickle.dump(rand_search1, fp)

### Generamos modelos con los mejores hiperparámetros
---
Sabiendo los resultados de las Randomized Search probamos algunos modelos.

Los modelos que mejor funcionaron:  
0.6665

Preprocesamiento:

* lem: False
* stem: False
* stop: False
* punc: False
* lower: False
* ngram: (1,4)
* min_df: 10
* max_df: 0.8
* idf: False

SVM: 

* intercept: True
* C: 1

0.6629

Preprocesamiento:

* lem: False
* stem: False
* stop: False
* punc: False
* lower: False
* ngram: (1,3)
* min_df: 10
* max_df: 0.8
* idf: False

SVM:

* intercept: True
* C: 5


#### modelo 1
---
Usamos los mejores hiperparámetros de la segunda *RandSearch*

**SUBMISSION**: 0.66815

In [None]:
txt_clf = Pipeline([
    ('vect', CountVectorizer(preprocessor = Preprocessor(lower= False, stop = False, lem=False, punc= False, stem=False),
                             min_df = 10, max_df = 0.8, ngram_range = (1,4))),
    ('clf', LinearSVC(loss='hinge', penalty='l2', random_state=42,
                    max_iter=20000, tol=0.0001, C = 1.0, fit_intercept = True, verbose = True))
])

In [None]:
txt_clf.fit(X_train, y_train)

[LibLinear]



Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.8, min_df=10, ngram_range=(1, 4),
                                 preprocessor=<__main__.Preprocessor object at 0x7f2b70bd3250>)),
                ('clf',
                 LinearSVC(loss='hinge', max_iter=20000, random_state=42,
                           verbose=True))])

In [None]:
y_true, y_pred = y_valid, txt_clf.predict(X_valid)
print(classification_report(y_true, y_pred, digits = 4))
print()

               precision    recall  f1-score   support

contradiction     0.6545    0.6473    0.6509      3278
   entailment     0.6500    0.6963    0.6724      3329
      neutral     0.6658    0.6244    0.6444      3235

     accuracy                         0.6564      9842
    macro avg     0.6568    0.6560    0.6559      9842
 weighted avg     0.6567    0.6564    0.6560      9842




In [None]:
y_pred = txt_clf.predict(df_test.text)
df_submit1=pd.DataFrame(y_pred).rename(columns={0:"pred_labels"})
df_submit1 = df_submit1.rename_axis(index="pairID")
df_submit1

Unnamed: 0_level_0,pred_labels
pairID,Unnamed: 1_level_1
0,neutral
1,neutral
2,contradiction
3,neutral
4,neutral
...,...
9819,contradiction
9820,entailment
9821,contradiction
9822,neutral


In [None]:
df_submit1.to_csv("submission1.csv")

#### modelo 2
---
**SUBMISSION**: 0.65675
Usamos los mejores hiperparámetros de la primera RandSearch

In [None]:
# Definimos un a pipeline combinando el extractor de características y el clasificador
#CV + TFIDF + MLP
txt_clf2 = Pipeline([
    ('vect', CountVectorizer(preprocessor = Preprocessor(lower= False, stop = False, lem=False, punc= False, stem=False),
                             min_df = 10, max_df = 0.8, ngram_range= (1,3))),
    ('clf', LinearSVC(loss='hinge', penalty='l2', random_state=42,
                    max_iter=20000, tol=0.0001, C = 5.0, fit_intercept = True, verbose = True))
])

In [None]:
txt_clf2.fit(X_train, y_train)

[LibLinear]



Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.8, min_df=10, ngram_range=(1, 3),
                                 preprocessor=<__main__.Preprocessor object at 0x7f2b6094efd0>)),
                ('clf',
                 LinearSVC(C=5.0, loss='hinge', max_iter=20000, random_state=42,
                           verbose=True))])

In [None]:
y_true, y_pred = y_valid, txt_clf2.predict(X_valid)
print(classification_report(y_true, y_pred, digits = 4))
print()

               precision    recall  f1-score   support

contradiction     0.6474    0.6464    0.6469      3278
   entailment     0.6500    0.6918    0.6703      3329
      neutral     0.6649    0.6219    0.6427      3235

     accuracy                         0.6537      9842
    macro avg     0.6541    0.6534    0.6533      9842
 weighted avg     0.6540    0.6537    0.6534      9842




In [None]:
y_pred = txt_clf2.predict(df_test.text)
df_submit2=pd.DataFrame(y_pred).rename(columns={0:"pred_labels"})
df_submit2 = df_submit2.rename_axis(index="pairID")
df_submit2

Unnamed: 0_level_0,pred_labels
pairID,Unnamed: 1_level_1
0,neutral
1,neutral
2,contradiction
3,neutral
4,neutral
...,...
9819,contradiction
9820,entailment
9821,contradiction
9822,neutral


In [None]:
df_submit2.to_csv("submission2.csv")

#### modelo 3
---
Probamos con un preprocesamiento completo pero sin stopwords

**SUBMISSION**: 0.66714

In [None]:
txt_clf3 = Pipeline([
    ('vect', CountVectorizer( preprocessor= Preprocessor(lower= True, stop = False, lem=True, punc= True, stem=True),
                             min_df = 10, max_df = 0.8, ngram_range = (1,4))),
    ('tfidf', TfidfTransformer()),
    ('clf', LinearSVC(loss='hinge', penalty='l2', random_state=42,
                    max_iter=20000, tol=0.0001, C = 5.0, fit_intercept = True, verbose = True))
])

In [None]:
txt_clf3.fit(X_train, y_train)

[LibLinear]



Pipeline(steps=[('vect',
                 CountVectorizer(max_df=0.8, min_df=10, ngram_range=(1, 4),
                                 preprocessor=<__main__.Preprocessor object at 0x7f2b70bba910>)),
                ('tfidf', TfidfTransformer()),
                ('clf',
                 LinearSVC(C=5.0, loss='hinge', max_iter=20000, random_state=42,
                           verbose=True))])

In [None]:
y_true, y_pred = y_valid, txt_clf3.predict(X_valid)
print(classification_report(y_true, y_pred, digits = 4))
print()

               precision    recall  f1-score   support

contradiction     0.6600    0.6675    0.6637      3278
   entailment     0.6683    0.6960    0.6819      3329
      neutral     0.6755    0.6389    0.6567      3235

     accuracy                         0.6678      9842
    macro avg     0.6679    0.6675    0.6674      9842
 weighted avg     0.6679    0.6678    0.6676      9842




In [None]:
y_pred = txt_clf3.predict(df_test.text)
df_submit3=pd.DataFrame(y_pred).rename(columns={0:"pred_labels"})
df_submit3 = df_submit3.rename_axis(index="pairID")
df_submit3

Unnamed: 0_level_0,pred_labels
pairID,Unnamed: 1_level_1
0,neutral
1,neutral
2,contradiction
3,contradiction
4,neutral
...,...
9819,contradiction
9820,entailment
9821,contradiction
9822,entailment


In [None]:
df_submit3.to_csv("submission3.csv")