## Taller 2 
### Brayan José Calderón Amorocho
![](1.PNG)

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import svm
from sklearn.model_selection import GridSearchCV
import matplotlib.pyplot as plt
from sklearn.metrics.pairwise import chi2_kernel



In [2]:
def clean_db(db,lang):
    db["len"] = db["palabra"].str.len()
    db.drop('frecuencia', inplace=True, axis=1)
    db = db[db["len"] >= 4]
    db["palabra"] = db['palabra'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')        
    if lang == "es":
        db["lang"] = "es"
    else:
        db["lang"] = "en"
    return db

In [3]:
es_db = pd.read_csv("es_50k.txt",sep = " ")
es_db.shape

(50000, 2)

In [4]:
es_db[:5]

Unnamed: 0,palabra,frecuencia
0,de,14459520
1,que,14421005
2,no,12379505
3,a,9549646
4,la,9125471


In [5]:
en_db = pd.read_csv("en_50k.txt",sep = " ")
en_db.shape

(50000, 2)

In [6]:
es_df = clean_db(es_db,"es")
en_df = clean_db(en_db,"en")

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db["palabra"] = db['palabra'].str.normalize('NFKD').str.encode('ascii', errors='ignore').str.decode('utf-8')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db["lang"] = "es"
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  db["palabra"] = db['palabra'].str.normalize('NFKD').str.encode('ascii', errors

In [7]:
print(es_df.head(3))
print(en_df.head(3))

   palabra  len lang
19    para  4.0   es
20    esta  4.0   es
22    pero  4.0   es
   palabra  len lang
7     that  4.0   en
11    what  4.0   en
14    this  4.0   en


In [8]:
words_df = pd.concat([es_df[:4000], en_df[:4000]], ignore_index=True)
print(words_df.head(5),words_df.shape)

  palabra  len lang
0    para  4.0   es
1    esta  4.0   es
2    pero  4.0   es
3    bien  4.0   es
4    como  4.0   es (8000, 3)


In [9]:
X = words_df["palabra"]
#Realizo Onehot Encoding para el lenguage
y = words_df["lang"]
Y = list(y)
Y = np.array([1 if y == "en" else 0 for y in Y])

In [10]:
# Split the data with training and testing
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.33, random_state=42)
x_train.shape , x_test.shape , y_train.shape , y_test.shape

((5360,), (2640,), (5360,), (2640,))

![](2.PNG)

## Histogram Cosine Kernel

In [11]:
#Instanciamos CountVectorizer
vectorizer = CountVectorizer(ngram_range = (1,1), analyzer = 'char')

#Alimentamos al objeto vectorizer con los datos de train
X1 = vectorizer.fit_transform(x_train)

print(X1.shape)
X1.toarray()

(5360, 27)


array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 0, 0],
       ...,
       [0, 1, 0, ..., 0, 0, 0],
       [0, 1, 0, ..., 0, 1, 0],
       [0, 0, 0, ..., 0, 1, 0]])

In [12]:
#Calculamos y guardamos los n-grams (2,2)(2,3)(2,4) en una lista que está conformada
# los datos de train y los datos de test
n_grams_train =  []
n_grams_test  =  []
for n in range(2,5):
    vectorizer = CountVectorizer(ngram_range = (1,n), analyzer = "char")
    n_grams_train.append(vectorizer.fit_transform(x_train))
    n_grams_test.append(vectorizer.transform(x_test))

In [13]:
#Kernel utilizando cosine similarity utilizando el primer n-grams
K_cosine = cosine_similarity(X1)
K_cosine.shape

(5360, 5360)

In [14]:
K_cosine[:3,:3]

array([[1.        , 0.31622777, 0.53033009],
       [0.31622777, 1.        , 0.2236068 ],
       [0.53033009, 0.2236068 , 1.        ]])

In [15]:
#Aplicamos el kernel para la los n-grams obtenidos anteriormente
Kcosine_n = [cosine_similarity(gram) for gram in n_grams_train]
Kcosine_n[1][:3]

array([[1.        , 0.15294382, 0.24253563, ..., 0.11111111, 0.        ,
        0.        ],
       [0.15294382, 1.        , 0.11128298, ..., 0.15294382, 0.06131393,
        0.07647191],
       [0.24253563, 0.11128298, 1.        , ..., 0.16169042, 0.19446112,
        0.        ]])

## Histogram Intersection

In [16]:
#https://github.com/gmum/pykernels/blob/master/pykernels/regular.py
def compute_hist_kernel(h1,h2):
    h1 = normalize(h1)
    h2 = normalize(h2)
    kernel = np.zeros((h1.shape[0],h2.shape[0]))
    for d in range(h1.shape[1]):
            column_1 = h1[:, d].reshape(-1, 1)
            column_2 = h2[:, d].reshape(-1, 1)
            kernel += np.minimum(column_1, column_2.T)
    return kernel
def normalize(h):
    return h/np.sum(h, axis = 1)
    

In [123]:
#Normalizamos valores utilizando el bag of words calculado anteriormente
N1 = X1/np.sum(X1, axis = 1)
K_hist = compute_hist_kernel(N1,N1)





    

In [None]:
Khist_n = [compute_hist_kernel(gram, gram) for gram in n_grams_train]

## 3. χ2 kernel

In [17]:
K = chi2_kernel(n_grams_train[0].todense(), n_grams_train[0].todense())



In [18]:
K.shape

(5360, 5360)

In [None]:
K3_n = [chi2_kernel(gram.todense(),gram.todense()) for gram in n_grams_train]



## SSK Kernel
Use the code available at this repository https://github.com/helq/python-ssk


In [1]:
!pip install cython
%load_ext Cython



## Problem 1(c)

Use scikit-learn to train different SVMs using precomputed kernels. Use cross validation to find appropriate regularization parameters plotting the training and validation error vs. the regularization parameter. Use a logarithmic scale for $C$, $\{2^{-15} ,2^{-14},...,2^{10}\}$. Try different configurations of the parameters (in particular different $n$ values for the $n$-grams).

In [31]:

#https://scikit-learn.org/stable/auto_examples/svm/plot_svm_scale_c.html

def get_scores(kernel, Y, C):
    '''
    kernel: Precomputed kernel,
    Y: target train
    C: set of regularization parameters
    '''
    _clf = svm.SVC(kernel='precomputed')
    _grid = GridSearchCV(_clf, cv=2, param_grid={'C': C}, return_train_score=True)
    _grid.fit(kernel, Y)
    _scores = _grid.cv_results_['mean_test_score']
    max_train = np.argmax(_grid.cv_results_['mean_train_score'])
    max_test = np.argmax(_grid.cv_results_['mean_test_score'])
    return {
        'train': {
        'scores': _grid.cv_results_['mean_train_score'],
        'error': 1 - _grid.cv_results_['mean_train_score'],
        'C': C[max_train],
        'score':  _grid.cv_results_['mean_train_score'][max_train]
      },
    'validation': {
        'scores': _grid.cv_results_['mean_test_score'],
        'error': 1 - _grid.cv_results_['mean_test_score'],
        'C': C[max_test],
        'score':  _grid.cv_results_['mean_test_score'][max_test]
      }
  }

def plot_rp(d, kernel_name, n_gram, cs):
    '''
    d: Results from get_scores
    kernel_name: Title to plot
    n_gram: n_gram
    '''
    fig, axes = plt.subplots( figsize=(10, 6))
    axes.set_xlabel('Regularization Parameter')
    axes.set_ylabel('Error')
    axes.set_title(f'Kernel: {kernel_name}, {n_gram}-gram')
    x_box = min(min(d['train']['error']), min(d['validation']['error']))
    axes.text(2 ** (-14), x_box, f"Best Regularization parameter:\nTrain C : {d['train']['C']}, Error: {round(1 -  d['train']['score'],4)}\nTest  C : {d['validation']['C']}, Error: {round(1 -  d['validation']['score'],4)}",
        bbox={'facecolor': 'white', 'alpha': 0.5, 'pad': 10})
    colors = ['navy', 'red', 'darkorange']
    axes.semilogx(cs, d['train']['error'], color='red', label=f'Train')
    axes.semilogx(cs, d['validation']['error'], color='navy', label=f'Validation')  
    plt.legend()
    plt.show()


def reg_parameters(title, kernels, Y):
    '''
    title: Title for plot
    kernels: Set of precomputed kernels
    Y: Target
    '''
    cs = [2**(i) for i in range(-15, 11)]
    for idx, K in enumerate(kernels):
        scores = get_scores(K, Y, cs)
        plot_rp(scores, title, idx + 2, cs)

In [32]:
reg_parameters('Cosine Similarity', Kcosine_n, y_train)

KeyboardInterrupt: 

## Problem 1(d)

Evaluate the performance of the SVMs in the test data set:
* Report the results in a table for the different evaluated configurations.
* Illustrate examples of errors (English words mistaken as Spanish, Spanish words
mistaken as English). Give a possible explanation for these mistakes.
* Discuss the results.