# Use vectorization techniques

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler

from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer
from scipy.stats import pearsonr
import matplotlib.pyplot as plt

pd.set_option('display.max_columns', None)
pd.set_option('display.width', None)

from src.preprocessing import clean_func, spacy_preprocess_reviews
from src.sts import STSAnalyzer
from src.models import BaselineModel
import numpy as np
from nltk.metrics import jaccard_distance
import os
import pandas as pd
# IMport partial
import nltk

from functools import partial
from nltk.corpus import wordnet as wn
from nltk.corpus import wordnet_ic
from nltk import ngrams
brown_ic = wordnet_ic.ic('ic-brown.dat')
from nltk import download
from nltk.stem import PorterStemmer

download('averaged_perceptron_tagger')
download('wordnet')
download('omw-1.4')
download('punkt')

import spacy
nlp = spacy.load("en_core_web_sm")
from spacy_wordnet.wordnet_annotator import WordnetAnnotator 
# Load a spaCy model (supported languages are "es" and "en") 
nlp = spacy.load('en_core_web_sm')
# spaCy 3.x
nlp.add_pipe("spacy_wordnet", after='tagger')

import textdistance
from src.fe_utils import *

prep_func = partial(spacy_preprocess_reviews, clean_func=clean_func, out_set=False)
INPUT_FOLDER = "input"
TRAIN_PATH = os.path.join(INPUT_FOLDER, "train")
TEST_PATH = os.path.join(INPUT_FOLDER, "test")
LS_FILES_TRAIN = ["MSRpar", "MSRvid", "SMTeuroparl"]
LS_FILES_TEST = ["MSRpar", "MSRvid", "SMTeuroparl", "surprise.SMTnews", "surprise.OnWN"]

def train_and_test(model, grid, X_train, y_train, X_test, y_test):
    pipe = Pipeline([('scaler', StandardScaler()), ('model', model)])
    pearsonr_scorer = make_scorer(lambda x, y: pearsonr(x, y).statistic, greater_is_better=True)

    grid = GridSearchCV(pipe, grid, cv=10, scoring=pearsonr_scorer, verbose=10, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    print("Best parameters: {}".format(grid.best_params_))
    print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))

    return grid

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alberto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/alberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/alberto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package punkt to /home/alberto/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/alberto/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to /home/alberto/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/alberto/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-dat

In [5]:
import pandas as pd
sts = STSAnalyzer(preprocess_fun=prep_func, model=BaselineModel(),
                  ls_files=LS_FILES_TRAIN, input_path=TRAIN_PATH)
df = sts.load_data()

sts_test = STSAnalyzer(preprocess_fun=prep_func, model=BaselineModel(),
                  ls_files=LS_FILES_TEST, input_path=TEST_PATH)
df_test = sts_test.load_data()

In [36]:
print("- Tokenized (list)")
df_prep = df[["sent1", "sent2"]].copy()
df_prep["sent1"] = df_prep["sent1"].apply(lambda x: ' '.join(prep_func(x, out_set=False)))
df_prep["sent2"] = df_prep["sent2"].apply(lambda x: ' '.join(prep_func(x, out_set=False)))
        

- Tokenized (list)


In [48]:
df_prep_test = df_test[["sent1", "sent2"]].copy()
df_prep_test["sent1"] = df_prep_test["sent1"].apply(lambda x: ' '.join(prep_func(x, out_set=False)))
df_prep_test["sent2"] = df_prep_test["sent2"].apply(lambda x: ' '.join(prep_func(x, out_set=False)))

In [51]:
X_train = df_prep.copy()
y_train = df["gs"].copy()

X_test = df_prep_test.copy()
y_test = df_test["gs"].copy()

In [46]:
from sklearn.feature_extraction.text import TfidfVectorizer
tfidf = TfidfVectorizer()
X_sample = X_train.iloc[:10, 0]
X_sample_2 = X_train.iloc[:10, 1]
print(tfidf.fit_transform(X_train).todense())
print(tfidf.fit_transform(X_sample_2).todense())


[[1. 0.]
 [0. 1.]]
[[0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.35355339 0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 ...
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]
 [0.         0.         0.         ... 0.         0.         0.        ]]


In [133]:
def get_vectorized_distances(vectorizer, X_train, X_test):
    from sklearn.metrics.pairwise import paired_cosine_distances
    from sklearn.metrics.pairwise import paired_euclidean_distances

    X_train_s1 = X_train.iloc[:, 0]
    X_train_s2 = X_train.iloc[:, 1]

    tfidf = vectorizer
    tfidf.fit(pd.concat([X_train_s1, X_train_s2]))
    X_train_tfidf_s1 = tfidf.transform(X_train_s1).todense()
    X_train_tfidf_s2 = tfidf.transform(X_train_s2).todense()

    X_test_s1 = X_test.iloc[:, 0]
    X_test_s2 = X_test.iloc[:, 1]

    X_test_tfidf_s1 = tfidf.transform(X_test_s1).todense()
    X_test_tfidf_s2 = tfidf.transform(X_test_s2).todense()


    # print("Cosine similarity")
    df_dists_train = pd.DataFrame()
    df_dists_test = pd.DataFrame()

    cos_dist_train = paired_cosine_distances(np.asarray(X_train_tfidf_s1), np.asarray(X_train_tfidf_s2)).reshape(-1, 1)
    cos_dist_test = paired_cosine_distances(np.asarray(X_test_tfidf_s1), np.asarray(X_test_tfidf_s2)).reshape(-1, 1)

    df_dists_train["cosine"] = cos_dist_train.reshape(-1)
    df_dists_test["cosine"] = cos_dist_test.reshape(-1)

    euc_dist_train = paired_euclidean_distances(np.asarray(X_train_tfidf_s1), np.asarray(X_train_tfidf_s2)).reshape(-1, 1)
    euc_dist_test = paired_euclidean_distances(np.asarray(X_test_tfidf_s1), np.asarray(X_test_tfidf_s2)).reshape(-1, 1)

    df_dists_train["euclidean"] = euc_dist_train.reshape(-1)
    df_dists_test["euclidean"] = euc_dist_test.reshape(-1)

    return df_dists_train, df_dists_test

from sklearn.feature_extraction.text import CountVectorizer
df_tfidf_dists_train, df_tfidf_dists_test = get_vectorized_distances(CountVectorizer(), X_train, X_test)
print(df_tfidf_dists_train.head())
# rf = RandomForestRegressor(max_depth=5, n_estimators=400, random_state=0)
from sklearn.linear_model import LassoCV, LinearRegression
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
sc.fit(df_tfidf_dists_train)
df_tfidf_dists_train = sc.transform(df_tfidf_dists_train)
df_tfidf_dists_test = sc.transform(df_tfidf_dists_test)
# lr = LassoCV(alphas=np.logspace(-6, 6, 13), cv=10, max_iter=10000, fit_intercept=False)
lr = LinearRegression()
print(df_tfidf_dists_test)
lr.fit(df_tfidf_dists_train, y_train)
y_pred = lr.predict(df_tfidf_dists_test)
print(y_pred)
print(pearsonr(y_pred, y_test).statistic)

print(lr.coef_)

     cosine  euclidean
0  0.265153   2.645751
1  0.367544   2.236068
2  0.440983   2.828427
3  0.206143   2.449490
4  0.785166   4.690416
[[ 0.56038355  0.35537627]
 [ 1.36072026  1.15471945]
 [-0.27522586 -0.03439544]
 ...
 [-1.25489092 -1.52532805]
 [-1.81083664 -2.5539194 ]
 [-1.59789104 -1.52532805]]
[2.75031112 2.17061808 3.55397155 ... 4.03494266 4.22647231 4.43619899]
0.5876255340178806
[-1.16984312  0.44608543]


In [125]:
df_tfidf_dists_train.describe()

Unnamed: 0,cosine,euclidean
count,2234.0,2234.0
mean,0.48122,0.931212
std,0.264158,0.308751
min,0.0,0.0
25%,0.28933,0.760696
50%,0.442843,0.941109
75%,0.659433,1.148419
max,1.0,1.414214


In [102]:
# CV using tfidf on each of the splits and calculating distances
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.metrics.pairwise import euclidean_distances
from sklearn.metrics.pairwise import manhattan_distances
from sklearn.metrics.pairwise import paired_cosine_distances
from sklearn.metrics.pairwise import paired_euclidean_distances
from sklearn.metrics.pairwise import paired_manhattan_distances
from sklearn.metrics.pairwise import paired_distances
from sklearn.metrics.pairwise import cosine_distances


def get_tfidf_features(X_train, X_test):
    tfidf = TfidfVectorizer()
    X_train_tfidf = tfidf.fit_transform(X_train).todense()
    X_test_tfidf = tfidf.transform(X_test).todense()
    return X_train_tfidf, X_test_tfidf

from sklearn.model_selection import KFold
cv = KFold(n_splits=10, random_state=42, shuffle=True)

i = 0
ls_scores = []
for train_index, test_index in cv.split(X_train):
    print("CV split: ", i)
    X_train_s1 = X_train.iloc[train_index, 0]
    X_train_s2 = X_train.iloc[train_index, 1]

    tfidf = TfidfVectorizer()
    tfidf.fit(pd.concat([X_train_s1, X_train_s2]))
    X_train_tfidf_s1 = tfidf.transform(X_train_s1).todense()
    X_train_tfidf_s2 = tfidf.transform(X_train_s2).todense()

    X_test_s1 = X_train.iloc[test_index, 0]
    X_test_s2 = X_train.iloc[test_index, 1]

    X_test_tfidf_s1 = tfidf.transform(X_test_s1).todense()
    X_test_tfidf_s2 = tfidf.transform(X_test_s2).todense()


    # print("Cosine similarity")
    df_dists_train = pd.DataFrame()
    df_dists_test = pd.DataFrame()

    cos_dist_train = paired_cosine_distances(np.asarray(X_train_tfidf_s1), np.asarray(X_train_tfidf_s2)).reshape(-1, 1)
    cos_dist_test = paired_cosine_distances(np.asarray(X_test_tfidf_s1), np.asarray(X_test_tfidf_s2)).reshape(-1, 1)

    df_dists_train["cosine"] = cos_dist_train.reshape(-1)
    df_dists_test["cosine"] = cos_dist_test.reshape(-1)

    # euc_dist_train = paired_euclidean_distances(np.asarray(X_train_tfidf_s1), np.asarray(X_train_tfidf_s2)).reshape(-1, 1)
    # euc_dist_test = paired_euclidean_distances(np.asarray(X_test_tfidf_s1), np.asarray(X_test_tfidf_s2)).reshape(-1, 1)

    # df_dists_train["euclidean"] = euc_dist_train.reshape(-1)
    # df_dists_test["euclidean"] = euc_dist_test.reshape(-1)


    rf = RandomForestRegressor(max_depth=2)
    rf.fit(df_dists_train, y_train.iloc[train_index])
    # print(y_train.iloc[train_index])
    y_pred = rf.predict(df_dists_test)
    # print(y_pred)
    score = pearsonr(y_pred, y_train.iloc[test_index].values).statistic
    ls_scores.append(score)

    i += 1

print(np.mean(ls_scores))

CV split:  0
CV split:  1
CV split:  2
CV split:  3
CV split:  4
CV split:  5
CV split:  6
CV split:  7
CV split:  8
CV split:  9
0.7086769724077565


In [101]:
(pd.concat([df_dists_train, y_train], axis=1)).corr()

Unnamed: 0,cosine,euclidean,gs
cosine,1.0,0.959999,-0.714074
euclidean,0.959999,1.0,-0.650376
gs,-0.714074,-0.650376,1.0


In [100]:
df_dists_train, df_dists_test = get_vectorized_distances(TfidfVectorizer(), X_train, y_train, X_test, y_test)
rf = RandomForestRegressor(max_depth=5)
rf.fit(df_dists_train, y_train)
y_pred = rf.predict(df_dists_test)
print(pearsonr(y_pred, y_test).statistic)


0.6558647523668577


In [None]:
rf = RandomForestRegressor(max_depth=2)
rf.fit(df_dists_train, y_train.iloc[train_index])
# print(y_train.iloc[train_index])
y_pred = rf.predict(df_dists_test)
# print(y_pred)
score = pearsonr(y_pred, y_train.iloc[test_index].values).statistic

In [52]:
from sklearn.feature_extraction.text import TfidfVectorizer

def train_and_test_tfidf(model, grid, X_train, y_train, X_test, y_test):
    pipe = Pipeline([('vectorizer', TfidfVectorizer()), ('model', model)])
    pearsonr_scorer = make_scorer(lambda x, y: pearsonr(x, y).statistic, greater_is_better=True)

    grid = GridSearchCV(pipe, grid, cv=10, scoring=pearsonr_scorer, verbose=10, n_jobs=-1)
    grid.fit(X_train, y_train)

    print("Best cross-validation score: {:.2f}".format(grid.best_score_))
    print("Best parameters: {}".format(grid.best_params_))
    print("Test set score: {:.2f}".format(grid.score(X_test, y_test)))

    return grid

train_and_test_tfidf(DecisionTreeRegressor(), {"model__max_depth": [10]}, 
                     X_train.iloc[:10, 0], df["gs"].iloc[:10], X_test.iloc[:10, 0], df_test["gs"].iloc[:10])



Fitting 10 folds for each of 1 candidates, totalling 10 fits
[CV 8/10; 1/1] START model__max_depth=10........................................[CV 5/10; 1/1] START model__max_depth=10........................................

[CV 10/10; 1/1] START model__max_depth=10.......................................
[CV 6/10; 1/1] START model__max_depth=10........................................
[CV 2/10; 1/1] START model__max_depth=10........................................
[CV 5/10; 1/1] END ...........model__max_depth=10;, score=nan total time=   0.0s
[CV 8/10; 1/1] END ...........model__max_depth=10;, score=nan total time=   0.0s
[CV 9/10; 1/1] START model__max_depth=10........................................
[CV 10/10; 1/1] END ..........model__max_depth=10;, score=nan total time=   0.0s
[CV 2/10; 1/1] END ...........model__max_depth=10;, score=nan total time=   0.0s
[CV 6/10; 1/1] END ...........model__max_depth=10;, score=nan total time=   0.0s
[CV 9/10; 1/1] END ...........model__max_depth=1

Traceback (most recent call last):
  File "/home/alberto/Documentos/GitHub/NLP_SemanticTextualSimilarity/.venv/lib/python3.10/site-packages/sklearn/model_selection/_validation.py", line 810, in _score
    scores = scorer(estimator, X_test, y_test)
  File "/home/alberto/Documentos/GitHub/NLP_SemanticTextualSimilarity/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 266, in __call__
    return self._score(partial(_cached_call, None), estimator, X, y_true, **_kwargs)
  File "/home/alberto/Documentos/GitHub/NLP_SemanticTextualSimilarity/.venv/lib/python3.10/site-packages/sklearn/metrics/_scorer.py", line 355, in _score
    return self._sign * self._score_func(y_true, y_pred, **scoring_kwargs)
  File "/tmp/ipykernel_3968/878468421.py", line 5, in <lambda>
  File "/home/alberto/Documentos/GitHub/NLP_SemanticTextualSimilarity/.venv/lib/python3.10/site-packages/scipy/stats/_stats_py.py", line 4768, in pearsonr
    raise ValueError('x and y must have length at least 2.')
Val