## Libraries

In [88]:
!pip install -q spacy nltk numpy pandas scikit-learn pyjarowinkler lazypredict

In [263]:
import os
import re
import nltk
import spacy
import string
import numpy as np
import pandas as pd

from tqdm import tqdm
from itertools import chain
from functools import partial
from argparse import Namespace
from pyjarowinkler import distance

from nltk.wsd import lesk
from nltk.stem import WordNetLemmatizer
from nltk.metrics import jaccard_distance
from nltk.corpus import stopwords, wordnet

from scipy.stats import pearsonr

from sklearn import linear_model
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler

from lazypredict.Supervised import LazyClassifier

nltk.download('wordnet')
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')
nltk.download('omw-1.4')
nltk.download('gutenberg')

[nltk_data] Downloading package wordnet to /home/rob/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to /home/rob/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /home/rob/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/rob/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package omw-1.4 to /home/rob/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package gutenberg to /home/rob/nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


True

## Download data

In [None]:
#!curl -o /content/drive/MyDrive/Colab_Notebooks/2.IHLT/final_project/trial.tgz https://gebakx.github.io/ihlt/sts/resources/trial.tgz
#!curl -o /content/drive/MyDrive/Colab_Notebooks/2.IHLT/final_project/train.tgz https://gebakx.github.io/ihlt/sts/resources/train.tgz
#!curl -o /content/drive/MyDrive/Colab_Notebooks/2.IHLT/final_project/test-gold.tgz https://gebakx.github.io/ihlt/sts/resources/test-gold.tgz

  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  2003  100  2003    0     0  47690      0 --:--:-- --:--:-- --:--:-- 47690
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  122k  100  122k    0     0   505k      0 --:--:-- --:--:-- --:--:--  503k
  % Total    % Received % Xferd  Average Speed   Time    Time     Time  Current
                                 Dload  Upload   Total   Spent    Left  Speed
100  115k  100  115k    0     0   345k      0 --:--:-- --:--:-- --:--:--  345k


# Bring data

In [15]:
!tar zxvf ../final_project/train.tgz
!tar zxvf ../final_project/trial.tgz
!tar zxvf ../final_project/test-gold.tgz

!rm ../final_project/train.tgz
!rm ../final_project/test-gold.tgz 
!rm ../final_project/trial.tgz


train/
train/00-readme.txt
train/STS.output.MSRpar.txt
train/STS.input.SMTeuroparl.txt
train/STS.input.MSRpar.txt
train/STS.gs.MSRpar.txt
train/STS.input.MSRvid.txt
train/STS.gs.MSRvid.txt
train/correlation.pl
train/STS.gs.SMTeuroparl.txt
trial/
trial/STS.input.txt
trial/00-readme.txt
trial/STS.gs.txt
trial/STS.ouput.txt
test-gold/
test-gold/STS.input.MSRpar.txt
test-gold/STS.gs.MSRpar.txt
test-gold/STS.input.MSRvid.txt
test-gold/STS.gs.MSRvid.txt
test-gold/STS.input.SMTeuroparl.txt
test-gold/STS.gs.SMTeuroparl.txt
test-gold/STS.input.surprise.SMTnews.txt
test-gold/STS.gs.surprise.SMTnews.txt
test-gold/STS.input.surprise.OnWN.txt
test-gold/STS.gs.surprise.OnWN.txt
test-gold/STS.gs.ALL.txt
test-gold/00-readme.txt


# Usesful functions

In [127]:
from typing import List


def apply_jaccard_lesk(sentence1: str, sentence2: str):

  # Apply lesk to sentence 1
  synset1 = [ lesk(sentence1, word) for word in sentence1 ]
  synset1 = { word for word in synset1 if word is not None }

  # Apply lesk to sentence 1
  synset2 = [ lesk(sentence2, word) for word in sentence2 ]
  synset2 = { word for word in synset2 if word is not None }

  # Calculate distance
  distance = jaccard_distance(synset1, synset2)

  return distance

# ------------------------------ #
# Lemmatization text process
# ------------------------------ #

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"NN": "n",
                "NNS": "n",
                "NNP": "n",
                "NNPS": "n",
                "VB": "v",
                "VBD": "v",
                "VBG": "v",
                "VBN": "v",
                "VBP": "v",
                "VBZ": "v",
                "RB": "r",
                "RBR": "r",
                "RBS": "r",
                "JJ": "a",
                "JJR": "a",
                "JJS": "a",}
        
    return tag_dict.get(tag, wordnet.NOUN)


def lemmatize(column):
  lemmas = []
  for sentence in tqdm(column):
    sentence_lemmas = []
    for word in nltk.word_tokenize(sentence):
      sentence_lemmas.append(lemmatizer.lemmatize(word.lower(), get_wordnet_pos(word.lower())))
    lemmas.append(sentence_lemmas)
  return lemmas

  #return  [ list(lemmatizer.lemmatize(word.lower(), get_wordnet_pos(word.lower())) for word in nltk.word_tokenize(sentence)) for sentence in column ]  

# ------------------------------ #
#   Stopwords initialization
# ------------------------------ #
stopwords = nltk.corpus.stopwords.words("english")
stopwords[:10]
stopwords += string.punctuation
stopwords += ['.', ',', ';', '."']

# ------------------------------ #
# Similarity Function
# ------------------------------ #
def jaccard_similarity(s1: List[str], s2: List[str]):
    s1 = set(s1)
    s2 = set(s2)
    intersection = len(s1.intersection(s2))
    union = len(s1) + len(s2) - intersection
    return float(intersection) / float(union)


def jaccard_similarity_list(s1: List[List[str]], s2: List[List[str]]):
    return np.array(list(map(jaccard_similarity, s1, s2)))


def calculateJarowinklerSimilarity(dataframe, column1, column2):

  aux = []
  for row in dataframe.itertuples():
    
    # Longest one selected
    if len(row[column1]) >= len(row[column2]):
      sentence1 = row[column1]
      sentence2 = row[column2]
    else:
      sentence1 = row[column2]
      sentence2 = row[column1]

    similarities_array = []
    for word1 in sentence1:
      max = 0

      for word2 in sentence2:
        similarity = distance.get_jaro_distance(str(word1), str(word2), winkler=True, scaling=0.1)
           
        if max < similarity:
          max = similarity
        
      similarities_array.append(max)

    aux.append(np.array(similarities_array).mean())

  return aux


# ------------------------------ #
# Lemmatization text process
# ------------------------------ #

lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

# Pre-processing

### Data information
- trial : includes the definition of the scores, a sample of 5 sentence pairs and the input and output formats. It is not needed, but it is useful for prototyping.

- train : training data from paraphrasing data sets, input and output formats.

- test : test data from paraphrasing data sets.

In [71]:
train_path = '../final_project/train'
trial_path = '../final_project/trial'
test_path  = '../final_project/test-gold'

In [210]:
def read_data(text_datas: list[str], gs_datas: list[str]):
  all_df_text = []
  for text_data, gs_data in zip(text_datas, gs_datas):
    df_text = pd.read_csv(text_data, sep=r'\t', engine='python', header=None)
    df_text.columns = ["text1", "text2"]
    df_text['gs'] = pd.read_csv(gs_data, sep='\t', header=None)
    all_df_text.append(df_text.dropna())
  return pd.concat(all_df_text)

def get_dataset(path: str) -> pd.DataFrame:
  files = sorted(os.listdir(path))
  input_files = [ os.path.join(path, file) for file in files if 'input' in file ]
  gs_files = [ os.path.join(path, file) for file in files if 'gs' in file ]
  df = read_data(input_files, gs_files)
  return df

# **Similarities**

In [211]:
tokenized_text1 = [nltk.word_tokenize(phrase) for phrase in df['text1']]


In [112]:
syn = wordnet.synsets('hello')

In [113]:
print(syn)

[Synset('hello.n.01')]


In [173]:
from nltk.corpus import wordnet
#syn = wordnet.synsets('hello')[0]

synset = [wordnet.synsets(phrase)[0] for phrase in tqdm(df['text1'])]
print(synset[0])


  0%|          | 0/2234 [00:00<?, ?it/s]

IndexError: list index out of range

In [139]:
def get_features(df: pd.DataFrame):

    #--------------------------------------------#
    # 1. Tokenize features
    #--------------------------------------------#
    print("Tokenize features")
    tokenized_text1 = [nltk.word_tokenize(phrase) for phrase in tqdm(df['text1'])]
    tokenized_text2 = [nltk.word_tokenize(phrase) for phrase in tqdm(df['text2'])]

    #--------------------------------------------#
    # 2. Lemmatize features
    #--------------------------------------------#
    print("Lemmatize features")
    lemmatize_text1 = lemmatize(df['text1'])
    lemmatize_text2 = lemmatize(df['text2'])

    #--------------------------------------------#
    # 3. Stopwords features
    #--------------------------------------------#
    print("Stopwords features")
    stopwords_text1 = []
    stopwords_text2 = []

    for i in tqdm(range(len(tokenized_text1))):
        stopwords_text1.append([w for w in lemmatize_text1[i] if not w.lower() in stopwords])
        stopwords_text2.append([w for w in lemmatize_text2[i] if not w.lower() in stopwords])

    #--------------------------------------------#
    # 4. NLTK Words features
    #--------------------------------------------#
    print("NLTK Words features")

    

    #--------------------------------------------#
    # 5. Synset features
    #--------------------------------------------#
    print("Synset features")

    #--------------------------------------------#
    # 6. Spacy words features
    #--------------------------------------------#
    print("Spacy words features")

    #--------------------------------------------#
    # 7. Ngrams features
    #--------------------------------------------#
    print("Ngrams features")

    #--------------------------------------------#
    # 8.Word synonyms features
    #--------------------------------------------#
    print("Word synonyms features")


    features = [
        jaccard_similarity_list(tokenized_text1, tokenized_text2),
        jaccard_similarity_list(lemmatize_text1, lemmatize_text2),
        jaccard_similarity_list(stopwords_text1, stopwords_text2),
    ]
    return np.array(features)

Tokenize features


100%|██████████| 2234/2234 [00:00<00:00, 6714.01it/s]
100%|██████████| 2234/2234 [00:00<00:00, 7651.85it/s]


Lemmatize features


100%|██████████| 2234/2234 [00:05<00:00, 441.02it/s]
100%|██████████| 2234/2234 [00:04<00:00, 476.19it/s] 


Stopwords features


100%|██████████| 2234/2234 [00:00<00:00, 15164.63it/s]


NLTK Words features
Synset features
Spacy words features
Ngrams features
Word synonyms features


array([[0.5483871 , 0.42105263, 0.33333333, ..., 1.        , 0.56666667,
        0.44444444],
       [0.5483871 , 0.42105263, 0.34782609, ..., 1.        , 0.56666667,
        0.44444444],
       [0.5       , 0.46153846, 0.33333333, ..., 1.        , 0.52941176,
        0.16666667]])

# **Training**

## Get training dataset

In [212]:
train_dataset = get_dataset(train_path)
print(train_dataset.shape)
train_dataset.head()

(2234, 3)


Unnamed: 0,text1,text2,gs
0,But other sources close to the sale said Viven...,But other sources close to the sale said Viven...,4.0
1,Micron has declared its first quarterly profit...,Micron's numbers also marked the first quarter...,3.75
2,The fines are part of failed Republican effort...,"Perry said he backs the Senate's efforts, incl...",2.8
3,"The American Anglican Council, which represent...","The American Anglican Council, which represent...",3.4
4,The tech-loaded Nasdaq composite rose 20.96 po...,The technology-laced Nasdaq Composite Index <....,2.4


In [213]:
y_train = train_dataset['gs'].values
y_train.shape

(2234,)

## Get features of the training dataset

In [214]:
X_train_features: np.ndarray = get_features(train_dataset)
X_train_features.shape

Tokenize features


100%|██████████| 2234/2234 [00:00<00:00, 7156.03it/s]
100%|██████████| 2234/2234 [00:00<00:00, 6807.93it/s]


Lemmatize features


100%|██████████| 2234/2234 [00:04<00:00, 452.73it/s]
100%|██████████| 2234/2234 [00:04<00:00, 461.01it/s] 


Stopwords features


100%|██████████| 2234/2234 [00:00<00:00, 15124.47it/s]

NLTK Words features
Synset features
Spacy words features
Ngrams features
Word synonyms features





(3, 2234)

# **Testing**

## Get the test dataset

In [215]:
test_dataset = get_dataset(test_path)
print(test_dataset.shape)
test_dataset.head()

(2817, 3)


Unnamed: 0,text1,text2,gs
0,The problem likely will mean corrective change...,He said the problem needs to be corrected befo...,4.4
1,The technology-laced Nasdaq Composite Index .I...,The broad Standard & Poor's 500 Index .SPX inc...,0.8
2,"""It's a huge black eye,"" said publisher Arthur...","""It's a huge black eye,"" Arthur Sulzberger, th...",3.6
3,SEC Chairman William Donaldson said there is a...,"""I think there's a building confidence that th...",3.4
4,Vivendi shares closed 1.9 percent at 15.80 eur...,"In New York, Vivendi shares were 1.4 percent d...",1.4


## Get features of the test dataset

In [216]:
X_test_features: np.ndarray = get_features(test_dataset)
X_test_features.shape

Tokenize features


100%|██████████| 2817/2817 [00:00<00:00, 9272.33it/s]
100%|██████████| 2817/2817 [00:00<00:00, 9510.47it/s]


Lemmatize features


100%|██████████| 2817/2817 [00:04<00:00, 622.26it/s] 
100%|██████████| 2817/2817 [00:04<00:00, 622.94it/s] 


Stopwords features


100%|██████████| 2817/2817 [00:00<00:00, 16974.67it/s]

NLTK Words features
Synset features
Spacy words features
Ngrams features
Word synonyms features





(3, 2817)

In [217]:
y_test = test_dataset['gs'].values
y_test.shape

(2817,)

# Normalize all features

In [275]:
# Normalize the data

scaler = StandardScaler()
y_test_norm = scaler.fit_transform(y_test.reshape((len(y_test), 1)))[:, 0]
y_train_norm = scaler.fit_transform(y_train.reshape((len(y_train), 1)))[:, 0]
print(y_test_norm.shape)
print(y_train_norm.shape)
print(y_test_norm)
print(y_test_norm)

(2817,)
(2234,)
[ 0.84951116 -2.22262725  0.16681373 ...  0.72150539  0.08147656
  0.08147656]
[ 0.84951116 -2.22262725  0.16681373 ...  0.72150539  0.08147656
  0.08147656]


## Train the model

In [276]:
# Print all shapes
print("X_train_features shape: ", X_train_features.T.shape)
print("y_train shape: ", y_train_norm.shape)
print("X_test_features shape: ", X_test_features.T.shape)
print("y_test shape: ", y_test_norm.shape)

X_train_features shape:  (2234, 3)
y_train shape:  (2234,)
X_test_features shape:  (2817, 3)
y_test shape:  (2817,)


In [277]:
# Create a linear regression
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
reg.fit(X_train_features.T, y_train_norm)
print(reg.score(X_train_features.T, y_train_norm))

0.4072025229562516


In [278]:
reg.score(X_test_features.T, y_test_norm)

-0.5307562886856259

In [279]:
from lazypredict.Supervised import LazyRegressor
from sklearn.model_selection import train_test_split
from sklearn import datasets

# fit all models
reg = LazyRegressor(predictions=True)
regresion_models, regresion_predictions = reg.fit(X_train_features.T, X_test_features.T, y_train_norm, y_test_norm)

 67%|██████▋   | 28/42 [00:28<00:03,  4.38it/s]

In [None]:
regresion_models

Unnamed: 0_level_0,Adjusted R-Squared,R-Squared,RMSE,Time Taken
Model,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
DummyRegressor,-0.02,-0.02,1.18,0.01
LassoLars,-0.02,-0.02,1.18,0.01
Lasso,-0.02,-0.02,1.18,0.02
QuantileRegressor,-0.03,-0.03,1.19,268.13
ElasticNet,-0.11,-0.11,1.24,0.01
TweedieRegressor,-0.41,-0.41,1.39,0.01
RANSACRegressor,-0.56,-0.56,1.46,0.13
PoissonRegressor,-0.61,-0.6,1.48,0.01
PassiveAggressiveRegressor,-0.77,-0.77,1.56,0.01
AdaBoostRegressor,-0.78,-0.78,1.56,0.05
