# Feature Extraction

This notebook is mostly based on both Wei Xin's features and Keith's features

Features
```
1. is_equal
2. question_len
3. longest_common_substring
4. gestalt_ratio
5. levenshtein_ratio
6. jaro_ratio
7. jaro_winkler_ratio
8. bigram_similarity
9. trigram_similarity
10. count_similarity
11. tfidf_similarity
12. word2vec_similarity
13. is_same_category
14. shared_word_percentage
15. word_mover_distance
16. question1_distilbert_vec
17. question2_distilbert_vec
18. quora_distilbert_similarity
```


Import libraries

In [None]:
import pandas as pd
import numpy as np
import scipy

# For data preprocessing
import string
import re
import nltk
import spacy
import gensim
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# For gestalt pattern matching
from difflib import SequenceMatcher

# For n-gram similarity
import ngram

# For levenshtein similarity
import Levenshtein

# For CountVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# For TfidfVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer

# For Bert
from sentence_transformers import SentenceTransformer, util

Download nltk libraries

In [None]:
# Download the complete NLTK data collection; the cells below rely on it for
# word_tokenize, the English stop-word list and the WordNet lemmatizer.
nltk.download('all')

Load Data

In [None]:
# Load the training set from CSV; later cells read its question1, question2
# and is_duplicate columns.
data = pd.read_csv("train.csv")

In [None]:
# Load a previously pickled DataFrame; this rebinds `data`, replacing whatever
# the CSV cell above loaded.
# NOTE(review): the export cell at the end of the notebook writes to this same
# path, so this presumably resumes from an earlier run -- confirm.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
data = pd.read_pickle("train_preprocess_final.pkl")

Word2Vec

In [None]:
import gensim.downloader

# Load the 'word2vec-google-news-300' vectors through gensim's downloader.
# `embedding` is used below for the word2vec similarity and word mover distance.
embedding = gensim.downloader.load('word2vec-google-news-300')

## Data Cleanup

We drop rows with missing values because a question without a corresponding pair is not useful for identifying duplicate questions.

In [None]:
# Discard every row that has a missing value in any column.
data = data.dropna()

### Standardised data

Tokenize data

In [None]:
# Split each raw question string into a list of tokens with NLTK.
data['question1_tokenised'] = data['question1'].apply(nltk.word_tokenize)
data['question2_tokenised'] = data['question2'].apply(nltk.word_tokenize)

Standardize data

```
1. tokenize
2. convert to lower case
3. remove stop words
4. remove special characters
5. lemmatize
```

In [None]:
# English stop words, held in a set for fast membership tests in standardise().
stopwords_set = set(stopwords.words('english'))
# NOTE(review): `lemmatizer` is not referenced in the visible cells;
# standardise() uses `wordnet_lemmatizer` defined below.
lemmatizer = WordNetLemmatizer()

# https://www.machinelearningplus.com/nlp/lemmatization-examples-python/

# Lemmatizer instance used by standardise().
wordnet_lemmatizer = WordNetLemmatizer()

def remove_special_characters(s: str) -> str:
  """Return *s* with everything except ASCII letters, digits and spaces removed."""
  special_characters = re.compile('[^A-Za-z0-9 ]+')
  return special_characters.sub('', s)


def standardise(tokenized_sentence):
  """Normalise a tokenised question into a list of clean, lemmatised words.

  Each token is lower-cased, skipped if it is an English stop word, stripped of
  special characters and finally lemmatised.

  Tokens that are empty after stripping (pure punctuation such as "?") are
  dropped. Otherwise they would survive as '' entries, be counted as shared
  "words" by the downstream features and leave stray spaces in the joined string.

  Args:
    tokenized_sentence: iterable of string tokens.

  Returns:
    A list of standardised, non-empty words in their original order.
  """
  result = []
  for token in tokenized_sentence:
    word = token.lower()
    if word in stopwords_set:
      continue
    word = remove_special_characters(word)
    if not word:
      # Nothing left after removing punctuation: not a word.
      continue
    result.append(wordnet_lemmatizer.lemmatize(word))
  return result

# Standardised token lists for both questions.
data['question1_standardised'] = data['question1_tokenised'].apply(standardise)
data['question2_standardised'] = data['question2_tokenised'].apply(standardise)

# Space-joined string form of the standardised tokens, used by the string-based features.
data['question1_standardised_str'] = data['question1_standardised'].apply(lambda x : ' '.join(x))
data['question2_standardised_str'] = data['question2_standardised'].apply(lambda x : ' '.join(x))

Drop rows that have empty strings after removing stop words

In [None]:
# Turn every empty-string cell into NaN and drop those rows, which removes
# pairs whose joined standardised string is empty.
# NOTE(review): the replacement is applied to all columns, not only the
# *_standardised_str ones.
data = data.replace('', np.nan).dropna()

## Extract Feature

### Is Equal

This feature is a binary variable indicating whether question1_standardised and question2_standardised are equal. Questions are most likely duplicates if they are equal. This feature is useful for a decision tree as it leads to high information gain.

In [None]:
# True where both questions have identical standardised token lists.
is_equal = data['question1_standardised'] == data['question2_standardised']
data['is_equal'] = is_equal

In [None]:
# Share of the exactly-equal pairs that are labelled as duplicates.
"Probability that question1 and question2 are duplicates if they are equal: {:.2%}".format(
  data.loc[is_equal, 'is_duplicate'].sum() / len(data.loc[is_equal])
)

'Probability that question1 and question2 are duplicates if they are equal: 78.63%'

### Question Len

This feature is a variable indicating the ratio of the length of the shorter question to the length of the longer question. If the two questions are too far apart in length, they are unlikely to be a duplicate question as the shorter question likely has too little detail.

In [None]:
def question_len(row) -> float:
  """Return the token-count ratio of the shorter question to the longer one.

  Args:
    row: mapping with 'question1_tokenised' and 'question2_tokenised' token lists.

  Returns:
    A value in [0, 1]; 0.0 when both questions have no tokens, because the
    ratio is undefined in that case.
  """
  q1_len = len(row['question1_tokenised'])
  q2_len = len(row['question2_tokenised'])
  longer = max(q1_len, q2_len)
  if longer == 0:
    # Both questions are empty: avoid dividing by zero.
    return 0.0
  return min(q1_len, q2_len) / longer

# Evaluate question_len on every row and store the result as a feature column.
data['question_len'] = data.apply(question_len, axis=1)

### Longest Common substring

This feature is a variable indicating the ratio of the length of the longest common substring (measured in standardised tokens) to the maximum of the lengths of question1 and question2, i.e., `lcs(question1, question2) / max(len(question1), len(question2))`. Two questions are likely to be duplicates if they have a long common substring. We normalise the length of the longest common substring by dividing it by the maximum length of the two questions so that the feature lies between 0 and 1, which makes it feasible for distance-based algorithms such as kNN or SVM.

In [None]:
def longest_common_substring(row) -> float:
  """Return the normalised length of the longest common run of tokens.

  The run must be contiguous in both questions. Its length is divided by the
  length of the longer question, so identical questions score 1.0.

  Runs that reach the end of either question are counted. The previous version
  only recorded a run when it hit a mismatch, so identical questions scored 0.

  Args:
    row: mapping with 'question1_standardised' and 'question2_standardised'
      token lists.

  Returns:
    A value in [0, 1]; 0.0 when both questions are empty.
  """
  question1 = row['question1_standardised']
  question2 = row['question2_standardised']
  denominator = max(len(question1), len(question2))
  if denominator == 0:
    return 0.0

  longest = 0
  # previous_row[j] is the length of the common run ending at the previous
  # token of question1 and at question2[j - 1].
  previous_row = [0] * (len(question2) + 1)
  for token1 in question1:
    current_row = [0]
    for j, token2 in enumerate(question2, start=1):
      run = previous_row[j - 1] + 1 if token1 == token2 else 0
      current_row.append(run)
      if run > longest:
        longest = run
    previous_row = current_row
  return longest / denominator

# Evaluate longest_common_substring on every row and store it as a feature column.
data['longest_common_substring'] = data.apply(longest_common_substring, axis=1)

### Gestalt Ratio
This feature uses the gestalt pattern matching algorithm to determine the similarity of two strings.

#### Calculation
The algorithm is implemented in Python's `SequenceMatcher` module. We use `SequenceMatcher` to compute the similarity between question1 and question2.

In [None]:
def gestalt_ratio(row) -> float:
  """Return the SequenceMatcher similarity ratio of the two standardised question strings."""
  matcher = SequenceMatcher(
    a=row['question1_standardised_str'],
    b=row['question2_standardised_str'],
  )
  return matcher.ratio()

# Evaluate gestalt_ratio on every row and store it as a feature column.
data['gestalt_ratio'] = data.apply(gestalt_ratio, axis=1)

### Levenshtein Ratio
This feature is a variable indicating the Levenshtein ratio between question1 and question2. The Levenshtein ratio is a commonly used similarity metric derived from the edit distance between two strings.

In [None]:
def levenshtein_ratio(row) -> float:
  """Return Levenshtein.ratio for the two standardised question strings."""
  pair = (row['question1_standardised_str'], row['question2_standardised_str'])
  return Levenshtein.ratio(*pair)

# Evaluate levenshtein_ratio on every row and store it as a feature column.
data['levenshtein_ratio'] = data.apply(levenshtein_ratio, axis=1)

### Jaro Ratio
This feature is a variable indicating the Jaro similarity between question1 and question2. Jaro similarity is a commonly used metric for measuring how similar two strings are.

In [None]:
def jaro_ratio(row) -> float:
  """Return Levenshtein.jaro for the two standardised question strings."""
  pair = (row['question1_standardised_str'], row['question2_standardised_str'])
  return Levenshtein.jaro(*pair)

# Evaluate jaro_ratio on every row and store it as a feature column.
data['jaro_ratio'] = data.apply(jaro_ratio, axis=1)

### Jaro Winkler Ratio
This feature is a variable indicating the Jaro-Winkler similarity between question1 and question2. Jaro-Winkler is a variant of the Jaro similarity that gives more weight to a common prefix of the two strings.

In [None]:
def jaro_winkler_ratio(row) -> float:
  """Return Levenshtein.jaro_winkler for the two standardised question strings."""
  pair = (row['question1_standardised_str'], row['question2_standardised_str'])
  return Levenshtein.jaro_winkler(*pair)

# Evaluate jaro_winkler_ratio on every row and store it as a feature column.
data['jaro_winkler_ratio'] = data.apply(jaro_winkler_ratio, axis=1)

### Bigram Similarity
This feature is a variable indicating the bigram similarity between question1 and question2. Bigram similarity is related to the longest common substring feature above, but instead of a single longest run it compares all character bigrams of the two standardised strings, which makes it more tolerant of small local differences. The similarity score is between 0 and 1 which makes it suitable for distance-based algorithms such as kNN or SVM.

#### Calculation
We use ngram to compare the two questions with `N=2`

In [None]:
def bigram_similarity(row) -> float:
  """Return ngram.NGram.compare with N=2 for the two standardised question strings."""
  pair = (row['question1_standardised_str'], row['question2_standardised_str'])
  return ngram.NGram.compare(*pair, N=2)

# Evaluate bigram_similarity on every row and store it as a feature column.
data['bigram_similarity'] = data.apply(bigram_similarity, axis=1)

### Trigram Similarity
This feature is a variable indicating the trigram similarity between question1 and question2. Trigram similarity is related to the longest common substring feature above, but instead of a single longest run it compares all character trigrams of the two standardised strings, which makes it more tolerant of small local differences. The similarity score is between 0 and 1 which makes it suitable for distance-based algorithms such as kNN or SVM.

#### Calculation
We use ngram to compare the two questions with `N=3`

In [None]:
def trigram_similarity(row) -> float:
  """Return ngram.NGram.compare with N=3 for the two standardised question strings."""
  pair = (row['question1_standardised_str'], row['question2_standardised_str'])
  return ngram.NGram.compare(*pair, N=3)

# Evaluate trigram_similarity on every row and store it as a feature column.
data['trigram_similarity'] = data.apply(trigram_similarity, axis=1)

### Count Similarity

This feature is a variable indicating the cosine similarity between the two questions that have been vectorised using CountVectorizer.

In [None]:
# Fit the vocabulary on the standardised strings of both question columns.
count_vectorizer = CountVectorizer().fit(
  pd.concat([data['question1_standardised_str'], data['question2_standardised_str']])
)

def count_similarity(row) -> float:
  """Return the cosine similarity of the two questions' CountVectorizer vectors."""
  vector1 = count_vectorizer.transform(pd.Series([row['question1_standardised_str']]))
  vector2 = count_vectorizer.transform(pd.Series([row['question2_standardised_str']]))
  return cosine_similarity(vector1, vector2)[0][0]

# Evaluate count_similarity on every row and store it as a feature column.
data['count_similarity'] = data.apply(count_similarity, axis=1)

### TFIDF Similarity
This feature is a variable indicating the cosine similarity between the two questions that have been vectorised using TfidfVectorizer.

In [None]:
# Fit the vocabulary and IDF weights on the standardised strings of both question columns.
tfidf_vectorizer = TfidfVectorizer().fit(
  pd.concat([data['question1_standardised_str'], data['question2_standardised_str']])
)

def tfidf_similarity(row) -> float:
  """Return the cosine similarity of the two questions' TfidfVectorizer vectors."""
  vector1 = tfidf_vectorizer.transform(pd.Series([row['question1_standardised_str']]))
  vector2 = tfidf_vectorizer.transform(pd.Series([row['question2_standardised_str']]))
  return cosine_similarity(vector1, vector2)[0][0]

# Evaluate tfidf_similarity on every row and store it as a feature column.
data['tfidf_similarity'] = data.apply(tfidf_similarity, axis=1)

### Word2Vec Similarity

This feature is a variable indicating the cosine similarity between the two questions that have been vectorised using Word2Vec.

In [None]:
def sen2vec(question):
  """Average the embedding vectors of the words in *question*.

  Words missing from `embedding` are skipped. Returns None when no word of the
  question has an embedding.
  """
  vectors = [embedding[w] for w in question if w in embedding]
  if not vectors:
    return None
  return np.sum(vectors, axis=0) / len(vectors)

def senDifCos(q1, q2):
  """Return the cosine similarity of two sentence vectors.

  Args:
    q1: vector for the first question, or None if it could not be embedded.
    q2: vector for the second question, or None if it could not be embedded.

  Returns:
    The cosine similarity as a float, or 0 when either vector is None or has
    zero length (the similarity is undefined in both cases).
  """
  # `is None` rather than `== None`: the latter is element-wise on numpy arrays.
  if q1 is None or q2 is None:
    return 0
  norm_product = np.linalg.norm(q1) * np.linalg.norm(q2)
  if norm_product == 0:
    # A zero vector has no direction; avoid a NaN from dividing by zero.
    return 0
  return float(np.dot(q1, q2) / norm_product)

def word2vec_similarity(row) -> float:
  """Return the cosine similarity of the two questions' averaged word vectors."""
  vector1 = sen2vec(row['question1_standardised'])
  vector2 = sen2vec(row['question2_standardised'])
  return senDifCos(vector1, vector2)

# Evaluate word2vec_similarity on every row and store it as a feature column.
data['word2vec_similarity'] = data.apply(word2vec_similarity, axis=1)

### Is Same Category (this is done before dropping the stopwords)

In [None]:
def is_same_category(row):
  """Compare the leading question words of the two tokenised questions.

  Returns:
    1 if both questions start with the same question word,
    -1 if both start with a question word but the words differ,
    0 if either question is empty or does not start with a question word.
  """
  categories = {'what', 'which', 'why', 'where', 'when', 'who', 'how'}
  tokens1 = row['question1_tokenised']
  tokens2 = row['question2_tokenised']

  if min(len(tokens1), len(tokens2)) == 0:
    return 0

  first1 = tokens1[0].lower()
  first2 = tokens2[0].lower()
  if not (first1 in categories and first2 in categories):
    return 0

  if first1 == first2:
    return 1
  return -1

# Evaluate is_same_category on every row and store it as a feature column.
data['is_same_category'] = data.apply(is_same_category, axis=1)

### Shared Word Percentage

In [None]:
def shared_word_percentage(row):
  """Return the share of distinct words that appear in both questions.

  Computed as |words1 & words2| / |words1 | words2| over the standardised
  token lists.

  Args:
    row: mapping with 'question1_standardised' and 'question2_standardised'
      token lists.

  Returns:
    A value in [0, 1]; 0.0 when neither question has any word.
  """
  question1_words = set(row['question1_standardised'])
  question2_words = set(row['question2_standardised'])

  union = question1_words | question2_words
  if not union:
    # No words at all: avoid dividing by zero.
    return 0.0
  return len(question1_words & question2_words) / len(union)

# Evaluate shared_word_percentage on every row and store it as a feature column.
data['shared_word_percentage'] = data.apply(shared_word_percentage, axis=1)

### Word Mover Distance

In [None]:
def distance(q1, q2):
  """Return embedding.wmdistance(q1, q2), with an infinite distance mapped to 0.

  NOTE(review): 0 is also the value for the closest possible pair, so an
  infinite distance becomes indistinguishable from it -- confirm this is intended.
  """
  wmd = embedding.wmdistance(q1, q2)
  return 0 if wmd == np.inf else wmd

# Word mover distance between the standardised token lists of each pair.
data["word_mover_distance"] = data.apply(lambda x: distance(x.question1_standardised, x.question2_standardised), axis = 1)

### Bert Vector

In [None]:
from sentence_transformers import SentenceTransformer, util

In [None]:
# Sentence-embedding model used for the DistilBERT features.
model_name = 'sentence-transformers/quora-distilbert-multilingual'
# model_name = 'sentence-transformers/distilbert-multilingual-nli-stsb-quora-ranking'
model = SentenceTransformer(model_name)

# Encode the raw (unstandardised) question text, one question per call.
data['question1_distilbert_vec'] = data['question1'].apply(model.encode)
data['question2_distilbert_vec'] = data['question2'].apply(model.encode)

### Bert Vector Similarity

In [None]:
# Cosine similarity between the two DistilBERT vectors of each row; each vector
# is wrapped in a list because cosine_similarity is called with 2-D inputs.
data['quora_distilbert_similarity'] = data.apply(lambda row: cosine_similarity([row['question1_distilbert_vec']], [row['question2_distilbert_vec']])[0][0], axis=1)

## Export data

In [None]:
# Persist the DataFrame with all feature columns as a pickle.
# NOTE(review): this is the same file name the pickle-loading cell near the top
# reads, so re-running the notebook overwrites its own input -- confirm intended.
path = 'train_preprocess_final.pkl'
data.to_pickle(path)