# Utils Alba Garcia Romo

Utilities for cleaning text data and extracting features, followed by experiments that test different models on those features.

In [141]:
import sys
import string
import nltk
import sklearn
import numpy as np
from typing import Iterable
import pandas as pd
import scipy
import sklearn
from sklearn import *
import os
import re
import unidecode
from nltk.corpus import stopwords
#import unicodedata

## Cleaning the data

### Some functions using regex

Next, we define some functions using **regex** with the goal of preprocessing data.

In [142]:
# all lower case sentence
def lowercase_sentence(sentence):
    """
    Convert a sentence to lower case.

    Args:
    sentence (str): The input sentence to be put in lower case.

    Returns:
    str: The same sentence with every cased character lowered.
    """
    return sentence.lower()


# remove punctuation from a sentence
def remove_punctuation(sentence):
    """
    Delete every character that is neither a word character nor whitespace.

    Args:
    sentence (str): The input sentence to remove punctuation.

    Returns:
    str: The sentence without punctuation symbols. Letters (accented ones
    included), digits, underscores and whitespace are kept as they are.
    """
    # [^\w\s] matches anything that is not a word character or whitespace (includes '?')
    not_word_or_space = re.compile(r'[^\w\s]')
    return not_word_or_space.sub('', sentence)


# remove accents
def remove_accents(sentence):
    '''
    Strip accents by passing the sentence through unidecode.

    Args:
      sentence (str): The input sentence to remove accent
    Return:
      str : The sentence without accents (e.g. "höw" becomes "how")
    '''
    return unidecode.unidecode(sentence)


# remove non-alpha characters and non-alphanumeric characters (that is, special characters: punctuation marks, spaces, accents)
def remove_special_characters(sentence, numeric = False):
    """
    Replace every special character with a single space.

    Each offending character is substituted one-for-one by a space, so the
    result has the same length as the input and may contain runs of spaces.
    Accented letters are not in a-z/A-Z and are therefore replaced as well.

    Args:
    sentence (str): The input sentence to clean.
    numeric (bool): if true, digits are replaced too (only a-z and A-Z survive);
        if false, digits are kept.

    Returns:
    str: The sentence in which every non-alphanumeric (or non-alphabetic, when
    numeric is true) character has been replaced by a space.
    """
    # letters only when numeric is set, letters and digits otherwise
    allowed = r'[^a-zA-Z]' if numeric else r'[^a-zA-Z0-9]'
    return re.sub(allowed, ' ', sentence)


# remove stop words
def remove_stopwords(sentence):
    """
    Drop stop words from a sentence.

    The sentence is tokenized with nltk.word_tokenize, tokens whose lower-cased
    form is in the custom stop-word set are discarded, and the remaining tokens
    are joined with single spaces.

    Args:
    sentence (str): The input sentence from which stop words will be removed.

    Returns:
    str: The input sentence with stop words removed.
    """
    # Custom stop-word set (used instead of nltk's stopwords.words('english'))
    stop_words = {
        'the', 'and', 'to', 'in', 'of', 'that', 'is', 'it', 'for',
        'on', 'this', 'you', 'be', 'are', 'or', 'from', 'at', 'by', 'we',
        'an', 'not', 'have', 'has', 'but', 'as', 'if', 'so', 'they', 'their',
        'was', 'were', 'some', 'there', 'these', 'those', 'than', 'then', 'been', 'also',
        'much', 'many', 'other',
    }

    kept_tokens = []
    for token in nltk.word_tokenize(sentence):
        if token.lower() in stop_words:
            continue
        kept_tokens.append(token)
    return ' '.join(kept_tokens)


# Normalize spaces - Replace all consecutive whitespace characters in the text string with a single space.
def normalize_spaces(sentence):
    '''
    Collapse every run of whitespace (spaces, tabs, newlines) into one space.

    Leading and trailing whitespace is collapsed too, but not stripped.

    Args:
      sentence (str): The input sentence to normalize
    Returns:
      str: The final sentence normalized
    '''
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', sentence)
    

### Examples

For tokenizing we simply use **nltk.word_tokenize**.

In [52]:
# Tokenize text
# Demo of nltk.word_tokenize: the output below shows that the final '.' becomes
# its own token rather than staying attached to the last word.
sentence = 'This is an example sentence to test the given tokenizer.'
print(f"From => {sentence} -> {nltk.word_tokenize(sentence)}")

From => This is an example sentence to test the given tokenizer. -> ['This', 'is', 'an', 'example', 'sentence', 'to', 'test', 'the', 'given', 'tokenizer', '.']


In [53]:
# Demo sentence containing accents, punctuation, digits and a literal backslash.
# The backslash is written as "\\" because "\ " is not a valid escape sequence
# (newer Python versions warn about it); the resulting string is unchanged.
sentence = "Hello, höw are you doing? I hope everything is \\ going well! Lét's meet at 3:00 PM. (It's raining outside.)"
print("Original sentence:\n\t", sentence)

# Every helper below is applied to the ORIGINAL sentence, not chained,
# so each printed result shows the effect of one function in isolation.

# All lowercase
new_sentence = lowercase_sentence(sentence)
print("\nIn lower case:\n\t", new_sentence)

# Remove punctuation
new_sentence = remove_punctuation(sentence)
print("\nWithout punctuation symbols:\n\t", new_sentence)

# Remove accents
new_sentence = remove_accents(sentence)
print("\nWitout accents:\n\t", new_sentence)

# Remove special characters
new_sentence = remove_special_characters(sentence)
print("\nWithout special characters:\n\t", new_sentence)
new_sentence = remove_special_characters(sentence, numeric = True)
print("\nWithout special characters nor numbers:\n\t", new_sentence)

# Normalize spaces (applied to the output of the previous step, which is full of space runs)
norm_sentence = normalize_spaces(new_sentence)
print("\nNormalized spaces after removing special characters:\n\t", norm_sentence)

# Remove stop words
new_sentence = remove_stopwords(sentence)
print("\nWithout stop words:\n\t", new_sentence)


Original sentence:
	 Hello, höw are you doing? I hope everything is \ going well! Lét's meet at 3:00 PM. (It's raining outside.)

In lower case:
	 hello, höw are you doing? i hope everything is \ going well! lét's meet at 3:00 pm. (it's raining outside.)

Without punctuation symbols:
	 Hello höw are you doing I hope everything is  going well Léts meet at 300 PM Its raining outside

Witout accents:
	 Hello, how are you doing? I hope everything is \ going well! Let's meet at 3:00 PM. (It's raining outside.)

Without special characters:
	 Hello  h w are you doing  I hope everything is   going well  L t s meet at 3 00 PM   It s raining outside  

Without special characters nor numbers:
	 Hello  h w are you doing  I hope everything is   going well  L t s meet at      PM   It s raining outside  

Normalized spaces after removing special characters:
	 Hello h w are you doing I hope everything is going well L t s meet at PM It s raining outside 

Without stop words:
	 Hello , höw doing ? I hop

### BK Tree

Implement a BK Tree to later perform **spelling correction** as part of the preprocessing stage.

In [54]:
visited_nodes = []  # legacy module-level name kept as-is; BKTree tracks visits on the instance


class BKTree:
    """
    Burkhard-Keller tree for nearest-word search under a distance function.

    Each node is a tuple ``(word, children)`` where ``children`` maps a distance
    ``d`` to the child subtree holding words at distance ``d`` from ``word``.

    Args:
      distfn (callable): distfn(a, b) -> int distance between two words. The
        pruning in query() relies on it satisfying the triangle inequality
        (edit distance does).
      words (iterable): vocabulary to index. Repeated words are stored once.
        An empty vocabulary yields an empty tree whose queries return [].
    """

    def __init__(self, distfn, words):
        self.distfn = distfn
        self.visited_nodes = []  # words visited by the most recent query, in visit order

        it = iter(words)
        try:
            root = next(it)
        except StopIteration:
            # Empty vocabulary: keep an explicit "no tree" marker instead of
            # leaking StopIteration out of the constructor.
            self.tree = None
            return
        self.tree = (root, {})

        for word in it:
            self._add_word(self.tree, word)

    def _add_word(self, parent, word):
        """Insert ``word`` below ``parent``, walking down iteratively."""
        node = parent
        while True:
            node_word, children = node
            if word == node_word:
                # Already indexed. A repeated word always follows the same path
                # as its first occurrence, so it ends up exactly here.
                return
            d = self.distfn(word, node_word)
            child = children.get(d)
            if child is None:
                children[d] = (word, {})
                return
            node = child

    def _search_descendants(self, parent, max_distance, distance, query_word):
        """
        Collect ``(distance, word)`` pairs within ``max_distance`` of ``query_word``
        from the subtree rooted at ``parent``.
        """
        node_word, children_dict = parent
        dist_to_node = distance(query_word, node_word)
        self.visited_nodes.append(node_word)
        results = []

        if dist_to_node <= max_distance:
            results.append((dist_to_node, node_word))

        # Only children whose edge label lies in [d - max, d + max] can hold a
        # match (triangle inequality); every other subtree is pruned.
        lowest = max(0, dist_to_node - max_distance)
        highest = dist_to_node + max_distance
        for dist in range(lowest, highest + 1):
            child = children_dict.get(dist)
            if child is not None:
                results.extend(self._search_descendants(child, max_distance, distance, query_word))
        return results

    def query(self, query_word, max_distance):
        """
        Return all indexed words within ``max_distance`` of ``query_word``.

        Returns:
          list: ``(distance, word)`` tuples sorted by distance, then by word.
        """
        self.visited_nodes = []
        if self.tree is None:
            return []
        results = self._search_descendants(self.tree, max_distance, self.distfn, query_word)
        return sorted(results)

In [55]:
import editdistance

# spelling correction

def correct_bktree(sentence, words, max_distance=2):
    '''
    Replace each unknown word of a sentence by its closest vocabulary word.

    The sentence is split on single spaces. Known words and empty tokens
    (produced by consecutive spaces) are kept untouched; any other token is
    replaced by the nearest vocabulary word within ``max_distance`` edits
    (ties are broken alphabetically), or kept when no such word exists.

    Note: a BK-tree over the whole vocabulary is built on every call, which is
    expensive for large vocabularies.

    Args:
      sentence (str): The input sentence to be corrected
      words (iterable of str): The vocabulary that contains all known words
      max_distance (int): Maximum edit distance accepted for a correction
    Returns:
      str: The final sentence corrected
    '''
    vocabulary = list(words)
    if not vocabulary:
        # Nothing to correct against; return the sentence unchanged.
        return sentence

    # Set membership is O(1); `in` on the original list would scan it per token.
    known_words = set(vocabulary)
    bk_tree = BKTree(editdistance.eval, vocabulary)

    correction = []
    for w in sentence.split(" "):
        if not w or w in known_words:
            # Empty tokens must not be "corrected" into a short vocabulary word.
            correction.append(w)
            continue
        w_similar = bk_tree.query(w, max_distance)
        # query() returns results sorted by distance, so the first is the best;
        # with no candidate, the unedited word is kept.
        correction.append(w_similar[0][1] if w_similar else w)
    return ' '.join(correction)

### Example

In [56]:
# define the vocabulary we will use
# define the vocabulary we will use
# NOTE(review): presumably needs the NLTK "words" corpus to be available locally
# (e.g. nltk.download('words')) — confirm in a fresh environment.
words = nltk.corpus.words.words() 

# sentence to correct (contains two deliberate misspellings: "wentt", "antimonarchik")
phrase = "the man wentt to the antimonarchik protest because he did not like the king"

In [38]:
# Show the sentence before and after correction.
# correct_bktree builds a BK-tree over `words` each time it is called.
print("Original:\n", phrase)
print("Corrected:\n", correct_bktree(phrase, words))

Original:
 the man wentt to the antimonarchik protest because he did not like the king
Corrected:
 the man went to the antimonarchic protest because he did not like the king


### Stemming and lemmatization

Another type of preprocessing: **stemming** and **lemmatization**.

**To choose between stemmers**: the choice between PorterStemmer and LancasterStemmer depends on your specific requirements and the characteristics of your text data. If you need a more conservative approach with stems closer to the original words, PorterStemmer may be a better choice. However, if you prefer a more aggressive stemming approach that produces shorter stems, LancasterStemmer might be more suitable. It's often a good idea to experiment with both stemmers on your data to determine which one performs better for your particular task.

In [143]:
# Tokenizers plus the stemmer / lemmatizer classes used by stem() and lemma()
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.stem import PorterStemmer, LancasterStemmer, WordNetLemmatizer

# Shared instances, created once and reused by the helper functions below
porter, lancaster = PorterStemmer(), LancasterStemmer()
wordnet_lemmatizer = WordNetLemmatizer()

In [144]:
# stemming (using both methods) -> remove prefixes and suffixes, may return non existing word 
def stem(sentence, type_porter = True):
    '''
    Stem every token of a sentence.

    The sentence is tokenized with word_tokenize and each token is stemmed.
    Every stem is followed by one space, so the result ends with a trailing space.

    Args:
      sentence (str): The input sentence for stemming
      type_porter (bool): if True we use the Porter method, if false, the Lancaster method
    Returns:
      str: The final sentence stemmed
    '''
    stemmer = porter if type_porter else lancaster
    stemmed_pieces = [stemmer.stem(token) + " " for token in word_tokenize(sentence)]
    return "".join(stemmed_pieces)


# lemmantization (using wordnet_lemmatizer.lemmatize(w)) -> remove endings to return base word (it is a valid word)
def lemma(sentence):
    '''
    Lemmatize every token of a sentence with the WordNet lemmatizer.

    Every lemma is followed by one space, so the result ends with a trailing space.

    Args:
      sentence (str): The input sentence for lemmatization
    Returns:
      str: The final sentence lemmatized
    '''
    # lemmatize() is called without a part-of-speech tag, so NLTK applies its
    # default (noun); verbs such as "going" are therefore left unchanged.
    lemmas = [wordnet_lemmatizer.lemmatize(token) + " " for token in word_tokenize(sentence)]
    return "".join(lemmas)
    

### Examples

In [41]:
# Same demo sentence as in the cleaning examples. The backslash is written as
# "\\" because "\ " is not a valid escape sequence; the string value is unchanged.
sentence = "Hello, höw are you doing? I hope everything is \\ going well! Lét's meet at 3:00 PM. (It's raining outside.)"
print("Original sentence:\n\t", sentence)

# stemming (Porter first, then Lancaster, both on the original sentence)
new_sentence = stem(sentence)
print("\nStemmed sentence (Porter):\n\t", new_sentence)
new_sentence = stem(sentence, type_porter = False)
print("\nStemmed sentence (Lancaster):\n\t", new_sentence)

# lemmatization
new_sentence = lemma(sentence)
print("\nLemmantization sentence:\n\t", new_sentence)

Original sentence:
	 Hello, höw are you doing? I hope everything is \ going well! Lét's meet at 3:00 PM. (It's raining outside.)

Stemmed sentence (Porter):
	 hello , höw are you do ? i hope everyth is \ go well ! lét 's meet at 3:00 pm . ( it 's rain outsid . ) 

Stemmed sentence (Lancaster):
	 hello , höw ar you doing ? i hop everyth is \ going wel ! lét 's meet at 3:00 pm . ( it 's rain outsid . ) 

Lemmantization sentence:
	 Hello , höw are you doing ? I hope everything is \ going well ! Lét 's meet at 3:00 PM . ( It 's raining outside . ) 


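**Observation:** lemmatization changes very little here. `lemma` calls `WordNetLemmatizer.lemmatize` without a part-of-speech tag, so every token is treated as a noun and verb forms such as "going" or "raining" are left untouched.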

## Text features

Extracting **other interesting text features** like the number of words, the number of common words between two sentences, if the first word is the same, if the last word is the same and the number of words that are in the same position between two sentences.

In [145]:
# Number of words in a sentence
def number_words(sentence):
    '''
    Count the tokens of a sentence.

    Tokens come from word_tokenize, so punctuation marks that it splits off
    are counted as well.

    Args:
      sentence (str): The input sentence to count the number of words

    Returns:
      int : The number of tokens in the given text
    '''
    tokens = word_tokenize(sentence)
    return len(tokens)


# Number of common words between two sentences
def number_common_words(s1, s2):
    '''
    Count the distinct tokens shared by two sentences.

    The comparison is case-sensitive and ignores both position and repetition.

    Args:
      s1 (str): First sentence
      s2 (str): Second sentence

    Return:
      int: The number of distinct tokens that appear in both sentences
    '''
    vocabulary_1 = set(word_tokenize(s1))
    vocabulary_2 = set(word_tokenize(s2))

    # set of tokens present in both sentences
    shared = vocabulary_1.intersection(vocabulary_2)
    return len(shared)


# Number of common words in the same position
def number_common_words_2(s1, s2):
    """
    Count the positions at which both sentences hold the same token.

    The comparison is case-insensitive and only covers the positions that
    exist in both sentences (i.e. up to the length of the shorter one).

    Args:
      s1 (str): The first input sentence.
      s2 (str): The second input sentence.

    Returns:
      int: The number of common words at the same position in both sentences.
    """
    first_tokens = word_tokenize(s1)
    second_tokens = word_tokenize(s2)

    # zip stops at the end of the shorter token list
    return sum(
        1
        for left, right in zip(first_tokens, second_tokens)
        if left.lower() == right.lower()
    )


# If the first word of two sentences is equal
def first_word_equal(s1, s2):
    """
    Tell whether two sentences start with the same token (case-insensitive).

    Args:
      s1 (str): First sentence
      s2 (str): Second sentence
    Returns:
      int: 1 if the first tokens of both sentences are equal, 0 otherwise.
      Also 0 when either sentence has no tokens (e.g. an empty string),
      since there is no first word to compare.
    """
    # Tokenize
    tokens1 = word_tokenize(s1)
    tokens2 = word_tokenize(s2)

    # Guard against empty token lists, which would raise IndexError below
    if not tokens1 or not tokens2:
        return 0

    return int(tokens1[0].lower() == tokens2[0].lower())


# If the last word of two sentences is equal
def last_word_equal(s1, s2):
    """
    Tell whether two sentences end with the same token (case-insensitive).

    word_tokenize splits trailing punctuation into its own token, so for
    sentences ending in '.' or '?' the punctuation mark is what gets compared.

    Args:
      s1 (str): First sentence
      s2 (str): Second sentence
    Returns:
      int: 1 if the last tokens of both sentences are equal, 0 otherwise.
      Also 0 when either sentence has no tokens (e.g. an empty string),
      since there is no last word to compare.
    """
    # Tokenize
    tokens1 = word_tokenize(s1)
    tokens2 = word_tokenize(s2)

    # Guard against empty token lists, which would raise IndexError below
    if not tokens1 or not tokens2:
        return 0

    return int(tokens1[-1].lower() == tokens2[-1].lower())

### Examples

In [43]:
# Three demo sentences: 1 and 2 overlap heavily, 3 shares almost nothing with 1.
sentence1 = 'This is an example sentence to test the count of words'
sentence2 = 'This is a second example to test the count of common words'
sentence3 = 'Another different sentence'
print("Original sentence 1:\n\t", sentence1)
print("Original sentence 2:\n\t", sentence2)
print("Original sentence 3:\n\t", sentence3)

# number of words
k1 = number_words(sentence1)
k2 = number_words(sentence2)
print("\nNumber of words of sentence 1:", k1)
print("\nNumber of words of sentence 2:", k2)

# number of common words
k3 = number_common_words(sentence1, sentence2)
print("\nNumber of common words between sentence 1 and sentence 2:", k3)

k4 = number_common_words(sentence1, sentence3)
print("\nNumber of common words between sentence 1 and sentence 3:", k4)

# number of common words in the same position
# (k3 and k4 are reused here and now hold the positional counts)
k3 = number_common_words_2(sentence1, sentence2)
print("\nNumber of common words in the same position between sentence 1 and sentence 2:", k3)

k4 = number_common_words_2(sentence1, sentence3)
print("\nNumber of common words in the same position between sentence 1 and sentence 3:", k4)

# first and last words equal (1 = equal, 0 = different)
print("\nComparing the first words of sentence1 and sentence2:", first_word_equal(sentence1,sentence2))
print("\nComparing the first words of sentence1 and sentence3:", first_word_equal(sentence1,sentence3))

print("\nComparing the last words of sentence1 and sentence2:", last_word_equal(sentence1,sentence2))
print("\nComparing the last words of sentence1 and sentence3:", last_word_equal(sentence1,sentence3))

Original sentence 1:
	 This is an example sentence to test the count of words
Original sentence 2:
	 This is a second example to test the count of common words
Original sentence 3:
	 Another different sentence

Number of words of sentence 1: 11

Number of words of sentence 2: 12

Number of common words between sentence 1 and sentence 2: 9

Number of common words between sentence 1 and sentence 3: 1

Number of common words in the same position between sentence 1 and sentence 2: 7

Number of common words in the same position between sentence 1 and sentence 3: 0

Comparing the first words of sentence1 and sentence2: 1

Comparing the first words of sentence1 and sentence3: 0

Comparing the last words of sentence1 and sentence2: 1

Comparing the last words of sentence1 and sentence3: 0


## Models with preprocessing

We will apply our preprocessing functions to the data and construct the same model as in the simple solution to see if the performance is improved.

In [146]:
import pandas as pd
import matplotlib.pyplot as plt
from sklearn import *
import numpy as np
import sklearn
import pickle
import scipy
import os

import sys

import seaborn as sns
sns.set()

from utils import *

In [147]:
# Folder holding the Quora CSV files, relative to the notebook's working directory
path_folder_quora = '../nlp_deliv1_materials/'

# Train and Validation data
train_df = pd.read_csv(os.path.join(path_folder_quora, "quora_train_data.csv"))
# use this to provide the expected generalization results
# NOTE(review): test_df is loaded here but not used by any cell visible in this
# notebook; the "test" split used below (te_df) is carved out of train_df.
test_df = pd.read_csv(os.path.join(path_folder_quora,"quora_test_data.csv"))

# Two successive 95/5 splits with a fixed seed:
#   train_df -> A_df (95%) + te_df (5%, internal test set)
#   A_df     -> tr_df (95%) + va_df (5%, validation set)
A_df, te_df = sklearn.model_selection.train_test_split(train_df, test_size=0.05, random_state=123)
tr_df, va_df = sklearn.model_selection.train_test_split(A_df, test_size=0.05, random_state=123)

In [148]:
# dividng X and y for each dataset
# For each split, separate the 'is_duplicate' label column (as a NumPy array)
# from the remaining columns (kept as a DataFrame).
y_tr = tr_df['is_duplicate'].values
X_tr_df = tr_df.drop(['is_duplicate'], axis =1)

y_va = va_df['is_duplicate'].values
X_va_df = va_df.drop(['is_duplicate'], axis =1)

y_te = te_df['is_duplicate'].values
X_te_df = te_df.drop(['is_duplicate'], axis =1)

# Sanity check: feature and label shapes must agree row-wise for each split
print(f'Training:\n X train {X_tr_df.shape}\n y train {y_tr.shape}\n {"-"*20}')
print(f'Validation:\n X val {X_va_df.shape}\n y val {y_va.shape}\n {"-"*20}')
print(f'Test:\n X test {X_te_df.shape}\n y test {y_te.shape}\n {"-"*20}')

Training:
 X train (291897, 5)
 y train (291897,)
 --------------------
Validation:
 X val (15363, 5)
 y val (15363,)
 --------------------
Test:
 X test (16172, 5)
 y test (16172,)
 --------------------


In [149]:
# convert input data into list of strings

# Extract both question columns of every split as plain Python lists of strings.
# NOTE(review): cast_list_as_strings is not defined in this notebook; presumably
# it comes from `from utils import *` — confirm.
q1_train =  cast_list_as_strings(list(X_tr_df["question1"]))
q2_train =  cast_list_as_strings(list(X_tr_df["question2"]))

q1_val =  cast_list_as_strings(list(X_va_df["question1"]))
q2_val =  cast_list_as_strings(list(X_va_df["question2"]))

q1_test =  cast_list_as_strings(list(X_te_df["question1"]))
q2_test =  cast_list_as_strings(list(X_te_df["question2"]))

In [63]:
# example

# Show the first training pair (question 1, question 2) and its duplicate label
print(q1_train[0])
print(q2_train[0])
print(y_tr[0])

Is Java or C++ or C the most popular language amongst startups for backend development?
How do I develop a software which will have a Java GUI and a C++ or C backend?
0


### Apply preprocessing functions to data

We apply the more aggressive function 'remove_special_characters', as well as 'remove_stopwords' and 'normalize_spaces', to test the model with the simplified data.

In [150]:
# preprocess data with the function remove_special_characters that is the more agressive function
# also remove stopwords and normalize spaces

def preprocess_data(question_list):
    """
    Run the full cleaning pipeline over a list of questions.

    Each question goes through, in this order: lower-casing, replacement of
    special characters by spaces (digits are kept), stop-word removal and
    whitespace normalization.

    Args:
      question_list (str list): list of string questions
    Returns:
      The list after preprocessing (we apply the preprocessing functions).
    """
    def _clean(question):
        lowered = lowercase_sentence(question)             # lowercase
        alphanumeric = remove_special_characters(lowered)  # remove special characters (all)
        without_stops = remove_stopwords(alphanumeric)     # remove stop words
        return normalize_spaces(without_stops)             # normalize spaces

    return [_clean(question) for question in question_list]

In [151]:
# Clean both question columns of every split with the same pipeline.
# The raw q*_ lists are left untouched; they are reused later for the text features.
q1_train_preprocessed = preprocess_data(q1_train)
q2_train_preprocessed = preprocess_data(q2_train)
q1_val_preprocessed = preprocess_data(q1_val)
q2_val_preprocessed = preprocess_data(q2_val)
q1_test_preprocessed = preprocess_data(q1_test)
q2_test_preprocessed = preprocess_data(q2_test)

In [67]:
# example

# Same first training pair as before, now after preprocessing
print(q1_train_preprocessed[0])
print(q2_train_preprocessed[0])
print(y_tr[0])

java c c most popular language amongst startups backend development
how do i develop a software which will a java gui a c c backend
0


### Use a countvectorizer like in the simple solution

In [68]:
# fit the countvectorizer
# Build the vocabulary from ALL preprocessed training questions (q1 and q2 together).
# Only training data is used for fitting, so validation/test text does not leak in.
all_q_train_preprocessed = q1_train_preprocessed+q2_train_preprocessed

# Unigram bag-of-words counts
count_vectorizer = sklearn.feature_extraction.text.CountVectorizer(ngram_range=(1,1))
count_vectorizer.fit(all_q_train_preprocessed)

In [152]:
# Create a DataFrame with preprocessed columns
# Re-pack the preprocessed question lists into two-column DataFrames
# ('question1', 'question2'), one per split, for get_features_from_df.
X_tr_pre = pd.DataFrame({'question1': q1_train_preprocessed, 'question2': q2_train_preprocessed}, columns=['question1', 'question2'])
#print(X_tr_pre)
X_va_pre = pd.DataFrame({'question1': q1_val_preprocessed, 'question2': q2_val_preprocessed}, columns=['question1', 'question2'])
X_te_pre = pd.DataFrame({'question1': q1_test_preprocessed, 'question2': q2_test_preprocessed}, columns=['question1', 'question2'])

In [153]:
# get features (concatenating q1+q2)
# get features (concatenating q1+q2)
# NOTE(review): get_features_from_df is not defined in this notebook (presumably
# imported from utils). Per the original author's note it casts the columns to
# strings and applies the fitted count_vectorizer — confirm against utils.
X_tr_q1q2_pre = get_features_from_df(X_tr_pre, count_vectorizer) # it converts list as strings and performs count_vectorizer
X_va_q1q2_pre = get_features_from_df(X_va_pre, count_vectorizer)
X_te_q1q2_pre = get_features_from_df(X_te_pre, count_vectorizer)

In [93]:
# Display the training feature matrix summary (type, shape, stored elements)
X_tr_q1q2_pre

<291897x148172 sparse matrix of type '<class 'numpy.int64'>'
	with 4253758 stored elements in Compressed Sparse Row format>

In [72]:
# All three splits must have the same number of columns (the fitted vocabulary size)
print(f'Training:\n X train {X_tr_q1q2_pre.shape}\n {"-"*20}')
print(f'Validation:\n X val {X_va_q1q2_pre.shape}\n{"-"*20}')
print(f'Test:\n X test {X_te_q1q2_pre.shape}\n{"-"*20}')

Training:
 X train (291897, 148172)
 --------------------
Validation:
 X val (15363, 148172)
--------------------
Test:
 X test (16172, 148172)
--------------------


In [76]:
# training a simple model
# training a simple model
# Logistic regression on the count features; the fixed seed keeps the fit reproducible.
logistic = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
logistic.fit(X_tr_q1q2_pre, y_tr)

In [129]:
# train metrics
# train metrics
# NOTE(review): evaluate_model is not defined in this notebook (presumably from
# utils); the output below shows it returns a dict of metric name -> value.
train_metrics = evaluate_model(X_tr_q1q2_pre, y_tr, model=logistic, display=False)
train_metrics

{'accuracy': 0.8121803238813691,
 'roc_auc': 0.7844546064505284,
 'precision': 0.7827302825427779,
 'recall': 0.6789375365942063,
 'f1': 0.7271487582740258}

In [130]:
# Validation results
# Validation results (same fitted model, validation split)
validation_metrics = evaluate_model(X_va_q1q2_pre, y_va, model=logistic, display=False)
validation_metrics

{'accuracy': 0.7461433313805897,
 'roc_auc': 0.7163589696383614,
 'precision': 0.6744323790720632,
 'recall': 0.6027880712899242,
 'f1': 0.636600819977637}

In [131]:
# Test results
# Test results (same fitted model, internal test split te_df carved from the training CSV)
test_metrics  = evaluate_model(X_te_q1q2_pre, y_te, model=logistic, display=False)
test_metrics

{'accuracy': 0.7556270096463023,
 'roc_auc': 0.7260376912293912,
 'precision': 0.6944181646168401,
 'recall': 0.6109538871316798,
 'f1': 0.6500177116542686}

**Conclusion**: after preprocessing the data, we obtain a result very similar to the simple solution. We will still use the preprocessing, both to obtain more homogeneous data and because other models need it as well.

### Save preprocessed dataset

In [87]:
# save dataset with correct features:

# Save as model_name+(X/y)+(tr/va/te) (depending if its dataset or labels and what type they are)

# save model
# Persist the fitted model together with the exact feature matrices and labels
# of every split. Files are named <model_name>[_(X|y)_(tr|va|te)].pkl.
simple_pre_dir = "model/simple_solution_pre"
os.makedirs(simple_pre_dir, exist_ok=True)  # also creates "model"; no error if it already exists

simple_pre_artifacts = {
    "simple_model_pre.pkl": logistic,
    "simple_model_pre_X_tr.pkl": X_tr_q1q2_pre,
    "simple_model_pre_X_va.pkl": X_va_q1q2_pre,
    "simple_model_pre_X_te.pkl": X_te_q1q2_pre,
    "simple_model_pre_y_tr.pkl": y_tr,
    "simple_model_pre_y_va.pkl": y_va,
    "simple_model_pre_y_te.pkl": y_te,
}

for file_name, artifact in simple_pre_artifacts.items():
    with open(f"{simple_pre_dir}/{file_name}", 'wb') as f:
        pickle.dump(artifact, f)

## Models with text features

We will construct a more extensive dataframe with the basic text features defined above to see if the performance from the simple solution improves.

In [154]:
def build_numeric_features(q1_list, q2_list):
    """
    Compute the hand-crafted text features for every question pair.

    Args:
      q1_list (str list): list of string questions
      q2_list (str list): list of string questions
    Returns:
      A data frame with one row per pair and the columns num_words_1,
      num_words_2, num_common_words, num_common_words_2, first_word, last_word.
    """
    question_pairs = list(zip(q1_list, q2_list))

    feature_columns = {
        # number of words in each question
        'num_words_1': [number_words(question) for question in q1_list],
        'num_words_2': [number_words(question) for question in q2_list],
        # number of common words
        'num_common_words': [number_common_words(first, second) for first, second in question_pairs],
        # number of common words in the same position
        'num_common_words_2': [number_common_words_2(first, second) for first, second in question_pairs],
        # first word equal
        'first_word': [first_word_equal(first, second) for first, second in question_pairs],
        # last word equal
        'last_word': [last_word_equal(first, second) for first, second in question_pairs],
    }

    # build dataframe with features, fixing the column order explicitly
    column_order = ['num_words_1', 'num_words_2', 'num_common_words', 'num_common_words_2',
                    'first_word', 'last_word']
    return pd.DataFrame(feature_columns, columns=column_order)

In [155]:
# Text features for the training split, computed on the RAW (unpreprocessed) questions
X_tr_features = build_numeric_features(q1_train, q2_train)
X_tr_features

Unnamed: 0,num_words_1,num_words_2,num_common_words,num_common_words_2,first_word,last_word
0,16,19,6,0,0,1
1,16,27,2,0,0,1
2,8,8,7,7,1,1
3,16,11,6,7,1,0
4,5,8,2,1,1,1
...,...,...,...,...,...,...
291892,13,21,9,3,1,1
291893,4,5,4,1,1,1
291894,16,23,5,1,0,1
291895,17,16,2,0,0,1


In [125]:
# save csv 
# Written to the current working directory, without the index column
X_tr_features.to_csv('X_tr_features.csv', index=False)

In [156]:
# combine countvectorizer data with features
# Append the six text-feature columns to the right of the bag-of-words counts
X_tr_features_sparse = scipy.sparse.hstack([X_tr_q1q2_pre, scipy.sparse.csr_matrix(X_tr_features)])

In [113]:
# Same model and seed as `logistic`, now trained on counts + text features
logistic2 = sklearn.linear_model.LogisticRegression(solver="liblinear", random_state=123)
logistic2.fit(X_tr_features_sparse, y_tr)

In [132]:
# train metrics
# train metrics (logistic2 on the training split)
train_metrics2 = evaluate_model(X_tr_features_sparse, y_tr, model=logistic2, display=False)
train_metrics2

{'accuracy': 0.8380627413094345,
 'roc_auc': 0.8221126055607623,
 'precision': 0.7913817085893127,
 'recall': 0.7614104220299445,
 'f1': 0.7761068192475475}

In [157]:
# Text features for the validation split, computed on the raw questions
X_va_features = build_numeric_features(q1_val, q2_val)
X_va_features

Unnamed: 0,num_words_1,num_words_2,num_common_words,num_common_words_2,first_word,last_word
0,11,13,6,6,0,1
1,10,6,3,2,0,1
2,7,20,5,0,0,1
3,4,4,3,3,1,1
4,12,11,5,1,1,1
...,...,...,...,...,...,...
15358,11,11,10,10,1,1
15359,16,8,5,0,0,1
15360,6,5,4,3,1,1
15361,8,18,2,0,0,1


In [126]:
# save csv 
# Written to the current working directory, without the index column
X_va_features.to_csv('X_va_features.csv', index=False)

In [158]:
# combine countvectorizer data with features
# Append the six text-feature columns to the right of the bag-of-words counts
X_va_features_sparse = scipy.sparse.hstack([X_va_q1q2_pre, scipy.sparse.csr_matrix(X_va_features)])

# evaluate validation
# Stored under its own name so the training metrics in train_metrics2 are not overwritten.
validation_metrics2 = evaluate_model(X_va_features_sparse, y_va, model=logistic2, display=False)
validation_metrics2

{'accuracy': 0.7801861615569876,
 'roc_auc': 0.7579938330651488,
 'precision': 0.7143391988019469,
 'recall': 0.6733721545791425,
 'f1': 0.6932509764737943}

In [159]:
# Text features for the internal test split, computed on the raw questions
X_te_features = build_numeric_features(q1_test, q2_test)
X_te_features

Unnamed: 0,num_words_1,num_words_2,num_common_words,num_common_words_2,first_word,last_word
0,9,12,9,4,1,1
1,15,10,4,0,0,1
2,20,17,6,0,0,1
3,10,9,6,1,1,1
4,12,11,3,0,0,1
...,...,...,...,...,...,...
16167,14,28,3,0,0,1
16168,14,32,2,0,0,1
16169,9,10,3,2,1,1
16170,7,11,6,1,1,1


In [127]:
# save csv 
# Written to the current working directory, without the index column
X_te_features.to_csv('X_te_features.csv', index=False)

In [160]:
# combine countvectorizer data with features
# Append the six text-feature columns to the right of the bag-of-words counts
X_te_features_sparse = scipy.sparse.hstack([X_te_q1q2_pre, scipy.sparse.csr_matrix(X_te_features)])

# evaluate test
# Stored under its own name so the training metrics in train_metrics2 are not overwritten.
test_metrics2 = evaluate_model(X_te_features_sparse, y_te, model=logistic2, display=False)
test_metrics2

{'accuracy': 0.785864457086322,
 'roc_auc': 0.7652421003172968,
 'precision': 0.7237073513893775,
 'recall': 0.685034126852006,
 'f1': 0.7038399042161978}

**Conclusion:** adding the text features improves performance by **about 3%-5%** — a modest but real gain. We decide to keep the dataset with features and test it with other models.

### Save features dataset

In [123]:
# save dataset with correct features:

# Save as model_name+(X/y)+(tr/va/te) (depending if its dataset or labels and what type they are)

# save model
# Persist the fitted model together with the exact feature matrices and labels
# of every split. Files are named <model_name>[_(X|y)_(tr|va|te)].pkl.
features_dir = "model/features_solution"
os.makedirs(features_dir, exist_ok=True)  # also creates "model"; no error if it already exists

features_artifacts = {
    "features_model.pkl": logistic2,
    "features_model_X_tr.pkl": X_tr_features_sparse,
    "features_model_X_va.pkl": X_va_features_sparse,
    "features_model_X_te.pkl": X_te_features_sparse,
    "features_model_y_tr.pkl": y_tr,
    "features_model_y_va.pkl": y_va,
    "features_model_y_te.pkl": y_te,
}

for file_name, artifact in features_artifacts.items():
    with open(f"{features_dir}/{file_name}", 'wb') as f:
        pickle.dump(artifact, f)

## Test different classifiers

Now that we know that the performance of the basic model improves when adding features, we want to test more complex classifiers to see if they produce better results.

In [161]:
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Define classifiers
def train_models(X_train, y_train):
    """
    Fit a random forest, an XGBoost classifier and an SVC on the given data.

    Args:
      X_train: training feature matrix accepted by the classifiers' fit()
      y_train: training labels, one per row of X_train
    Returns:
      list: the fitted classifiers, in the order
      [RandomForestClassifier, XGBClassifier, SVC].
    """
    # All models share the same seed so runs are reproducible
    classifiers = [
        RandomForestClassifier(max_depth = 5, random_state=123),
        XGBClassifier(random_state=123),
        SVC(random_state=123),
    ]

    trained_clf = []
    for clf in classifiers:
        # Fit on the data passed in, not on notebook-level globals
        clf.fit(X_train, y_train)
        trained_clf.append(clf)

    return trained_clf

In [None]:
# Train the classifiers on the training split (counts + text features) ...
trained_clf = train_models(X_tr_features_sparse, y_tr)

# ... and evaluate each of them on the internal test split
for clf in trained_clf:
    metrics = evaluate_model(X_te_features_sparse, y_te, model=clf, display=False)
    print(metrics)