In [1]:
import os
import time 
import warnings
warnings.filterwarnings('ignore')

import nltk
import re
import pickle

%pylab inline
import matplotlib.pylab as plt
import seaborn as sns
plt.style.use('ggplot')
plt.style.use('seaborn-poster')
sns.set_palette('Set1', 10, desat=0.75)

import numpy as np
import pandas as pd
pd.set_option("display.max_rows", 200)
pd.set_option("display.max_colwidth", 200)

Populating the interactive namespace from numpy and matplotlib


In [2]:
DATA_PATH = 'data/'
USE_SPELLCHECK = False
NUM_CORES = 6

In [3]:
data = pd.read_csv(os.path.join(DATA_PATH, 'train.csv'))#[:10000]
data.columns = ['id', 'qid1', 'qid2', 'q1', 'q2', 'target']
print (data.shape[0], data.dropna().shape[0], data.q1.nunique(), data.q2.nunique())
data.fillna('xxx', inplace=True)

kagg = pd.read_csv(os.path.join(DATA_PATH, 'test.csv'))#[:20000]
kagg.columns = ['test_id', 'q1', 'q2']
print (kagg.shape[0], kagg.dropna().shape[0], kagg.q1.nunique(), kagg.q2.nunique())
kagg.fillna('xxx', inplace=True)

404290 404288 290457 299174
2345796 2345790 2211008 2227399


In [4]:
%%time
def sentence_lower_regex(text):  
    """
    input: text string
    output: string with no capital letters and some common English abbrv replaced
    """
    text = text.split(' ')
    
    # Convert words to lower case and split them
    text = [w.lower() for w in text]

    # Clean the text
    text = " ".join(text)
    text = re.sub(r"[^A-Za-z0-9^,!.\'+-=]", " ", text)
    text = re.sub(r"\'s", " 's ", text)
    text = re.sub(r"\'ve", " have ", text)
    text = re.sub(r"can't", " cannot ", text)
    text = re.sub(r"n't", " not ", text)
    text = re.sub(r"\'re", " are ", text)
    text = re.sub(r"\'d", " would ", text)
    text = re.sub(r"\'ll", " will ", text)
    text = re.sub(r",", " ", text)
    text = re.sub(r"\.", " ", text)
    text = re.sub(r";", " ", text)
    text = re.sub(r"!", " ! ", text)
    text = re.sub(r"\^", " ^ ", text)
    text = re.sub(r"\+", " + ", text)
    text = re.sub(r"\-", " - ", text)
    text = re.sub(r"\=", " = ", text)
    text = re.sub(r"\s{2,}", " ", text)
    
    return text.strip()

def data_lower_regex(data):
    data.q1 = [sentence_lower_regex(q) for q in data.q1]
    data.q2 = [sentence_lower_regex(q) for q in data.q2]
    data.replace('', 'xxx', inplace=True)
    return data

data = data_lower_regex(data)
kagg = data_lower_regex(kagg)

CPU times: user 2min 53s, sys: 1.74 s, total: 2min 55s
Wall time: 2min 56s


In [5]:
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer 
import multiprocessing as mp

def apply_parallel(df, my_func):
    """
    Input: 
        df: pandas DataFrame or pandas Series
        my_func: custom function which will be apllied to df. Must accept pandas DataFrame or Series as input.
    Output: concatenated results of function application on DataFrame. Either pandas Series or pandas DataFrame.
    
    df is splitted by the number of cores and function applied to each part independetly.
    Results are concatenated and returned
    """
    df_splitted = np.array_split(df, NUM_CORES)
    pool = mp.Pool(NUM_CORES)
    result = pd.concat(pool.map(my_func, df_splitted))
    pool.close()
    pool.join()
    return result

def lemmatize_col(pd_series):
    """
    Input: pandas Series with text inside
    Output: pandas Series with text lemmatized with WordNetLemmatizer
    Word order is unchanged
    """
    lemmatizer = WordNetLemmatizer() 
    def lemm_text(text):
        text = ' '.join([lemmatizer.lemmatize(word) for word in text.split(' ')])
        return text
    return pd_series.astype(str).apply(lemm_text)

def stem_col(pd_series):
    """
    Input: pandas Series with text inside
    Output: pandas Series with text stemmed with PorterStemmer
    Word order is unchanged
    """
    stemmer = PorterStemmer()
    def stem_text(text):
        text = " ".join([stemmer.stem(word) for word in text.split(' ')])
        return text
    return pd_series.astype(str).apply(stem_text)


# Identify stopwords for remove_stopwords_col function
stop_words_set = set(nltk.corpus.stopwords.words("english")) \
                -set(['nor', 'both', 'same', 'against', 'between', 'because', 'not', 'won', 'before', 'doesn',
                     'most', 'shouldn', 'mustn', 'needn', 'wouldn', 'couldn', 'mightn', 'wasn', 'aren', 'isn',
                     'why', 'were', 'no', 'hadn', 'didn', 'weren'])

def remove_stopwords_col(pd_series):
    """
    input: pandas Series with text inside
    output: pandas Series with text with stopwords remove
    Word order is unchanged
    """
    def remove_stops_text(text):
        text = ' '.join([word for word in text.split(' ') if not word in stop_words_set])
        return text
    return pd_series.astype(str).apply(remove_stops_text)

def get_tags(pd_series):
    """
    input: pd Series with question texts. Must not contain Nulls
    output: pd Series, with nltk part-of-speech tags
    """
    return pd_series.apply(lambda Q: ' '.join([word_tag[1] for word_tag in nltk.pos_tag(Q.split(' '))]))

In [6]:
%%time
# Create new columns in datasets with postfixes corresponding to data processing type
# E.g. column 'q1' after stemming will be named 'q1_stem'
# Also renames columns 'q1' and 'q2' to 'q1_src' and 'q2_src'

for col in ['q1', 'q2']:
    data[col+'_stem'] = apply_parallel(data[col], stem_col)
    kagg[col+'_stem'] = apply_parallel(kagg[col], stem_col)
    
    data[col+'_nostops'] = apply_parallel(data[col], remove_stopwords_col)
    kagg[col+'_nostops'] = apply_parallel(kagg[col], remove_stopwords_col)
    
    data[col+'_tags'] = apply_parallel(data[col], get_tags)
    kagg[col+'_tags'] = apply_parallel(kagg[col], get_tags)

    data = data.rename(columns={col: col+'_src'})
    kagg = kagg.rename(columns={col: col+'_src'})

# replace empty strings with xxx. Empty strings may oocur if there were only stopwords in question
data.replace('', 'xxx', inplace=True)
kagg.replace('', 'xxx', inplace=True)

CPU times: user 20.2 s, sys: 15.3 s, total: 35.6 s
Wall time: 21min 44s


In [7]:
%%time
def text_intersect_col(df):
    """
    Input: pandas DataFrame. Must have two columns, corresponding to two questions
    Output: pandas Series, consisting of words present in both questions
    Word order is maintained from the first question.
    """
    cols = df.columns
    Q1 = df[cols[0]]
    Q2 = df[cols[1]]
    
    def text_intersect(t1, t2):
        intersection = []
        t1_words = dict((word,0) for word in t1.split(' '))
        for word in t2.split(' '):
            if word in t1_words:
                intersection.append(word)
        return ' '.join(intersection)
    
    df['inter'] = [text_intersect(q1,q2) for q1,q2 in zip(Q1, Q2)]
    return df['inter']

def extra_words(q1, q2, inter):
    """
    input: three strings: two questions and shared words.
    output: string, containing all words, that are present only in question1 and not in question2 and vice versa.
    In terms os sets output = (q1+q2) - (q1*q2) 
    
    ! Output is not ordered ! 
    """
    return ' '.join(list(set(q1.split(' ') + q2.split(' ')) - set(inter.split(' '))))

    
# create a column for intersection for each processing type
postfixes = ['_src', '_stem', '_nostops', '_tags']
for p in postfixes:
    data['inter'+p] = apply_parallel(data[['q1'+p, 'q2'+p]], text_intersect_col)
    kagg['inter'+p] = apply_parallel(kagg[['q1'+p, 'q2'+p]], text_intersect_col)
    
    data['extra'+p] = data.apply(lambda row: extra_words(row['q1'+p], row['q2'+p], row['inter'+p]), axis=1)
    kagg['extra'+p] = kagg.apply(lambda row: extra_words(row['q1'+p], row['q2'+p], row['inter'+p]), axis=1)
    
data.replace('', 'xxx', inplace=True)
kagg.replace('', 'xxx', inplace=True)

CPU times: user 9min 23s, sys: 26.9 s, total: 9min 50s
Wall time: 10min 16s


In [8]:
data.to_csv(os.path.join(DATA_PATH, 'train_preprocessed.csv'), sep=';', index=False)
kagg.to_csv(os.path.join(DATA_PATH, 'test_preprocessed.csv' ), sep=';', index=False)
data[['target']].to_csv(os.path.join(DATA_PATH, 'target.csv'), index=False)

## Construciton of questions and graphs ids

Let's add question ids to the test data. As can be seen from train data, ids are constructed consequentially. First question is assigned to id #1, second to id #2 and so on. If both questions from pair haven't been seen in data, the quesiotn in th field q1 recieves lower id, than in q2 field.

In [11]:
%%time
question_id_dict = dict()
current_id = 1

for _,row in pd.concat([data, kagg], axis=0).iterrows():
    q1 = row.q1
    q2 = row.q2
    
    if q1 not in question_id_dict:
        question_id_dict[q1] = current_id
        current_id+=1
    if q2 not in question_id_dict:
        question_id_dict[q2] = current_id
        current_id+=1
        
data['qid1'] = data.q1.apply(lambda q: question_id_dict[q])
data['qid2'] = data.q2.apply(lambda q: question_id_dict[q])

kagg['qid1'] = kagg.q1.apply(lambda q: question_id_dict[q])
kagg['qid2'] = kagg.q2.apply(lambda q: question_id_dict[q])

CPU times: user 4min 46s, sys: 2.92 s, total: 4min 48s
Wall time: 4min 50s


We'll add graph ids to train data, since later we will split validation data by graph ids (all questions from the same graph must be in the same fold)

In [12]:
import networkx as nx
G = nx.Graph()
G.add_nodes_from(data['qid1'])
G.add_nodes_from(data['qid2'])
edges = list(data[['qid1', 'qid2']].to_records(index=False))
G.add_edges_from(edges)

graph_dict = dict()
graph_id = 1
for i in range(1,len(G)+1):
    graph = G[i]
    if i not in graph_dict:
        graph_dict[i] = graph_id
        
    for qid in graph:
        if qid not in graph_dict:
            graph_dict[qid] = graph_id
    graph_id+=1
    
print (len(set(graph_dict.values())))  
data['graph_id'] = data.qid1.apply(lambda x: graph_dict[x])

data[['qid1', 'qid2', 'graph_id']].to_csv(os.path.join(DATA_PATH, 'train_ids.csv'), index=False)
kagg[['qid1', 'qid2']]            .to_csv(os.path.join(DATA_PATH, 'test_ids.csv' ), index=False)

249770
