In [71]:
#Import Libraries
import numpy as np
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from sklearn import metrics
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.metrics import accuracy_score
from nltk.stem.porter import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.model_selection import GridSearchCV



In [72]:
df = pd.read_csv('train_with_label.txt', sep='\t+')
df.columns = ['instance_id', 'sentence_1', 'sentence_2', 'gold_label']

df2 = pd.read_csv('dev_with_label.txt', sep='\t+')
df2.columns = ['instance_id', 'sentence_1', 'sentence_2', 'gold_label']

df3 = pd.read_csv('test_without_label.txt', sep='\t+')
df3.columns = ['instance_id', 'sentence_1', 'sentence_2']


  return func(*args, **kwargs)


In [73]:
# A list of contractions from http://stackoverflow.com/questions/19790188/expanding-english-language-contractions-in-python
contractions = { 
"ain't": "am not",
"aren't": "are not",
"can't": "cannot",
"can't've": "cannot have",
"'cause": "because",
"could've": "could have",
"couldn't": "could not",
"couldn't've": "could not have",
"didn't": "did not",
"doesn't": "does not",
"don't": "do not",
"hadn't": "had not",
"hadn't've": "had not have",
"hasn't": "has not",
"haven't": "have not",
"he'd": "he would",
"he'd've": "he would have",
"he'll": "he will",
"he's": "he is",
"how'd": "how did",
"how'll": "how will",
"how's": "how is",
"i'd": "i would",
"i'll": "i will",
"i'm": "i am",
"i've": "i have",
"isn't": "is not",
"it'd": "it would",
"it'll": "it will",
"it's": "it is",
"let's": "let us",
"ma'am": "madam",
"mayn't": "may not",
"might've": "might have",
"mightn't": "might not",
"must've": "must have",
"mustn't": "must not",
"needn't": "need not",
"oughtn't": "ought not",
"shan't": "shall not",
"sha'n't": "shall not",
"she'd": "she would",
"she'll": "she will",
"she's": "she is",
"should've": "should have",
"shouldn't": "should not",
"that'd": "that would",
"that's": "that is",
"there'd": "there had",
"there's": "there is",
"they'd": "they would",
"they'll": "they will",
"they're": "they are",
"they've": "they have",
"wasn't": "was not",
"we'd": "we would",
"we'll": "we will",
"we're": "we are",
"we've": "we have",
"weren't": "were not",
"what'll": "what will",
"what're": "what are",
"what's": "what is",
"what've": "what have",
"where'd": "where did",
"where's": "where is",
"who'll": "who will",
"who's": "who is",
"won't": "will not",
"wouldn't": "would not",
"you'd": "you would",
"you'll": "you will",
"you're": "you are"
}


In [74]:
def clean_text(text, remove_stopwords = True):
    '''Remove unwanted characters, stopwords, and format the text to create fewer nulls word embeddings'''
    
    # Convert words to lower case
    text = text.lower()
    
    # Replace contractions with their longer forms 
    if True:
        text = text.split()
        new_text = []
        for word in text:
            if word in contractions:
                new_text.append(contractions[word])
            else:
                new_text.append(word)
        text = " ".join(new_text)
        
    # Format words and remove unwanted characters
    text = re.sub(r'https?:\/\/.*[\r\n]*', '', text, flags=re.MULTILINE)
    text = re.sub(r'\<a href', ' ', text)
    text = re.sub(r'&amp;', '', text) 
    text = re.sub(r'[_"\-;%()|+&=*%.,!?:#$@\[\]/]', ' ', text)
    text = re.sub(r'<br />', ' ', text)
    text = re.sub(r'\'', ' ', text)
    
    # remove stop words
    if remove_stopwords:
        text = text.split()
        stops = set(stopwords.words("english"))
        text = [w for w in text if not w in stops]
        text = " ".join(text)

    # Tokenize each word
    text =  nltk.WordPunctTokenizer().tokenize(text)
        
    return text

In [75]:
df['Sentence_1_Token'] = list(map(clean_text, df.sentence_1))
df['Sentence_2_Token'] = list(map(clean_text, df.sentence_2))

df2['Sentence_1_Token'] = list(map(clean_text, df2.sentence_1))
df2['Sentence_2_Token'] = list(map(clean_text, df2.sentence_2))

df3['Sentence_1_Token'] = list(map(clean_text, df3.sentence_1))
df3['Sentence_2_Token'] = list(map(clean_text, df3.sentence_2))

In [76]:
lemm = nltk.stem.WordNetLemmatizer()
df['Sentence_1_Cleaned'] = list(map(lambda word:list(map(lemm.lemmatize, word)),df.Sentence_1_Token))
df['Sentence_2_Cleaned'] = list(map(lambda word:list(map(lemm.lemmatize, word)),df.Sentence_2_Token))

lemm = nltk.stem.WordNetLemmatizer()
df2['Sentence_1_Cleaned'] = list(map(lambda word:list(map(lemm.lemmatize, word)),df2.Sentence_1_Token))
df2['Sentence_2_Cleaned'] = list(map(lambda word:list(map(lemm.lemmatize, word)),df2.Sentence_2_Token))

df3['Sentence_1_Cleaned'] = list(map(lambda word:list(map(lemm.lemmatize, word)),df3.Sentence_1_Token))
df3['Sentence_2_Cleaned'] = list(map(lambda word:list(map(lemm.lemmatize, word)),df3.Sentence_2_Token))

In [77]:
stemmer = PorterStemmer()
df['Sentence_1_Cleaned'] = list(map(lambda word:list(map(stemmer.stem, word)),df.Sentence_1_Cleaned))
df['Sentence_2_Cleaned'] = list(map(lambda word:list(map(stemmer.stem, word)),df.Sentence_2_Cleaned))

df2['Sentence_1_Cleaned'] = list(map(lambda word:list(map(stemmer.stem, word)),df2.Sentence_1_Cleaned))
df2['Sentence_2_Cleaned'] = list(map(lambda word:list(map(stemmer.stem, word)),df2.Sentence_2_Cleaned))

df3['Sentence_1_Cleaned'] = list(map(lambda word:list(map(stemmer.stem, word)),df3.Sentence_1_Cleaned))
df3['Sentence_2_Cleaned'] = list(map(lambda word:list(map(stemmer.stem, word)),df3.Sentence_2_Cleaned))

In [78]:
df["Sentence_1_Length"] = df["sentence_1"].apply(lambda n: len(n.split()))
df["Sentence_2_Length"] = df["sentence_2"].apply(lambda n: len(n.split()))

df2["Sentence_1_Length"] = df2["sentence_1"].apply(lambda n: len(n.split()))
df2["Sentence_2_Length"] = df2["sentence_2"].apply(lambda n: len(n.split()))

df3["Sentence_1_Length"] = df3["sentence_1"].apply(lambda n: len(n.split()))
df3["Sentence_2_Length"] = df3["sentence_2"].apply(lambda n: len(n.split()))


In [79]:
df['common_wordsCount'] = df[['sentence_1','sentence_2']].apply(lambda x: len(set(x[0].split(" ")).intersection(set(x[1].split(" ")))) , axis=1 )
df2['common_wordsCount'] = df2[['sentence_1','sentence_2']].apply(lambda x: len(set(x[0].split(" ")).intersection(set(x[1].split(" ")))) , axis=1 )
df3['common_wordsCount'] = df3[['sentence_1','sentence_2']].apply(lambda x: len(set(x[0].split(" ")).intersection(set(x[1].split(" ")))) , axis=1 )


In [80]:
def jaccard_similarity(x,y):
  intersection_cardinality = len(set.intersection(*[set(x), set(y)]))
  union_cardinality = len(set.union(*[set(x), set(y)]))
  return intersection_cardinality/float(union_cardinality)


In [81]:
for ind in df.index:
    sentence1 = df['sentence_1'][ind]
    sentence2 = df['sentence_2'][ind]
    df['Jaccard'] = jaccard_similarity(sentence1, sentence2)
    
for ind in df2.index:
    sentence1 = df2['sentence_1'][ind]
    sentence2 = df2['sentence_2'][ind]
    df2['Jaccard'] = jaccard_similarity(sentence1, sentence2)

for ind in df3.index:
    sentence1 = df3['sentence_1'][ind]
    sentence2 = df3['sentence_2'][ind]
    df3['Jaccard'] = jaccard_similarity(sentence1, sentence2)

In [82]:
for ind in df.index:
    reference = [df['Sentence_1_Cleaned'][ind]]
    candidate = df['Sentence_2_Cleaned'][ind]
    df['METEOR'] = nltk.translate.meteor_score.meteor_score(reference, candidate)
    
for ind in df2.index:
    reference = [df2['Sentence_1_Cleaned'][ind]]
    candidate = df2['Sentence_2_Cleaned'][ind]
    df2['METEOR'] = nltk.translate.meteor_score.meteor_score(reference, candidate)
    
for ind in df3.index:
    reference = [df3['Sentence_1_Cleaned'][ind]]
    candidate = df3['Sentence_2_Cleaned'][ind]
    df3['METEOR'] = nltk.translate.meteor_score.meteor_score(reference, candidate)
    

In [83]:
import math
import re
from collections import Counter

WORD = re.compile(r"\w+")


def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])

    sum1 = sum([vec1[x] ** 2 for x in list(vec1.keys())])
    sum2 = sum([vec2[x] ** 2 for x in list(vec2.keys())])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)

    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

In [84]:
for ind in df.index:
    vector1 = text_to_vector(df['sentence_1'][ind])
    vector2 = text_to_vector(df['sentence_2'][ind])
    df['Cosine'] = get_cosine(vector1, vector2)
    
for ind in df2.index:
    vector1 = text_to_vector(df2['sentence_1'][ind])
    vector2 = text_to_vector(df2['sentence_2'][ind])
    df2['Cosine'] = get_cosine(vector1, vector2)
    
for ind in df3.index:
    vector1 = text_to_vector(df3['sentence_1'][ind])
    vector2 = text_to_vector(df3['sentence_2'][ind])
    df3['Cosine'] = get_cosine(vector1, vector2)
    

In [85]:
X = df.drop(columns = ['instance_id', 'sentence_1', 'sentence_2', 'gold_label', 'Sentence_1_Token', 'Sentence_2_Token', 'Sentence_1_Cleaned', 'Sentence_2_Cleaned']).copy()
y = df['gold_label'].values

X_dev = df2.drop(columns = ['instance_id', 'sentence_1', 'sentence_2', 'gold_label', 'Sentence_1_Token', 'Sentence_2_Token', 'Sentence_1_Cleaned', 'Sentence_2_Cleaned']).copy()
y_dev = df2['gold_label'].values

X_final = df3.drop(columns = ['instance_id', 'sentence_1', 'sentence_2', 'Sentence_1_Token', 'Sentence_2_Token', 'Sentence_1_Cleaned', 'Sentence_2_Cleaned']).copy()



In [86]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7, random_state = 0)

In [87]:
from sklearn.preprocessing import StandardScaler  
scaler = StandardScaler()  

scaler.fit(X_train)  
X_train = scaler.transform(X_train)  

X_test = scaler.transform(X_test)
X_dev = scaler.transform(X_dev)
X_final = scaler.transform(X_final)

In [88]:
mlp_gs = MLPClassifier(max_iter=5000)
parameter_space = {
    'hidden_layer_sizes': [(10,30,10),(20,)],
    'activation': ['tanh', 'relu'],
    'solver': ['sgd', 'adam'],
    'alpha': [0.0001, 0.05],
    'learning_rate': ['constant','adaptive'],
}
from sklearn.model_selection import GridSearchCV
clf = GridSearchCV(mlp_gs, parameter_space, n_jobs=-1, cv=5)
clf.fit(X_train, y_train)

predictions = clf.predict(X_test)
acc = accuracy_score(y_test, predictions)
f1 = f1_score(y_test, predictions)
recall = recall_score(y_test, predictions)
precision = precision_score(y_test, predictions)

print("accuracy: {:.3f}, recall: {:.3f}, precision: {:.3f}, f1: {:.3f},".format(acc, recall, precision, f1))

accuracy: 0.891, recall: 0.836, precision: 0.752, f1: 0.792,


In [89]:
y_dev_pred = clf.predict(X_dev)
print(accuracy_score(y_dev, y_dev_pred))
print(f1_score(y_dev, y_dev_pred))

0.8687171792948237
0.7608200455580865


In [90]:
final = clf.predict(X_final)

In [92]:
file = open('ClareSchwarzenberg_test_result.txt', 'a')

for ind in df3.index:
    file.write(df3.iloc[:, 0][ind])
    file.write("\t")
    file.write(str(final[ind]))
    file.write("\n")
file.close()