# Research Skills: Data Processing Advanced
## Group 15

# Final Solution:
## Import Data

In [1]:
import numpy as np
import pandas as pd
import scipy
import nltk
import sklearn.neighbors
import scipy.spatial.distance
from sklearn import feature_extraction
from sklearn.feature_extraction.text import CountVectorizer 
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import pairwise_distances
from sklearn.metrics.pairwise import euclidean_distances
from nltk.corpus import stopwords
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.tokenize import RegexpTokenizer
from nltk.util import ngrams 

In [2]:
with open("competition_descriptions.txt") as f:
    content = f.readlines()

## Preprocessing: Punctuation & Stopwords

In [3]:
def getWords(sentences):
    from nltk.corpus import stopwords
    from nltk.tokenize import word_tokenize 
    from nltk.tokenize import RegexpTokenizer
    from nltk.stem import WordNetLemmatizer     

    stop_words = set(stopwords.words('english')) - {'he', 'she', 'her', 'his', 'they', 'not'}
    filtered = []

    for each in range(len(sentences)):
        sentences[each] = sentences[each].replace("[comma]", "").strip()
        filtered_sentence = []
        comment_text = sentences[each]
        # Remove punctuation & non-alphabetic characters
        tokenizer = RegexpTokenizer(r'\w+')
        # Lemmatization
        lemmatizer = WordNetLemmatizer() 
        # Remove stop-words
        word_tokens = tokenizer.tokenize(comment_text.lower())
        for w in word_tokens: 
            if w not in stop_words: 
                w = lemmatizer.lemmatize(w)
                filtered_sentence.append(w)
        filtered.append(filtered_sentence)
    return filtered

words = getWords(content)
tokens = []
for each in range(len(words)):
    tokens.append(nltk.pos_tag(words[each]))

## Preprocessing: Tokenization & Lemmatization

In [4]:
# Get parts of speech tags
words = getWords(content)
tokens = []
for each in range(len(words)):
    tokens.append(nltk.pos_tag(words[each]))

In [5]:
# Reformat PoS tags to use with the Lemmatizer
lemmatizer = WordNetLemmatizer() 

def getWordnetPos(tags):
    for each in range(len(tags)):
        #print(tags[each])
        for i in range(len(tags[each])):
            if tags[each][i][1].startswith('J'):
                tags[each][i] += (str(wordnet.ADJ), )
            #print(tags[each])
            elif tags[each][i][1].startswith('V'):
                tags[each][i] += (str(wordnet.VERB), )
            #print(tags[each])
            elif tags[each][i][1].startswith('N'):
                tags[each][i] += (str(wordnet.NOUN), )
            #print(tags[each])
            elif tags[each][i][1].startswith('R'):
                tags[each][i] += (str(wordnet.ADV), )
            #print(tags[each])
            else:
                tags[each][i] += (str(""), )
            #print(tags[each])
            
    return tags

tokenized_words = getWordnetPos(tokens)

In [6]:
# Generating lemmatized sentences

lemmatized_sentences = []
lemmatized_words = []

for each in range(len(tokenized_words)):
    lemmatized_words = []
    for i in range(len(tokenized_words[each])):
        if tokenized_words[each][i][2] in ["a", "n", "v", "r"]: 
            if tokenized_words[each][i][0] != lemmatizer.lemmatize(tokenized_words[each][i][0], tokenized_words[each][i][2]):
                print(tokenized_words[each][i][0], "->", lemmatizer.lemmatize(tokenized_words[each][i][0], tokenized_words[each][i][2]), "| position:", tokenized_words[each][i][2])                
                lemmatized_words.append(lemmatizer.lemmatize(tokenized_words[each][i][0], tokenized_words[each][i][2]))
            else:
                lemmatized_words.append(tokenized_words[each][i][0])
        else:
            lemmatized_words.append(tokenized_words[each][i][0])
    lemmatized_sentences.append(lemmatized_words)

looking -> look | position: v
made -> make | position: v
combed -> comb | position: v
kept -> keep | position: v
wearing -> wear | position: v
pulled -> pull | position: v
looking -> look | position: v
looking -> look | position: v
groomed -> groom | position: v
left -> leave | position: v
bit -> bite | position: v
pulled -> pull | position: v
colored -> color | position: v
wearing -> wear | position: v
shaped -> shape | position: v
sized -> size | position: v
seems -> seem | position: v
left -> leave | position: v
looking -> look | position: v
organized -> organize | position: v
curlier -> curly | position: a
observes -> observe | position: v
looking -> look | position: v
dating -> date | position: v
peaked -> peak | position: v
sized -> size | position: v
leaning -> lean | position: v
appears -> appear | position: v
colored -> color | position: v
requires -> require | position: v
piercing -> pierce | position: v
tied -> tie | position: v
combed -> comb | position: v
piercing -> pierc

marking -> mark | position: v
appears -> appear | position: v
appears -> appear | position: v
older -> old | position: a
appears -> appear | position: v
piercing -> pierce | position: v
saying -> say | position: v
looking -> look | position: v
pulled -> pull | position: v
wearing -> wear | position: v
looking -> look | position: v
pulled -> pull | position: v
groomed -> groom | position: v
shaped -> shape | position: v
aged -> age | position: v
disheveled -> dishevel | position: v
looking -> look | position: v
balding -> bald | position: v
shaved -> shave | position: v
wearing -> wear | position: v
pulled -> pull | position: v
pressed -> press | position: v
skinned -> skin | position: v
breasted -> breast | position: v
lacking -> lack | position: v
wearing -> wear | position: v
lacking -> lack | position: v
proportioned -> proportion | position: v
slanted -> slant | position: v
styled -> style | position: v
built -> build | position: v
rose -> rise | position: v
looking -> look | posit

In [7]:
# Test out 
for x in range(3):
    i = np.random.randint(0, len(content))
    print("Original:\n", content[i])
    print("Optimized:\n", " ".join(str(word) for word in words[i]))    
    print("Lemmatized:\n", " ".join(str(lemmatized_word) for lemmatized_word in lemmatized_sentences[i]))
    print()

Original:
 This person is a Caucasian male with brown hair and brown eyes.
Optimized:
 person caucasian male brown hair brown eye
Lemmatized:
 person caucasian male brown hair brown eye

Original:
 A block face with a semi large nose. Short hair that is combed over. Somewhat large forehead.
Optimized:
 block face semi large nose short hair combed somewhat large forehead
Lemmatized:
 block face semi large nose short hair comb somewhat large forehead

Original:
 this person appears to be a female in their late twenties. they have brown hair in a ponytail the length is somewhat hidden but looks to be a bit longer than shoulder length.  she has dark eyes brown hair somewhat thick arched eyebrows that are close to her eyes a slightly wide nose and average lips.  she has her ears pierced. her skin is fair and blemish-free.
Optimized:
 person appears female late twenty they brown hair ponytail length somewhat hidden look bit longer shoulder length she dark eye brown hair somewhat thick arched

## Sparse Matrix Creation

In [8]:
# Generate overall word frequency distribution
input_sentence = lemmatized_sentences

lem_words = []
for i in range(len(input_sentence)):
    for each in input_sentence[i]:
        lem_words.append(each)

lem_freq_dist = nltk.FreqDist(lem_words)

In [9]:
def word_index(text):
    word_index = {}
    index = 0
    for line in text:
        for word in line:
            if not word in word_index:
                word_index[word] = index
                index += 1
    return word_index

def word_matrix_dan(text):
    word_idx = word_index(text)
    word_matrix = scipy.sparse.dok_matrix((len(text), len(word_idx)))
    max_val = max(lem_freq_dist.values())
    
    doc_num = 0
    for line in text:
        for word in line:
            word_num = word_idx[word]
            # Original function:
            # word_matrix[doc_num, word_num] += 1 
            # Our replacement:
            word_matrix[doc_num, word_num] += 1 * (np.log(max_val+1) - np.log(lem_freq_dist[word]+1))
            # Rationale - we are looking for a weighing function to adjust the scale of each dictionary entry
            # according to the word's overall frequency across the entire corpus. After testing TFID, which
            # includes similar functionality, and achieving unsatisfactory results, we built our own formula. 
            # Additional formulae tested:
            #word_matrix[doc_num, word_num] += 1 * (np.log(max_val) - np.log(freq_dist[word]))
            #word_matrix[doc_num, word_num] = word_matrix[doc_num, word_num] / freq_dist[word]
            #word_matrix[doc_num, word_num] = word_matrix[doc_num, word_num] * (np.log(max_val+1) - np.log(freq_dist[word]+1))
        doc_num += 1        
    return word_matrix

doc_matrix = word_matrix_dan(input_sentence)

# Bigram Threshold: 5.83
final_matrix = doc_matrix.toarray()
final_matrix[doc_matrix.toarray() > 5.83] = 0

In [10]:
# SVD Truncation

from sklearn.decomposition import TruncatedSVD
svd = TruncatedSVD(n_components = 250, n_iter = 7, random_state = 42)

tfidf_svd = svd.fit_transform(final_matrix)

## Metrics: Cosine Similarity

In [11]:
# Cosine Similarity

similarity = sklearn.metrics.pairwise.cosine_similarity(tfidf_svd)
similarity = np.around(similarity, decimals=4)
similarity[similarity == 1] = 0

simi_list = []
for i in range(772):
    a, b, c = np.argsort(similarity[i])[-3:]
    simi_list.append(a)
    simi_list.append(b)
    simi_list.append(c)

 ## JACCARD SIMILARITY

In [12]:
def get_jaccard_sim(str1, str2): 
    a = set(str1) 
    b = set(str2)
    c = a.intersection(b)
    return float(len(c)) / (len(a) + len(b) - len(c))

In [15]:
jacc_total=[]
for i in range(len(lemmatized_sentences)):
    jacc_score=[]
    for j in range(len(lemmatized_sentences)):
            jacc_score.append(get_jaccard_sim(lemmatized_sentences[i], lemmatized_sentences[j]))
    jacc_total.append(jacc_score)

#jacc_total can be passed to the output
similarity= np.array(jacc_total)

similarity = np.around(similarity, decimals=4)
similarity[similarity == 1] = 0

simi_list = []
for i in range(772):
    a, b, c = np.argsort(similarity[i])[-3:]
    simi_list.append(a)
    simi_list.append(b)
    simi_list.append(c)

## CORRELATION DISTANCE

In [22]:
from scipy.spatial import distance
doc_corr_matrix = word_matrix_dan(lemmatized_sentences)
doc_corr_matrix= doc_corr_matrix.toarray()
corr_total=[]
for i in range(len(doc_corr_matrix)):
    corr_score=[]
    for j in range(len(doc_corr_matrix)):
            corr_score.append(distance.correlation(doc_corr_matrix[i], doc_corr_matrix[j]))
    corr_total.append(corr_score)
    
similarity= np.array(corr_total)

similarity = np.around(similarity, decimals=4)
similarity[similarity == 1] = 0

simi_list = []
for i in range(772):
    a, b, c = np.argsort(similarity[i])[-3:]
    simi_list.append(a)
    simi_list.append(b)
    simi_list.append(c)

## Output

In [20]:
# Generating submission file

simi_list = np.array(simi_list).reshape(772, 3)
baseline = np.arange(0, 772).reshape(772, 1)
submission_matrix = np.hstack((baseline, simi_list))
np.savetxt("submission.txt", submission_matrix, fmt='%s', delimiter=",")

# Alternative Approaches: 

## Preprocessing: Generating N-Grams

In [None]:
sentences = lemmatized_sentences

bigrams = []
for sentence in sentences:
    sequence = word_tokenize(" ".join([e for e in sentence])) 
    bigrams.extend(list(ngrams(sequence, 2)))

freq_dist = nltk.FreqDist(bigrams)
prob_dist = nltk.MLEProbDist(freq_dist)
number_of_bigrams = freq_dist.N()

In [None]:
import matplotlib.pyplot as plt
from matplotlib.pyplot import figure

# Frequency distribution of top 20 bigrams
plt.figure(figsize=(14,10))
freq_dist.plot(20,cumulative=False)
plt.show()

## Bag of Words Approach

In [34]:
words = []
for each in range(len(lemmatized_sentences)):
    words.append(" ".join([e for e in lemmatized_sentences[each]]))
    
vectorizer = CountVectorizer()
text_features = vectorizer.fit_transform(words).todense()
print(vectorizer.vocabulary_)

{'beautiful': 112, 'eyebrow': 357, 'nice': 678, 'hair': 455, 'cut': 260, 'his': 497, 'complexion': 225, 'clean': 203, 'young': 1100, 'look': 591, 'ear': 317, 'big': 120, 'care': 173, 'although': 61, 'people': 725, 'probably': 773, 'make': 604, 'fun': 415, 'unfortunately': 1036, 'they': 986, 'short': 869, 'dark': 263, 'light': 575, 'brown': 155, 'olive': 698, 'skin': 887, 'thick': 987, 'red': 807, 'lip': 582, 'patchy': 721, 'facial': 365, 'wear': 1065, 'large': 554, 'frame': 400, 'sign': 876, 'acne': 45, 'asian': 85, 'guy': 453, '20': 24, 'he': 471, 'tone': 1007, 'monolid': 644, 'eye': 356, 'small': 906, 'body': 140, 'build': 160, 'like': 579, 'would': 1091, 'straight': 946, 'person': 730, 'crew': 248, 'dirty': 291, 'blonde': 133, 'squarish': 931, 'face': 363, 'round': 830, 'jaw': 535, 'skinny': 889, 'average': 97, 'weight': 1068, 'long': 588, 'comb': 218, 'emo': 334, 'beard': 111, 'scraggily': 844, 'not': 683, 'well': 1070, 'keep': 543, 'she': 862, 'slightly': 898, 'black': 125, 'shirt

In [39]:
# Create predictions arrays
preds_bow = []
preds_bow_final = []
for each in range(len(text_features)):
    preds_bow.append([each])
    preds_bow_final.append([each])

In [40]:
# Append reformatted pairwise distances to each element
for each in range(len(text_features)):
    for f in text_features:
        preds_bow[each].append(float(str(euclidean_distances(text_features[each], f).flatten()).strip("[").strip("]")))

In [41]:
# Remove all but the 3 closest elements
for each in range(len(preds_bow)):
    for i in range(3):
        preds_bow[each][np.argmin(preds_bow[each])] = 100000
        preds_bow_final[each].append(np.argmin(preds_bow[each]))

In [59]:
# Let's have a look at 5 random preds
list(preds_bow_final[i] for i in [np.random.randint(len(preds_bow_final)) for n in range(5)])
# Ready for output

[[660, 388, 590, 21],
 [741, 432, 256, 727],
 [463, 335, 388, 614],
 [230, 578, 674, 160],
 [333, 190, 266, 442]]

## Metrics: TFID

In [69]:
words_tfid = []
for each in range(len(lemmatized_sentences)):
    words_tfid.append(" ".join([e for e in lemmatized_sentences[each]]))

In [70]:
tfidf = TfidfVectorizer().fit_transform(words_tfid)
pairwise_similarity = tfidf * tfidf.T

pw_array = pairwise_similarity.toarray()

In [74]:
# Create predictions array
preds_tfid = []
for each in range(len(pw_array)):
    preds_tfid.append([each])

In [75]:
for each in range(len(pw_array)):
    for i in range(3):
        pw_array[each][np.argmax(pw_array[each])] = 0
    #print("#", np.argmax(pw_array[each]), np.round(pw_array[each][np.argmax(pw_array[each])], 3))
        preds_tfid[each].append(np.argmax(pw_array[each]))

In [77]:
# Let's have a look at 5 random preds
list(preds_tfid[i] for i in [np.random.randint(len(preds_tfid)) for n in range(5)])
# Ready for output

[[727, 476, 368, 127],
 [264, 413, 762, 739],
 [550, 282, 565, 263],
 [592, 51, 285, 238],
 [347, 395, 205, 620]]

## Bayes Approach

In [79]:
# Import coco_val data 
with open("coco_val.txt") as f:
    coco = f.readlines()
    
for each in range(len(coco)):
    coco[each] = coco[each].strip("\n")

In [80]:
from sklearn.feature_extraction.text import CountVectorizer
count_vect = CountVectorizer()
X_train_counts = count_vect.fit_transform(coco)
X_train_counts.shape

(25000, 7180)

In [81]:
bayes = []
c = 0
for each in range(0, X_train_counts.shape[0], 5):
    for i in range(5):
        bayes.append(c)
    c += 1

In [82]:
from sklearn.feature_extraction.text import TfidfTransformer
tf_transformer = TfidfTransformer(use_idf=False).fit(X_train_counts)
X_train_tf = tf_transformer.transform(X_train_counts)
X_train_tf.shape

(25000, 7180)

In [83]:
tfidf_transformer = TfidfTransformer()
X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)
X_train_tfidf.shape

(25000, 7180)

In [84]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB().fit(X_train_tfidf, bayes)

In [89]:
docs_new = coco
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [86]:
len(predicted[predicted == bayes]) / len(predicted)

0.90268

In [93]:
docs_new = sentences
X_new_counts = count_vect.transform(docs_new)
X_new_tfidf = tfidf_transformer.transform(X_new_counts)
predicted = clf.predict(X_new_tfidf)

In [94]:
predicted

array([2378, 1775, 4248, 4248, 1765,  760, 3434, 3020, 1480,  760, 3434,
        585, 2307, 2674, 2373, 2193,   80,  585, 3409,  743, 4248, 4181,
        119, 1942, 2674, 2307, 1942, 2373, 2373, 1765, 1942, 3219, 1942,
        623, 2373, 3409, 4248,  760,  760, 2373, 3409, 1942,  760, 4248,
       1410,  220,  760, 2373,  760, 4248, 1410, 3409, 1942, 4399, 4865,
       4248, 3409, 2373,  743, 2674, 1775,  760, 3020,  585, 1765, 1410,
       4352,  760, 2625, 3344, 4671, 1942, 1765,  585, 4248, 3344, 3020,
       1410, 1410,  760, 2373, 3344, 3434, 2865,  460, 2896, 3024, 4166,
       3409, 4248, 3434, 1775, 3020, 3434, 2373, 4248,  760, 3480, 3020,
       2094, 2373, 2294, 2373, 3020, 2611, 2289, 1410, 2373,  760, 1410,
        760,   80, 1774, 2373, 2082, 3480, 1807, 4859, 2373,  760, 2373,
       2373, 1765, 3480, 2611, 3431, 2373, 2373, 2373, 1647,  585,  760,
        760, 3219, 4865,  583,   80, 3409, 3434, 3020, 2674, 1765,  760,
        760, 2373, 1410,  743,  760, 3434,  760, 30