In [323]:
import numpy as np
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation
import pandas as pd

# Number of train/test files; the loaders below read files named 0.txt .. (N-1).txt
num_train_file = 10000
num_test_file = 2000

# Helper functions that strip noise from the text data (punctuation, letter case, stop words, inflection)
def strip_punctuation(data):
    """Remove every ``string.punctuation`` character from each entry of ``data``.

    The entries are overwritten in place and the same ``data`` object is
    returned.
    """
    for pos, text in enumerate(data):
        kept_chars = [ch for ch in text if ch not in punctuation]
        data[pos] = ''.join(kept_chars)
    return data

def lowercase(data):
    """Return a new list holding the lower-cased form of every string in ``data``."""
    lowered = []
    for text in data:
        lowered.append(text.lower())
    return lowered

# English stop words from the NLTK corpus, held in a set for the membership tests in strip_stop_words
stop_words = set(stopwords.words('english'))
def strip_stop_words(data):
    """Drop stop words from every entry of ``data``.

    Each entry is tokenized with ``word_tokenize``; tokens found in the
    module-level ``stop_words`` set are discarded and the remaining tokens are
    re-joined with single spaces. Entries are overwritten in place and the
    same ``data`` object is returned.
    """
    for pos, text in enumerate(data):
        kept_tokens = [token for token in word_tokenize(text) if token not in stop_words]
        data[pos] = " ".join(kept_tokens)
    return data

def lemmatizer(data):
    """Lemmatize every word of every entry in ``data`` as a verb (POS tag ``'v'``).

    Entries are split on single spaces, each piece goes through
    ``WordNetLemmatizer.lemmatize`` and the pieces are re-joined with single
    spaces. Entries are overwritten in place and the same ``data`` object is
    returned.
    """
    wordnet = WordNetLemmatizer()
    for pos, text in enumerate(data):
        lemmas = [wordnet.lemmatize(piece, 'v') for piece in text.split(' ')]
        data[pos] = " ".join(lemmas)
    return data

In [324]:
def generate_desc_dict(path, num_file):
    """Load and clean the description files ``0.txt`` .. ``<num_file - 1>.txt``.

    Args:
        path: Prefix prepended to ``"<i>.txt"``; the callers in this notebook
            pass a directory path ending in ``/``.
        num_file: Number of consecutively numbered files to read.

    Returns:
        dict mapping the file number (as a string, e.g. ``"0"``) to the list
        of cleaned lines of that file.
    """
    desc_dict = {}
    for i in range(num_file):
        filename = path + str(i) + '.txt'
        file_number = (filename.split('/')[-1]).split('.')[0]

        # Read the file; the context manager closes the handle even if an
        # error occurs, so thousands of files do not leak descriptors.
        with open(filename, "r") as file:
            contents = [line.rstrip("\n") for line in file]

        # Lowercase all of the words.
        contents = lowercase(contents)

        # Strip punctuation
        contents = strip_punctuation(contents)

        # Strip the stop words
        contents = strip_stop_words(contents)

        # Lemmatization of all the words
        contents = lemmatizer(contents)

        desc_dict[file_number] = contents

    return desc_dict

# Generate train/test description dictionaries (file number -> list of cleaned description lines)
train_desc_dict = generate_desc_dict('./data/descriptions_train/', num_train_file)
test_desc_dict = generate_desc_dict('./data/descriptions_test/', num_test_file)

In [325]:
def generate_unique_indexed_desc_dict(desc_dict):
    """Build the vocabulary: map each distinct word in ``desc_dict`` to an index.

    Args:
        desc_dict: dict mapping a file number to a list of description
            strings; each string is split on whitespace.

    Returns:
        dict mapping word -> index, with indices ``0 .. V-1`` assigned in
        sorted word order.
    """
    vocabulary = {
        word
        for descriptions in desc_dict.values()
        for desc in descriptions
        for word in desc.split()
    }

    # Sort before numbering: iterating a set of strings directly yields an
    # order that changes between interpreter runs (string hash randomization),
    # which would make the feature columns differ from run to run.
    return {word: index for index, word in enumerate(sorted(vocabulary))}

In [326]:
# Build the word -> index vocabulary from the training descriptions only, then report its size
indexed_desc_dict = generate_unique_indexed_desc_dict(train_desc_dict)
print(len(indexed_desc_dict))

7658


In [327]:
# Reference: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html

# Generate two-dimensional data structure (table)
def generate_desc_vectors(desc_dict, num_file, vocab=None):
    """Build a bag-of-words count table for the descriptions in ``desc_dict``.

    Args:
        desc_dict: dict mapping a file number (anything ``int()`` accepts) to
            a list of description strings.
        num_file: Number of rows in the table; each file number is used as a
            row index, so it must be smaller than ``num_file``.
        vocab: Optional word -> column index mapping. When omitted, the
            notebook-level ``indexed_desc_dict`` is used, so existing
            two-argument calls behave as before.

    Returns:
        pandas.DataFrame of shape ``(num_file, len(vocab))`` whose columns are
        labelled with the keys of ``vocab``. Each cell holds how many times a
        word occurred in that file; words missing from ``vocab`` are ignored.
    """
    if vocab is None:
        # Resolved at call time so the default follows the current notebook state.
        vocab = indexed_desc_dict

    desc_vector = np.zeros((num_file, len(vocab)))

    for file_num, descs in desc_dict.items():
        row = int(file_num)
        for desc in descs:
            for word in desc.split():
                column = vocab.get(word)
                if column is not None:
                    desc_vector[row, int(column)] += 1

    return pd.DataFrame(desc_vector, columns=list(vocab))

In [328]:
# Create train/test description count vectors (tables): one row per file, one column per vocabulary word
train_desc_vectors = generate_desc_vectors(train_desc_dict, num_train_file)
test_desc_vectors = generate_desc_vectors(test_desc_dict, num_test_file)
# Report the training table's shape (the previously printed name was never defined in this notebook).
print(train_desc_vectors.shape)

(10000, 7658)


In [329]:
def generate_tag_dict(file_path, num_file):
    """Read the tag files ``0.txt`` .. ``<num_file - 1>.txt`` under ``file_path``.

    Every non-blank line is split on ``':'`` and its second field is kept as
    the tag.

    Args:
        file_path: Prefix prepended to ``"<i>.txt"``; the callers in this
            notebook pass a directory path ending in ``/``.
        num_file: Number of consecutively numbered files to read.

    Returns:
        dict mapping the file number (as a string, e.g. ``"0"``) to the list
        of tags found in that file.

    Raises:
        IndexError: if a non-blank line contains no ``':'``.
    """
    tags_dict = {}
    for i in range(num_file):
        filename = file_path + str(i) + '.txt'
        file_number = (filename.split('/')[-1]).split('.')[0]

        # Read the file; the context manager guarantees the handle is closed.
        with open(filename, "r") as file:
            lines = [line.rstrip("\n") for line in file]

        # Blank lines (e.g. a trailing empty line) carry no tag; skip them
        # instead of failing on the missing second field.
        tags_dict[file_number] = [line.split(':')[1] for line in lines if line.strip()]

    return tags_dict

In [330]:
# Generate train/test tag dictionaries (file number -> list of tags read from that file).
train_tags_dict = generate_tag_dict('./data/tags_train/', num_train_file)
test_tags_dict = generate_tag_dict('./data/tags_test/', num_test_file)

In [331]:
def generate_unique_indexed_tag_dict(tag_dict):
    """Map each distinct tag found in ``tag_dict`` to an index.

    Args:
        tag_dict: dict whose values are lists of tags.

    Returns:
        dict mapping tag -> index, with indices ``0 .. T-1`` assigned in
        sorted tag order.
    """
    unique_tags = {tag for tags in tag_dict.values() for tag in tags}

    # Sort before numbering: iterating a set of strings directly yields an
    # order that changes between interpreter runs (string hash randomization),
    # which would make the label columns differ from run to run.
    return {tag: index for index, tag in enumerate(sorted(unique_tags))}

In [332]:
# Build the tag -> index mapping from the training tags only, then report how many distinct tags exist
indexed_tag_dict = generate_unique_indexed_tag_dict(train_tags_dict)
print(len(indexed_tag_dict))

80


In [333]:
# Reference: https://pandas.pydata.org/pandas-docs/version/0.23.4/generated/pandas.DataFrame.html

# Generate two-dimensional data structure (table)
def generate_tags_vector(tags_dict, num_file, tag_index=None):
    """Build a tag count table for the files in ``tags_dict``.

    Args:
        tags_dict: dict mapping a file number (anything ``int()`` accepts) to
            a list of tags.
        num_file: Number of rows in the table; each file number is used as a
            row index, so it must be smaller than ``num_file``.
        tag_index: Optional tag -> column index mapping. When omitted, the
            notebook-level ``indexed_tag_dict`` is used, so existing
            two-argument calls behave as before.

    Returns:
        pandas.DataFrame of shape ``(num_file, len(tag_index))`` whose columns
        are labelled with the keys of ``tag_index``. Each cell holds how many
        times a tag was listed for that file; tags missing from ``tag_index``
        are ignored.
    """
    if tag_index is None:
        # Resolved at call time so the default follows the current notebook state.
        tag_index = indexed_tag_dict

    tag_vector = np.zeros((num_file, len(tag_index)))

    for file_num, tags in tags_dict.items():
        row = int(file_num)
        for tag in tags:
            column = tag_index.get(tag)
            if column is not None:
                tag_vector[row, int(column)] += 1

    return pd.DataFrame(tag_vector, columns=list(tag_index))

In [334]:
# Create train/test tag vectors (tables): one row per file, one column per known tag.
# Cells are occurrence counts, so a value above 1 appears only if a tag repeats within a file.
train_tag_vectors = generate_tags_vector(train_tags_dict, num_train_file)
test_tag_vectors = generate_tags_vector(test_tags_dict, num_test_file)
print(test_tag_vectors.shape)

(2000, 80)


In [335]:
# Reference: https://stackoverflow.com/questions/18594469/normalizing-a-pandas-dataframe-by-row
# Normalize train/test vectors row-wise (each row divided by its own sum) to be
# less sensitive to the scale of features.
from sklearn import preprocessing

# A row whose sum is 0 (no in-vocabulary words) would divide to NaN and break the
# classifier downstream; dividing such rows by 1 keeps them as all-zero vectors.
train_normalized_desc_vector = train_desc_vectors.div(
    train_desc_vectors.sum(axis=1).replace(0, 1), axis=0
)
test_normalized_desc_vector = test_desc_vectors.div(
    test_desc_vectors.sum(axis=1).replace(0, 1), axis=0
)

In [336]:
# print(train_normalized_desc_vector)
# print(tag_vectors)
# print(test_normalized_desc_vector)

In [337]:
# Reference: 
# https://scikit-learn.org/stable/modules/multiclass.html
# https://www.programcreek.com/python/example/94869/sklearn.multiclass.OneVsRestClassifier
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import LinearSVC,SVC

def SVM(train, trainLabel, test):
    """Fit a one-vs-rest linear SVM on the training data and predict for ``test``.

    Args:
        train: Training feature matrix passed to ``fit``.
        trainLabel: Training label matrix passed to ``fit``.
        test: Feature matrix passed to ``predict``.

    Returns:
        Whatever ``OneVsRestClassifier.predict`` returns for ``test``.
    """
    # LinearSVC is seeded with random_state=0.
    model = OneVsRestClassifier(LinearSVC(random_state=0))
    model.fit(train, trainLabel)
    return model.predict(test)

# Perform SVM: train on the normalized training descriptions with the training tag
# table as labels, then predict a tag vector for every test description.
test_predic = SVM(train_normalized_desc_vector.values, train_tag_vectors.values, test_normalized_desc_vector.values)

In [338]:
# Sanity check: shape of the tag matrix predicted for the test descriptions
print(test_predic.shape)

(2000, 80)


In [339]:
# Reference:
# https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
# https://stackoverflow.com/questions/37782049/sklearn-kneighbours-memory-error-python
from sklearn.neighbors import NearestNeighbors

def kNN(tag_vectors, k, predic):
    """Find, for every row of ``predic``, its ``k`` nearest rows of ``tag_vectors``.

    A ``NearestNeighbors`` index using the ``'kd_tree'`` algorithm is fitted on
    ``tag_vectors`` and then queried with ``predic``.

    Returns:
        The ``(distances, indices)`` pair produced by ``kneighbors``.
    """
    searcher = NearestNeighbors(algorithm='kd_tree', n_neighbors=k)
    searcher = searcher.fit(tag_vectors)
    distances, neighbor_rows = searcher.kneighbors(predic)
    return distances, neighbor_rows

# Perform knn: for each predicted tag vector, get the distances to and row indices of
# the 20 nearest rows of test_tag_vectors
dist, predic_idx = kNN(test_tag_vectors, 20, test_predic)

In [340]:
# Show the neighbour indices returned by kNN (the name printed before was never defined).
print(predic_idx)

[[ 292   36 1743 ...  897  281  760]
 [ 833  589 1529 ...  784  948  758]
 [1724  649 1866 ... 1033 1059  953]
 ...
 [ 104 1481 1726 ...  429   58  235]
 [1218 1660 1535 ...   63   70   28]
 [1342  653 1430 ...  346  405 1129]]


In [341]:
# Reference: https://pythonspot.com/save-a-dictionary-to-a-file/
import csv

# Create top 20 dictionary: "<i>.txt" -> space-separated "<row>.jpg" names built from
# the neighbour indices of description i. Iterating over predic_idx (the kNN result)
# avoids both the undefined name and the hard-coded row count.
result_dic = {}
for i, images in enumerate(predic_idx):
    desc_id = str(i) + '.txt'
    result_dic[desc_id] = " ".join(str(image) + '.jpg' for image in images)

# Write csv file using the dictionary above; the context manager flushes and closes
# the file so the submission is complete on disk when the cell finishes.
# NOTE(review): the header spelling "Descritpion_ID" is kept as-is; presumably it must
# match the expected submission format -- confirm before changing.
with open("submission_3.csv", "w", newline='') as submission_file:
    w = csv.writer(submission_file)
    w.writerow(["Descritpion_ID", "Top_20_Image_IDs"])
    for key, val in result_dic.items():
        w.writerow([key, val])