In [54]:
#Import scripts we need
from collections import Counter
import csv
import string

import matplotlib.image as mpimg
import matplotlib.pyplot as plt
import numpy as np
from IPython.display import display, Image
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn import linear_model, svm
from sklearn.model_selection import train_test_split as cross_validation

# English stop words (from the NLTK stopwords corpus) that tokens are checked against
stop_words = set(stopwords.words('english')) 
# Punctuation characters stripped out of every token by PreProcess.
# NOTE(review): a later cell runs `from string import punctuation`, which rebinds
# this name to a plain str. `ch not in punctuation` gives the same answer for a
# single character either way, but the duplicate definition should be removed.
punctuation = set(string.punctuation)

In [55]:
# Preprocesses text for us
def PreProcess(text = str):
    """Turn raw text into a list of lowercase, punctuation-free, lemmatized words.

    Steps, in order: lowercase, split on whitespace, strip every punctuation
    character from each token, drop stop words, lemmatize what remains as a
    verb (``pos='v'``).

    Parameters
    ----------
    text : str
        The text to process. The default value is the ``str`` type itself
        (kept for backward compatibility); callers must pass a real string.

    Returns
    -------
    list of str
        The processed words in their original order.
    """
    wnl = WordNetLemmatizer()
    return_list = []
    for raw_word in text.lower().split():
        word = ''.join(ch for ch in raw_word if ch not in punctuation)
        # A token made only of punctuation (e.g. "--" or "...") collapses to ''.
        # '' is not a stop word, so it used to be appended as an empty "word".
        if not word or word in stop_words:
            continue
        return_list.append(wnl.lemmatize(word, 'v'))
    return return_list

In [56]:
# Reads data from a single file and sends it back as one flat array of words
# NOTE: Lowercases, strips punctuation, and lemmatizes all words (via PreProcess)
def GetDescFileData(path = str):
    """Read one description file and return all of its processed words.

    Every line of the file is passed through ``PreProcess`` and the resulting
    words are concatenated in file order.

    Parameters
    ----------
    path : str
        Path of the text file to read.

    Returns
    -------
    numpy.ndarray
        1-D array of the processed words (empty array for an empty file).
    """
    data = []
    # The context manager closes the handle even if PreProcess raises;
    # the previous version never closed it.
    with open(path, 'r') as desc_file:
        for line in desc_file:
            data.extend(PreProcess(line))
    return np.array(data)

In [57]:
# Reads all description data
# Outputs data as a length-2 array. Index 0 is training data, index 1 is test data.
def GetDescriptionData(n_train = 10000, n_test = 2000):
    """Load the processed descriptions of the training and test sets.

    Files are read from ``data/descriptions_train/<i>.txt`` and
    ``data/descriptions_test/<i>.txt`` for ``i`` in ``range(n_train)`` and
    ``range(n_test)`` respectively.

    Parameters
    ----------
    n_train : int, optional
        Number of training description files to read (default 10000).
    n_test : int, optional
        Number of test description files to read (default 2000).

    Returns
    -------
    numpy.ndarray
        Object array of length 2: ``[train, test]``. Each element is itself a
        1-D object array holding one word array per description file.
    """
    def to_object_array(items):
        # Descriptions have different lengths. Letting np.array() infer the
        # shape of such ragged input raises ValueError on NumPy >= 1.24, so the
        # object array is filled element by element instead.
        out = np.empty(len(items), dtype=object)
        for idx, item in enumerate(items):
            out[idx] = item
        return out

    train_data = [GetDescFileData('data/descriptions_train/' + str(i) + '.txt')
                  for i in range(n_train)]
    test_data = [GetDescFileData('data/descriptions_test/' + str(i) + '.txt')
                 for i in range(n_test)]

    data = np.empty(2, dtype=object)
    data[0] = to_object_array(train_data)
    data[1] = to_object_array(test_data)
    return data

In [58]:
#Example of desc data

# desc_data = GetDescriptionData()
# print('Complete')

In [59]:
# Reads all tag data from a single file
def GetTagFileData(path = str):
    """Read one tag file and return its tag pairs.

    Each non-blank line is split on ':' and its first two fields are
    lowercased and stripped. A field that is not a stop word is lemmatized as
    a verb (``pos='v'``), the same lemmatization ``PreProcess`` applies.

    Parameters
    ----------
    path : str
        Path of the tag file to read.

    Returns
    -------
    numpy.ndarray
        Array with one ``(first, second)`` row per tag line; an empty array
        when the file has no tag lines.

    Raises
    ------
    IndexError
        If a non-blank line contains no ':' separator.
    """
    # The context manager closes the handle; the previous version leaked it.
    with open(path, 'r') as tag_file:
        # Blank lines (e.g. a trailing newline) carry no tag and used to crash
        # with IndexError on split_line[1].
        lines = [line for line in tag_file if line.strip()]
    if not lines:
        return np.array([])

    wnl = WordNetLemmatizer()
    data = []
    for line in lines:
        split_line = line.split(':')
        pair = []
        for field in (split_line[0], split_line[1]):
            word = field.lower().strip()
            if word not in stop_words:
                # The original loop rebound its loop variable here, so the
                # lemmatized value was thrown away; store it this time.
                word = wnl.lemmatize(word, 'v')
            pair.append(word)
        data.append(tuple(pair))
    return np.array(data)

In [60]:
# Reads all tag data
# Outputs data as a length-2 array. Index 0 is training data, index 1 is test data.
def GetTagData(n_train = 10000, n_test = 2000):
    """Load the tag pairs of the training and test sets.

    Files are read from ``data/tags_train/<i>.txt`` and
    ``data/tags_test/<i>.txt`` for ``i`` in ``range(n_train)`` and
    ``range(n_test)`` respectively.

    Parameters
    ----------
    n_train : int, optional
        Number of training tag files to read (default 10000).
    n_test : int, optional
        Number of test tag files to read (default 2000).

    Returns
    -------
    numpy.ndarray
        Object array of length 2: ``[train, test]``. Each element is itself a
        1-D object array holding one tag-pair array per tag file.
    """
    def to_object_array(items):
        # Files hold different numbers of tags. Letting np.array() infer the
        # shape of such ragged input raises ValueError on NumPy >= 1.24, so the
        # object array is filled element by element instead.
        out = np.empty(len(items), dtype=object)
        for idx, item in enumerate(items):
            out[idx] = item
        return out

    # Test files are read first, matching the original read order.
    test_data = [GetTagFileData('data/tags_test/' + str(i) + '.txt')
                 for i in range(n_test)]
    train_data = [GetTagFileData('data/tags_train/' + str(i) + '.txt')
                  for i in range(n_train)]

    data = np.empty(2, dtype=object)
    data[0] = to_object_array(train_data)
    data[1] = to_object_array(test_data)
    return data

In [61]:
# Generate new tags from description and add to tags
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from string import punctuation

# Generates tags from description data: for each item, keeps only the words that
# occur at least twice across that item's tag words and description words.
def CombineDescIntoTags(desc_data = [], tag_data = []):
    """Build, per item, the list of words shared or repeated between tags and description.

    For item ``i`` the two fields of every tag pair are collected first,
    followed by the description words. Every occurrence of a word whose total
    count in that combined list is at least 2 is kept, in the combined order.

    Parameters
    ----------
    desc_data : sequence
        One iterable of words per item.
    tag_data : sequence
        One array of tag pairs per item; each entry must provide ``.tolist()``
        (e.g. a numpy array) yielding ``[first, second]`` pairs.

    Returns
    -------
    list of list
        One filtered word list per item in ``desc_data``.
    """
    combined_list = []
    for i in range(len(desc_data)):
        word_list = []
        for pair in tag_data[i].tolist():
            word_list.append(pair[0])
            word_list.append(pair[1])
        word_list.extend(desc_data[i])
        # Count once up front; calling list.count() per word was quadratic.
        occurrences = Counter(word_list)
        combined_list.append([word for word in word_list if occurrences[word] >= 2])
    return combined_list

In [62]:
# desc_data = GetDescriptionData()
# tag_data = GetTagData()
# combined_train_data = CombineDescIntoTags(desc_data[0], tag_data[0])

In [63]:
# print(combined_train_data[0])

In [64]:
# Flattens tag pairs into plain word lists, i.e. the same layout descriptions use
def ConvertTagToDescFormat(tag_data = []):
    """Flatten each entry's tag pairs into a single list of words.

    For every entry in ``tag_data``, element 0 and then element 1 of each pair
    are emitted in order, producing one flat list per entry.
    """
    return [
        [pair[slot] for pair in entry for slot in (0, 1)]
        for entry in tag_data
    ]

In [65]:
# Example of getting tag data
# tag_data = GetTagData()

In [66]:
# Reads data from one csv file
# NOTE: Can't convert to numpy array, causes memory error.
def GetFeaturesFileData(path = str):
    """Read a comma-separated file and return its rows.

    Parameters
    ----------
    path : str
        Path of the csv file to read.

    Returns
    -------
    list of list of str
        One list of string fields per row, in file order.
    """
    # newline='' is what the csv module expects of the file object, and the
    # context manager closes the handle (the previous version leaked it).
    with open(path, 'r', newline='') as csv_file:
        return list(csv.reader(csv_file, delimiter=','))

In [67]:
# GetFeaturesFileData('data/features_train/features_resnet1000_train.csv')

In [68]:
# Shows several images, one after another, given their indices
def DisplayImages(list = [], data_type = str):
    """Call ``DisplayImage`` once for every index in ``list``, in order.

    ``data_type`` is forwarded unchanged to each ``DisplayImage`` call.
    """
    pending_indices = iter(list)
    for image_index in pending_indices:
        DisplayImage(image_index, data_type)

In [69]:
# Display a single image
def DisplayImage(index = int, data_type =str):
    """Load one image by index and show it with hidden axes.

    Parameters
    ----------
    index : int
        Image number; the file read is ``<index>.jpg``.
    data_type : str
        ``'train'`` reads from ``data/images_train/``, ``'test'`` reads from
        ``data/images_test/``.

    Raises
    ------
    ValueError
        If ``data_type`` is neither ``'train'`` nor ``'test'``.
    """
    # Compare by value: `is` tests object identity, which is not guaranteed
    # for equal strings and triggers a SyntaxWarning on Python 3.8+.
    if data_type == 'train':
        image_dir = 'data/images_train/'
    elif data_type == 'test':
        image_dir = 'data/images_test/'
    else:
        # Previously this fell through and failed later on an unbound `image`.
        raise ValueError("data_type must be 'train' or 'test', got " + repr(data_type))

    image = mpimg.imread(image_dir + str(index) + '.jpg')
    print('Image ' + str(index) + '\n')
    imgplot = plt.imshow(image)
    imgplot.axes.get_xaxis().set_visible(False)
    imgplot.axes.get_yaxis().set_visible(False)
    plt.show()

In [70]:
# Example of using DisplayImages
# DisplayImages([0,1,2], 'train')

In [71]:
# Returns the distinct words of the given data set, in first-seen order
# Input should be a list of lists of words.
def GetBagOfWordsVector (data = []):
    """Collect the vocabulary of ``data``.

    Parameters
    ----------
    data : iterable of iterables
        One iterable of (hashable) words per entry.

    Returns
    -------
    numpy.ndarray
        The distinct words in the order they are first encountered.
    """
    seen = set()
    words = []
    for entry in data:
        for word in entry:
            # Set membership is O(1); scanning the growing list for every word
            # made the original quadratic in the vocabulary size.
            if word not in seen:
                seen.add(word)
                words.append(word)
    return np.array(words)

In [72]:
# Example of getting the bag of words vector from data.
# print(desc_data[0])
# DESC_BAG_VECTOR = GetBagOfWordsVector(desc_data[0])
# print(DESC_BAG_VECTOR)

In [73]:
# Converts a given list of words into a bag of words count vector based on the bag of words vector provided
def ConvertToBagVector(data = [], bag_vector = []):
    """Count how often each vocabulary word occurs in ``data``.

    Parameters
    ----------
    data : iterable
        The (hashable) words to count.
    bag_vector : numpy.ndarray or sequence
        The vocabulary. Position ``k`` of the result counts ``bag_vector[k]``.
        If a word appears more than once in the vocabulary, only its first
        position is incremented.

    Returns
    -------
    numpy.ndarray
        Count vector with ``len(bag_vector)`` entries. Words of ``data`` that
        are not in the vocabulary are ignored.
    """
    # Accept plain sequences too; the original required `.tolist()` and so
    # crashed on its own default argument.
    vocabulary = bag_vector.tolist() if hasattr(bag_vector, 'tolist') else list(bag_vector)

    # One dict lookup per word instead of `in` + `.index()` list scans.
    first_position = {}
    for position, vocab_word in enumerate(vocabulary):
        first_position.setdefault(vocab_word, position)

    vector = [0] * len(vocabulary)
    for word in data:
        position = first_position.get(word)
        if position is not None:
            vector[position] += 1
    return np.array(vector)

In [74]:
# Below is an example that builds a bag vector for index 20 of the test description data (desc_data[1])
# convert1 = ConvertToBagVector(desc_data[1][20], DESC_BAG_VECTOR)

In [75]:
# Builds the bag of words vector of every description and stacks them into one numpy array
def ConvertDescToBagVector(data = [], bag_vector = []):
    """Apply ``ConvertToBagVector`` to each entry of ``data``.

    Returns a numpy array built from the per-entry bag vectors, in input order.
    """
    bag_rows = [ConvertToBagVector(description, bag_vector) for description in data]
    return np.array(bag_rows)

In [76]:
# Below are the bag vectors of just the descriptions
# DESC_BAG_VECTORS = ConvertDescToBagVector(desc_data[0], DESC_BAG_VECTOR)
# TEST_DESC_BAG_VECTORS = ConvertDescToBagVector(desc_data[1], DESC_BAG_VECTOR)

In [77]:
# Combines description info and tag info
def CombineDescAndTags(desc_data = [], tag_data = []):
    """Append every tag word to the description words of the same item.

    Parameters
    ----------
    desc_data : sequence
        One iterable of words per item.
    tag_data : sequence
        One iterable of tags per item; every element of every tag is appended.

    Returns
    -------
    numpy.ndarray
        1-D object array with one word array per item of ``desc_data``.
    """
    combined_lines = []
    for i in range(len(desc_data)):
        new_line = list(desc_data[i])
        for tag in tag_data[i]:
            new_line.extend(tag)
        combined_lines.append(np.array(new_line))

    # Items have different word counts. np.array() on such ragged input raises
    # ValueError on NumPy >= 1.24, so the object array is filled explicitly.
    return_data = np.empty(len(combined_lines), dtype=object)
    for i, line in enumerate(combined_lines):
        return_data[i] = line
    return return_data

In [78]:
# Below is the combined data of desc and tags with no weighting to anything
# COMBINED_FLAT_DATA = CombineDescAndTags(desc_data[0], tag_data[0])
# COMBINED_FLAT_TEST_DATA = CombineDescAndTags(desc_data[1], tag_data[1])

In [79]:
# Below is the combined data and desc with tags flat bag of words vector
# COMBINED_FLAT_BAG_VECTOR = GetBagOfWordsVector(COMBINED_FLAT_DATA)

In [80]:
# Below is the conversion of the flat data into flat bag vectors
# FLAT_BAG_DATA = ConvertDescToBagVector(COMBINED_FLAT_DATA, COMBINED_FLAT_BAG_VECTOR)

In [81]:
# Combines description info and tag info with weights
# Weights are done just by adding the word multiple times.
# NOTE: Default tag weights: first tag field = 2, second tag field = 3.
# NOTE(review): the original note read "Sub = 2. Super = 3." - confirm which
# field of a tag pair is the sub tag and which is the super tag.
def CombineDescAndTagsWeighted(desc_data = [], tag_data = [], first_weight = 2, second_weight = 3):
    """Append repeated tag words to the description words of the same item.

    Parameters
    ----------
    desc_data : sequence
        One iterable of words per item.
    tag_data : sequence
        One iterable of tag pairs per item.
    first_weight : int, optional
        How many times ``tag[0]`` is appended (default 2).
    second_weight : int, optional
        How many times ``tag[1]`` is appended (default 3).

    Returns
    -------
    numpy.ndarray
        1-D object array with one word array per item of ``desc_data``.
    """
    combined_lines = []
    for i in range(len(desc_data)):
        new_line = list(desc_data[i])
        for tag in tag_data[i]:
            new_line.extend([tag[0]] * first_weight)
            new_line.extend([tag[1]] * second_weight)
        combined_lines.append(np.array(new_line))

    # Items have different word counts. np.array() on such ragged input raises
    # ValueError on NumPy >= 1.24, so the object array is filled explicitly.
    return_data = np.empty(len(combined_lines), dtype=object)
    for i, line in enumerate(combined_lines):
        return_data[i] = line
    return return_data

In [82]:
# Below is the combined data of desc and tags, with tag words weighted by repetition
# COMBINED_WEIGHTED_DATA = CombineDescAndTagsWeighted(desc_data[0], tag_data[0])

In [83]:
# Below is the combined bag of words vector for combined weighted data
# COMBINED_WEIGHTED_BAG_VECTOR = GetBagOfWordsVector(COMBINED_WEIGHTED_DATA)

In [84]:
# Below is the weighted bag data using the combined weighted data and weighted bag vector
# WEIGHTED_BAG_DATA = ConvertDescToBagVector(COMBINED_WEIGHTED_DATA, COMBINED_WEIGHTED_BAG_VECTOR)

In [85]:
# print(desc_data[0][0])