In [12]:
# from rnn import read_data
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from scipy.spatial import distance
import codecs

In [28]:
def read_data(file_name):
    """
    Read a CoNLL-style file into per-sentence word and label sequences.

    Every non-blank line is stripped and split on tabs; the first field is
    taken as the word and the second as its label. A blank line ends the
    current sentence.

    :param file_name: path to read from (decoded as UTF-8)
    :returns: list of ``(words, tags)`` tuples, one per sentence, where both
        elements are lists of strings of equal length
    :raises IndexError: if a non-blank line has no second tab-separated field
    """
    data = []
    current_words = []
    current_tags = []

    # The context manager closes the handle even when a malformed line raises.
    with codecs.open(file_name, encoding='utf-8') as conll_file:
        for line in conll_file:
            line = line.strip()

            if line:
                tok = line.split('\t')
                word, tag = tok[0], tok[1]

                current_words.append(word)
                current_tags.append(tag)

            elif current_words:
                # A blank line closes the sentence; runs of blank lines add nothing.
                data.append((current_words, current_tags))
                current_words = []
                current_tags = []

    # The file may not end with a blank line, so flush the last sentence.
    if current_words:
        data.append((current_words, current_tags))
    return data


def unite_lists(data):
    """
    Flatten one level of nesting.

    :param data: iterable of iterables
    :returns: new list holding the items of every inner iterable, in order
    """
    return [item for chunk in data for item in chunk]



In [25]:
# Load the dev split of every domain; the targets on the left are bound in
# the same order as the paths below.
conll, ai, literature, music, politics, science = (
    read_data(dev_path)
    for dev_path in (
        "../Data/conll2003/dev.txt",
        "../Data/ai/dev.txt",
        "../Data/literature/dev.txt",
        "../Data/music/dev.txt",
        "../Data/politics/dev.txt",
        "../Data/science/dev.txt",
    )
)

In [31]:
def _corpus_text(sentences):
    """
    Join every word of every sentence into one space-separated string.

    :param sentences: list of ``(words, tags)`` tuples; only the words are used
    :returns: a single string, empty when ``sentences`` is empty
    """
    # Iterating the tuples directly avoids building a throwaway DataFrame,
    # whose column lookup raises KeyError on an empty corpus.
    return ' '.join(word for words, _tags in sentences for word in words)


# One flat text document per domain.
conll_str = _corpus_text(conll)
ai_str = _corpus_text(ai)
literature_str = _corpus_text(literature)
music_str = _corpus_text(music)
politics_str = _corpus_text(politics)
science_str = _corpus_text(science)

In [32]:
# One document per domain; `domain_names` labels the rows in the same order.
texts = [conll_str, ai_str, literature_str, music_str, politics_str, science_str]
domain_names = ['conll', 'ai', 'literature', 'music', 'politics', 'science']

# Default CountVectorizer settings: no stop-word filtering.
count_vectorizer = CountVectorizer()
matrix = count_vectorizer.fit_transform(texts)

# get_feature_names() was removed in scikit-learn 1.2; fall back to it only
# on versions that predate get_feature_names_out().
if hasattr(count_vectorizer, 'get_feature_names_out'):
    feature_names = count_vectorizer.get_feature_names_out()
else:
    feature_names = count_vectorizer.get_feature_names()

# toarray() gives a plain ndarray instead of the discouraged np.matrix.
table = matrix.toarray()
df = pd.DataFrame(table, columns=feature_names, index=domain_names)
df





Unnamed: 0,00,000,0003,000s,000th,002,0025,003,005,006,...,đilas,đinđić,định,ōe,śmigły,śrīharṣa,šetalište,ἀχίλλειον,白川,英樹
conll,122,339,2,2,1,2,1,3,2,2,...,0,0,0,0,0,0,0,0,0,0
ai,0,1,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
literature,0,1,0,0,0,0,0,0,0,0,...,0,0,0,1,0,1,0,1,0,0
music,0,2,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
politics,0,2,0,0,0,0,0,0,0,0,...,1,1,1,0,1,0,1,0,0,0
science,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,1


In [34]:
# Pairwise Euclidean distance between the rows of `df`.
# NOTE(review): `df` holds raw term counts, so these distances also reflect
# differences in corpus size; consider normalising the rows first if only
# the vocabulary profile should matter. Confirm the intended metric.
matrix = distance.cdist(df, df, 'euclidean')

# Label both axes from df's own index so the labels cannot drift out of
# sync with the row order.
df_eucl = pd.DataFrame(matrix, columns=df.index, index=df.index)
df_eucl


Unnamed: 0,conll,ai,literature,music,politics,science
conll,0.0,11997.009752,11069.931301,11971.230137,11578.892218,11676.83279
ai,11997.009752,0.0,1044.036398,157.391232,530.475259,383.701707
literature,11069.931301,1044.036398,0.0,1009.260125,647.86264,708.23513
music,11971.230137,157.391232,1009.260125,0.0,505.155422,367.381273
politics,11578.892218,530.475259,647.86264,505.155422,0.0,301.948671
science,11676.83279,383.701707,708.23513,367.381273,301.948671,0.0
