In [1]:
# Locations of the two pre-generated feature archives (.npz), given relative
# to the directory this notebook is run from.
SPACY_DATASET, NLTK_DATASET = (
    '../generated_datasets/SpaCy_example_stats-and-counts-plusplus.npz',
    '../generated_datasets/NLTK_example_stats-and-counts.npz',
)

In [2]:
import numpy as np
import pandas

# Open both .npz archives (SpaCy first, then NLTK).
data_spacy, data_nltk = np.load(SPACY_DATASET), np.load(NLTK_DATASET)

# Pull the 'X' and 'y' arrays out of each archive.
X_spacy = data_spacy['X']
y_spacy = data_spacy['y']
X_nltk = data_nltk['X']
y_nltk = data_nltk['y']

In [3]:
# Names given to the leading columns of the feature matrices, in this order.
statistical_features = [
    'num_sentences',       ## Sentence count
    'avg_sent_word_len',   ## Mean sentence length, measured in words
    'avg_sent_char_len',   ## Mean sentence length, measured in characters
    'var_sent_char_len',   ## Variance of sentence length in characters
    'avg_word_len',        ## Mean word length
    'var_word_len',        ## Variance of word length
    'punct_freq',          ## Punctuation frequency
    'capital_freq',        ## Capital-letter frequency
    'types_token_ratio'    ## Types-to-tokens ratio
]

# One label per NLTK column: the named statistics cover the leading columns
# (truncated if the matrix is narrower), and every remaining column is
# labelled with its own column index as a string.
features_nltk = (
    statistical_features[:X_nltk.shape[1]]
    + [str(col) for col in range(len(statistical_features), X_nltk.shape[1])]
)

In [4]:
# Number of per-POS-tag percentage columns present in the SpaCy dataset.
NUM_POS_TAGS = 19

# Additional named columns that follow the base statistics in the SpaCy matrix.
statistical_features_extra = [
    'avg_tree_height',        ## Mean parse-tree height
    'num_internal_links',     ## Count of internal links
    'num_external_links',     ## Count of external links
]
statistical_features_extra.extend(
    map('percent_tokens_per_pos_{}'.format, range(NUM_POS_TAGS))
)

# Named columns first, then one numeric label per remaining column.
# NOTE(review): the leftover columns are numbered from 0 here, whereas
# features_nltk labels them with their absolute column index - confirm
# that this difference is intended.
features_spacy = list(statistical_features)
features_spacy.extend(statistical_features_extra)
features_spacy.extend(
    [str(offset) for offset in range(X_spacy.shape[1] - len(features_spacy))]
)

In [5]:
# Wrap each raw matrix in a DataFrame so its columns carry the labels built above.
df_spacy, df_nltk = (
    pandas.DataFrame(matrix, columns=labels)
    for matrix, labels in ((X_spacy, features_spacy), (X_nltk, features_nltk))
)

In [6]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the SpaCy features.
df_spacy.describe()

Unnamed: 0,num_sentences,avg_sent_word_len,avg_sent_char_len,var_sent_char_len,avg_word_len,var_word_len,punct_freq,capital_freq,types_token_ratio,avg_tree_height,...,41,42,43,44,45,46,47,48,49,50
count,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,...,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0
mean,37.87907,18.815001,96.417366,5698.047363,4.805143,9.058295,0.031523,0.054067,0.450313,5.702182,...,0.741085,1.034109,3.829457,0.751938,0.750388,0.488372,0.827907,0.733333,0.548837,0.868217
std,36.286613,5.977197,33.194557,17645.457031,0.815667,41.865051,0.012548,0.032851,0.123329,1.255408,...,1.617913,1.650922,7.02317,1.823888,1.390302,1.056746,2.281405,2.054435,1.202812,1.703941
min,1.0,3.8,22.617977,0.0,3.437964,3.260843,0.000598,0.00973,0.123853,2.261261,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,15.0,15.09375,74.789474,2680.987549,4.576577,5.727561,0.023614,0.038647,0.378594,5.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,27.0,18.200001,92.736839,4032.626465,4.752688,6.43089,0.029106,0.048485,0.438923,5.616667,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
75%,48.0,21.826086,113.64286,5976.160156,4.94,7.273612,0.036279,0.059531,0.509464,6.36937,...,1.0,2.0,5.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
max,295.0,87.199997,464.600006,434049.03125,23.183098,1010.938293,0.130435,0.517087,0.916667,22.4,...,15.0,16.0,73.0,21.0,12.0,10.0,22.0,18.0,18.0,17.0


In [7]:
# Per-column summary statistics (count, mean, std, min, quartiles, max) of the NLTK features.
df_nltk.describe()

Unnamed: 0,num_sentences,avg_sent_word_len,avg_sent_char_len,var_sent_char_len,avg_word_len,var_word_len,punct_freq,capital_freq,types_token_ratio,9,...,49,50,51,52,53,54,55,56,57,58
count,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,...,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0,645.0
mean,26.716279,27.630632,145.761589,19801.18,4.923357,7.320649,0.031523,0.053673,0.576591,0.631008,...,0.651163,0.570543,0.621705,1.054264,3.829457,0.762791,0.750388,0.827907,0.567442,0.868217
std,25.791721,21.877265,140.055943,143146.4,0.385684,6.557132,0.012548,0.031625,0.126856,1.677481,...,1.319062,1.188741,1.456244,1.682528,7.023169,1.84858,1.390302,2.281405,1.4448,1.703941
min,1.0,5.25,22.75,0.0,3.915929,3.195154,0.000598,0.00973,0.159218,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,10.0,20.871796,105.370369,2963.653,4.705491,5.905273,0.023614,0.038638,0.505155,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
50%,18.0,24.777779,125.75,4926.694,4.908451,6.702977,0.029106,0.048454,0.572368,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0
75%,35.0,29.666666,152.857147,9105.245,5.100529,7.475405,0.036279,0.059524,0.648579,1.0,...,1.0,1.0,1.0,2.0,5.0,1.0,1.0,1.0,1.0,1.0
max,216.0,435.0,2327.0,2405872.0,10.206413,118.809097,0.130435,0.517087,1.0,19.0,...,12.0,10.0,13.0,16.0,73.0,21.0,12.0,22.0,27.0,17.0
