In [1]:
import pandas as pd
from tqdm import tqdm

tqdm.pandas()

Shortcuts:
- Allready separated ngrams further down
- Year/depency/adjective summarized

In [2]:
df = pd.read_csv('./data/ngram_dependencies_adjectives.csv')
print(df.head())
print(df.info())

                    ngram  year  count
0  ongoing_adj=>later_adj  1973      2
1  ongoing_adj=>later_adj  1977      5
2  ongoing_adj=>later_adj  1980      6
3  ongoing_adj=>later_adj  1982      2
4  ongoing_adj=>later_adj  1986      3
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 63719879 entries, 0 to 63719878
Data columns (total 3 columns):
 #   Column  Dtype 
---  ------  ----- 
 0   ngram   object
 1   year    int64 
 2   count   int64 
dtypes: int64(2), object(1)
memory usage: 1.4+ GB
None


In [3]:
df['ngram'] = df['ngram'].str.replace('_adj', '', n=2)
df.head()

Unnamed: 0,ngram,year,count
0,ongoing=>later,1973,2
1,ongoing=>later,1977,5
2,ongoing=>later,1980,6
3,ongoing=>later,1982,2
4,ongoing=>later,1986,3


In [4]:
df[['dependency', 'adjective']] = df['ngram'].str.split('=>', expand=True)
df.head()

KeyboardInterrupt: 

In [None]:
df_no_ngram = df.drop('ngram', 1)

In [6]:
df_no_ngram.to_csv('./data/ngram_dependencies_adjectives_separated.csv', index=False)

In [5]:
df_no_ngram = pd.read_csv('./data/ngram_dependencies_adjectives_separated.csv')
df_no_ngram.head()

In [6]:
TOP_N = 1000

adjective_occurences = df_no_ngram\
    .groupby('adjective')\
    .sum()\
    .sort_values('count', ascending=False)\
    .reset_index()

tops = adjective_occurences['adjective'][:TOP_N]
tops_set = set(tops)

In [7]:
df_tops = df_no_ngram[df_no_ngram['adjective'].progress_apply(lambda x: x in tops_set)]

100%|██████████| 63719879/63719879 [00:52<00:00, 1223997.67it/s]


In [8]:
print(f'Size before: {df_no_ngram.shape}')
print(f'Size after: {df_tops.shape}')

Size before: (63719879, 4)
Size after: (26873815, 4)


In [9]:
df_summarized = df_tops.groupby(['dependency', 'year', 'adjective']).sum().reset_index()

In [11]:
df_summarized = df_summarized.reset_index().rename(columns={'year': 'calendar_year'})

In [16]:
df_summarized.to_csv("./data/ngram_dependencies_adjectives_separated_summarized.csv", index=False)

In [12]:
df_summarized = pd.read_csv("./data/ngram_dependencies_adjectives_separated_summarized.csv")

In [14]:
df_wide = pd.pivot(df_summarized, index=['dependency', 'calendar_year'], columns='adjective', values='count').reset_index()

In [64]:
df_wide_filled = df_wide.fillna(0)

In [65]:
print(df_wide_filled.info())
df_wide_filled.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132597 entries, 0 to 132596
Columns: 1002 entries, dependency to zero
dtypes: float64(1000), int64(1), object(1)
memory usage: 1013.7+ MB
None


adjective,dependency,calendar_year,1,10,11,12,13,14,15,16,...,worst,worth,worthy,wrong,year,yellow,york,young,younger,zero
0,abatable,1910,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,abatable,1913,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,abatable,1917,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,abatable,1918,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,abatable,1930,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [66]:
df_wide_filled_indices = df_wide_filled.iloc[:, :2]
df_wide_filled_features = df_wide_filled.iloc[:, 2:]
row_mean = df_wide_filled_features.mean(axis=1)
row_std = df_wide_filled_features.std(axis=1)

In [67]:
df_wide_filled_features_normalized = df_wide_filled_features.subtract(row_mean, axis=0).divide(row_std, axis=0)
df_wide_filled_normalized = pd.concat([df_wide_filled_indices, df_wide_filled_features_normalized], axis=1)

In [56]:
print(df_wide_filled_normalized.info())
df_wide_filled_normalized.head()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 132597 entries, 0 to 132596
Columns: 1002 entries, dependency to zero
dtypes: float64(1000), int64(1), object(1)
memory usage: 1013.7+ MB
None


adjective,dependency,calendar_year,1,10,11,12,13,14,15,16,...,worst,worth,worthy,wrong,year,yellow,york,young,younger,zero
0,abatable,1910,0.0,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,...,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639
1,abatable,1913,0.0,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,...,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639
2,abatable,1917,0.0,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,...,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639
3,abatable,1918,0.0,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,...,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639
4,abatable,1930,0.0,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,...,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639,-0.031639


In [68]:
df_wide_filled.to_csv("./data/ngram_dependencies_adjectives_wide.csv", index=False)

In [69]:

df_wide_filled_normalized.to_csv("./data/ngram_dependencies_adjectives_wide_normalized.csv", index=False)