# Huggingface OpenAI GPT and GPT2 models

Hugging Face comes with several versions of the OpenAI GPT model for generating fake text.

Here, we compare the original GPT with the later GPT2. 

Both models have the exact same architecture, but are trained with data sets of different sizes.
The GPT2 model is trained with a much larger data set and performs significantly better:


_"GPT-2 is a direct scale-up of GPT, with more than 10X the parameters and trained on more than 10X the amount of data."_  

 https://openai.com/blog/better-language-models/


### Observations

- The distribution of $\alpha$ exponents is smaller
- Information flow is better in GPT2
- Frobenius and Spectral Norms are Larger for GPT2 vs GPT
- Embedding / first 2 layers, and last couple of layers, have unusually large spectral norms

#### Could be a normalization issue ?

- BUT I think we see rank collapse in GPT, throwing the metrics


In [None]:
import numpy as np
import pandas as pd
import numpy as np
import scipy.sparse as sp
import random, datetime

from tqdm import tqdm 

import os, gc, logging
logger = logging.getLogger()

import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
!pip install transformers

In [None]:
# Configure logging at INFO level and rebind `logger` to a logger named
# after this module; this replaces the root logger bound in the first cell.
import logging
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# weightwatcher supplies the per-layer metrics analysed below; print the
# version so the results can be tied to a specific release.
import weightwatcher as ww
print("weightwatcher version {}".format(ww.__version__))


In [None]:
import torch
import transformers
from transformers import OpenAIGPTModel,GPT2Model
print("transformers version {}".format(transformers.__version__))

In [None]:
# Load the pretrained original GPT checkpoint ('openai-gpt') and switch it to
# evaluation mode. The trailing ';' keeps the notebook from echoing the
# value returned by eval().
gpt_model = OpenAIGPTModel.from_pretrained('openai-gpt')
gpt_model.eval();

In [None]:
# Load the pretrained GPT2 checkpoint ('gpt2') and switch it to evaluation
# mode. The trailing ';' keeps the notebook from echoing the value returned
# by eval().
gpt2_model = GPT2Model.from_pretrained('gpt2')
gpt2_model.eval();

In [None]:
# Run weightwatcher on the original GPT model, requesting alphas, soft ranks
# and spectral norms, with plotting switched off.
watcher = ww.WeightWatcher(model=gpt_model, logger=logger)
results = watcher.analyze(alphas=True, softranks=True, spectralnorms=True, plot=False)

# `summary` is indexed by metric name further down (e.g. 'lognorm',
# 'alpha_weighted'); presumably model-level aggregates -- confirm against the
# weightwatcher documentation.
summary =  watcher.get_summary()

# `details` is the table the plots below read per-layer columns from
# (alpha, spectralnorm, logpnorm, ...).
details  = watcher.get_details(results=results)
# Drop the slice bookkeeping columns, then every row that still has a
# missing value in any remaining column.
details.drop(columns=['slice', 'slice_count'], inplace=True)
details.dropna(inplace=True)
# Product of the N and M columns (presumably the weight-matrix dimensions --
# TODO confirm), stored as a numeric column.
details['NxM'] = pd.to_numeric(details.N * details.M) 

In [None]:
# Run weightwatcher on the GPT2 model with the same metrics as for GPT.
# NOTE(review): plot=True here while the GPT cell uses plot=False -- confirm
# the difference is intentional. `watcher` is rebound, so the GPT watcher is
# no longer reachable after this cell.
watcher = ww.WeightWatcher(model=gpt2_model, logger=logger)
results2 = watcher.analyze(alphas=True, softranks=True, spectralnorms=True, plot=True)

# `summary2` is the GPT2 counterpart of `summary`.
summary2 =  watcher.get_summary()

# `details2` is the GPT2 counterpart of `details`, cleaned the same way.
details2  = watcher.get_details(results=results2)
# Drop the slice bookkeeping columns, then every row that still has a
# missing value in any remaining column.
details2.drop(columns=['slice', 'slice_count'], inplace=True)
details2.dropna(inplace=True)
# Product of the N and M columns (presumably the weight-matrix dimensions --
# TODO confirm), stored as a numeric column.
details2['NxM'] = pd.to_numeric(details2.N * details2.M) 

In [None]:
# Per-layer alpha values of each model as plain Python lists (one element
# per row of the corresponding details table), ready for plt.hist.
alpha = list(details['alpha'].to_numpy())
alpha2 = list(details2['alpha'].to_numpy())

In [None]:
# Base-10 logarithm of every per-layer spectral norm, computed element by
# element and collected into a list for each model.
logsnorm = list(map(np.log10, details['spectralnorm'].to_numpy()))
logsnorm2 = list(map(np.log10, details2['spectralnorm'].to_numpy()))


In [None]:
# Global matplotlib defaults: 20pt base font and a 10x10 inch figure.
# The figure size set here is overridden by the 'figure.figsize': (10, 7)
# entry applied in the following cell.
plt.rcParams.update({'font.size': 20})
from pylab import rcParams
rcParams['figure.figsize'] = 10,10

In [None]:
# Shared styling for every figure in this notebook: larger legend, axis
# label, title and tick fonts, and a 10x7 inch default figure size.
params = {'legend.fontsize': 'x-large',
          'figure.figsize': (10, 7),
         'axes.labelsize': 'x-large',
         'axes.titlesize':'xx-large',
         'xtick.labelsize':'xx-large',
         'ytick.labelsize':'xx-large'}
# Applied globally, so it affects all plots created after this cell runs.
plt.rcParams.update(params)

In [None]:
# Overlaid, density-normalised histograms of the alpha values of both models
# (GPT in blue, GPT2 in red), each drawn with 100 bins.
plt.hist(alpha, bins=100, color='blue', alpha=0.5, density=True, label='openAI GPT')
plt.hist(alpha2, bins=100, color='red', alpha=0.5, density=True, label='openAI GPT2')

plt.legend()
plt.title(r"Histogram: Power Law exponents $(\alpha)$ ")
plt.xlabel(r"Power Law exponent $(\alpha)$")
plt.ylabel("Density")
plt.tight_layout()

# savefig() does not create missing directories, so make sure img/ exists
# before writing the figure.
os.makedirs("img", exist_ok=True)
plt.savefig("img/GPT-alpha-hist.png")
plt.show()

In [None]:
# Overlaid, density-normalised histograms of the log10 spectral norms of both
# models (GPT in blue, GPT2 in red), each drawn with 100 bins.
plt.hist(logsnorm, bins=100, color='blue', alpha=0.5, density=True, label='openAI GPT')
plt.hist(logsnorm2, bins=100, color='red', alpha=0.5, density=True, label='openAI GPT2')

plt.legend()
plt.title(r"Histogram: Spectral Norms $(\log\Vert\mathbf{W}\Vert_{\infty})$" )
plt.xlabel(r"log Spectral Norm $(\log\Vert\mathbf{W}\Vert_{\infty})$")
plt.ylabel("Density")
plt.tight_layout()

# savefig() does not create missing directories, so make sure img/ exists
# before writing the figure.
os.makedirs("img", exist_ok=True)
plt.savefig("img/GPT-snorm-hist.png")
plt.show()

In [None]:
# Scatter plot of alpha against layer id (the details-table index) for both
# models.
# The builtin `float` replaces the `np.float` alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
x = np.array(details.index)
y = details.alpha.to_numpy(dtype=float)
plt.scatter(x, y, label='GPT')

# GPT2 is plotted against its own index: after dropna() the two tables need
# not have the same rows, and reusing the GPT index would raise on a length
# mismatch or silently mislabel layers.
x2 = np.array(details2.index)
y2 = details2.alpha.to_numpy(dtype=float)
plt.scatter(x2, y2, label='GPT2')

plt.xlabel("layer id ")
plt.ylabel(r"$\alpha$")
plt.legend()
plt.title(r"PL Exponent in GPT and GPT2"+ "\n"+r"$\alpha$ vs layer id")

plt.tight_layout()
# savefig() does not create missing directories, so make sure img/ exists.
os.makedirs("img", exist_ok=True)
plt.savefig("img/GPT-alpha-depth.png")

In [None]:
# Notebook inspection: show a 0:10 slice of the GPT alpha column.
details.alpha[0:10]

In [None]:
# Scatter plot of the log10 spectral norm against layer id (the details-table
# index) for both models.
# The builtin `float` replaces the `np.float` alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
x = np.array(details.index)
y = np.log10(details.spectralnorm.to_numpy(dtype=float))
plt.scatter(x, y, label='GPT')

# GPT2 is plotted against its own index: after dropna() the two tables need
# not have the same rows, and reusing the GPT index would raise on a length
# mismatch or silently mislabel layers.
x2 = np.array(details2.index)
y2 = np.log10(details2.spectralnorm.to_numpy(dtype=float))
plt.scatter(x2, y2, label='GPT2')

plt.xlabel("layer id ")
plt.ylabel(r"$\log\Vert\mathbf{W}\Vert_{\infty}$")
plt.legend()
plt.title("Log Spectral Norms in GPT and GPT2\n"+r"$\log\;\Vert\mathbf{W}\Vert_{\infty}$ vs layer id")
plt.tight_layout()

# savefig() does not create missing directories, so make sure img/ exists.
os.makedirs("img", exist_ok=True)
plt.savefig("img/GPT-snorm-depth.png")

In [None]:
# Notebook inspection: show the first two rows of the GPT2 details table.
details2.head(2)

In [None]:
# Scatter plot of the `logpnorm` column against layer id (the details-table
# index) for both models. The column is plotted as-is, with no extra log.
# The builtin `float` replaces the `np.float` alias, which was removed in
# NumPy 1.24 and raises AttributeError there.
x = np.array(details.index)
y = details.logpnorm.to_numpy(dtype=float)
plt.scatter(x, y, label='GPT')

# GPT2 is plotted against its own index: after dropna() the two tables need
# not have the same rows, and reusing the GPT index would raise on a length
# mismatch or silently mislabel layers.
x2 = np.array(details2.index)
y2 = details2.logpnorm.to_numpy(dtype=float)
plt.scatter(x2, y2, label='GPT2')

plt.xlabel("layer id ")
plt.ylabel(r"$\log\Vert\mathbf{X}\Vert_{\alpha}^{\alpha}$")
plt.legend()
plt.title(r"Log $\alpha$-Norms in GPT and GPT2"+"\n"+r"$\log\;\Vert\mathbf{X}\Vert_{\alpha}^{\alpha}$ vs layer id")
plt.tight_layout()

# savefig() does not create missing directories, so make sure img/ exists.
os.makedirs("img", exist_ok=True)
plt.savefig("img/GPT-pnorm-depth.png")

In [None]:
# Format four GPT summary metrics, each to two decimals, as '&'-separated
# cells (looks like the tail of a LaTeX table row -- confirm).
# NOTE(review): the literal ends in a single backslash, whereas the loop
# further down emits two; the notebook shows this value via its repr, which
# doubles the backslash -- confirm that is the intended way to copy it.
s = summary
" & {:.2f} & {:.2f} & {:.2f} & {:.2f} \\".format(s['lognorm'], s['spectralnorm'], s['alpha_weighted'], s['logpnorm'] )



In [None]:
def avglogmetric(d, col):
    """Return the mean of log10 of column `col` of DataFrame `d`.

    The column is converted to a float array first, so object-dtype columns
    holding numbers are accepted.

    Args:
        d: pandas DataFrame holding the metric column.
        col: name of the column to average.

    Returns:
        The arithmetic mean of the base-10 logarithms of the column values.
    """
    # The builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24 and raises AttributeError there.
    values = d[col].to_numpy(dtype=float)
    return np.average(np.log10(values))

def avgmetric(d, col):
    """Return the mean of column `col` of DataFrame `d`.

    The column is converted to a float array first, so object-dtype columns
    holding numbers are accepted.

    Args:
        d: pandas DataFrame holding the metric column.
        col: name of the column to average.

    Returns:
        The arithmetic mean of the column values.
    """
    # The builtin `float` replaces the `np.float` alias, which was removed in
    # NumPy 1.24 and raises AttributeError there.
    values = d[col].to_numpy(dtype=float)
    return np.average(values)

# Print one row of averaged metrics per model (GPT first, then GPT2).
# The summary `s` is bound by the loop but not read in its body; all averages
# are recomputed from the details table `d`.
for s, d in zip([summary, summary2],[details, details2]):
    
    # Keep only rows whose level equals ww.LEVEL.SLICE.
    d = d[d.level==ww.LEVEL.SLICE]
    # Label-based slice: keep rows whose index label is 2 or greater
    # (presumably to skip the embedding layers -- TODO confirm).
    d = d.loc[2:]

    # 'norm' and 'spectralnorm' are averaged in log10 space; 'alpha_weighted'
    # and 'logpnorm' are averaged as stored.
    avglognorm = avglogmetric(d, 'norm')
    avglogsnorm = avglogmetric(d, 'spectralnorm')
    avgwalpha = avgmetric(d, 'alpha_weighted')
    avglogpnorm = avgmetric(d, 'logpnorm')
    
    # Row count followed by the four averages to two decimals, '&'-separated
    # and ending in two backslashes (looks like a LaTeX table row -- confirm).
    line = " & {} & {:.2f} &{:.2f}& {:.2f} & {:.2f} \\\\"
    print(line.format(len(d),avglognorm,avglogsnorm,avgwalpha,avglogpnorm))