In [2]:
import os

import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import nltk

from nltk.corpus import stopwords
from nltk import word_tokenize, sent_tokenize
from nltk.stem import LancasterStemmer, PorterStemmer, WordNetLemmatizer
from nltk.stem.snowball import SnowballStemmer
from nltk import ne_chunk
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation
from sqlalchemy import create_engine
%matplotlib inline 

In [3]:
# Fetch the NLTK resources this notebook relies on (stop-word list, punkt
# tokenizer data, WordNet and its omw-1.4 companion package).
for resource in ("stopwords", "punkt", "wordnet", "omw-1.4"):
    nltk.download(resource)

stop_words = set(stopwords.words("english"))

# The connection URL can be supplied through the HF_COMMITS_DB_URL environment
# variable so real credentials never have to live in the notebook. The fallback
# is the original local development URL, kept for backward compatibility.
conn = create_engine(
    os.environ.get("HF_COMMITS_DB_URL", 'postgresql://root:password@localhost:5432/hf')
)

[nltk_data] Downloading package stopwords to /home/elang/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/elang/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /home/elang/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /home/elang/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [4]:
# Load one commit message per distinct commit hash from the hf_commits table.
df = pd.read_sql_query(
    sql="""SELECT DISTINCT ON (commit_hash) commit_message
    FROM hf_commits""",
    con=conn,
)

In [5]:
# Summary of the raw commit messages: row count, number of distinct messages,
# and the most frequent message with its frequency.
df.describe()

Unnamed: 0,commit_message
count,7522
unique,2662
top,initial commit
freq,912


In [6]:
def clean_text(message):
    """Tokenize a commit message, drop stop words and short tokens, lemmatize the rest.

    Parameters
    ----------
    message : str
        Raw commit message. Any non-string value (for example ``None`` or NaN
        coming from a NULL database column) is treated as empty text instead
        of raising inside the tokenizer.

    Returns
    -------
    str
        The surviving tokens, lemmatized and joined by single spaces. Tokens
        of three characters or fewer are discarded.
    """
    if not isinstance(message, str):
        return ""

    lemmatizer = WordNetLemmatizer()
    # NLTK's English stop-word list is lowercase, so the membership test is
    # done on the lowercased token; otherwise capitalised stop words such as
    # "This", "With" or "From" would slip through the filter.
    kept_tokens = [
        lemmatizer.lemmatize(token)
        for token in word_tokenize(message)
        if len(token) > 3 and token.lower() not in stop_words
    ]

    return " ".join(kept_tokens)

In [7]:
# Overwrite the commit_message column with its cleaned form.
# Note: the column is replaced in place, so re-running this cell cleans
# already-cleaned text again.
cleaned_messages = df["commit_message"].apply(clean_text)
df["commit_message"] = cleaned_messages

In [8]:
# Preview the first rows of the cleaned commit_message column.
df.head()

Unnamed: 0,commit_message
0,initial commit
1,initial commit
2,initial commit
3,Upload CryptoPunks.py
4,Update squad_multitask.py


In [9]:
# Build a TF-IDF matrix over the cleaned messages, keeping at most 1000 terms.
# scikit-learn validates `stop_words` as 'english', a list or None, so the set
# is converted to a list (sorted to keep the argument deterministic).
vect = TfidfVectorizer(stop_words=sorted(stop_words), max_features=1000)
vect_text = vect.fit_transform(df["commit_message"])

print(vect_text.shape)
print(vect_text)

(7522, 1000)
  (0, 208)	0.6966280483300364
  (0, 482)	0.7174324792479668
  (1, 208)	0.6966280483300364
  (1, 482)	0.7174324792479668
  (2, 208)	0.6966280483300364
  (2, 482)	0.7174324792479668
  (3, 690)	0.4794475725490822
  (3, 251)	0.833599061963867
  (3, 913)	0.2743038991151113
  (4, 803)	0.88800801917058
  (4, 910)	0.20744436551815662
  (4, 690)	0.4103761604960895
  (5, 208)	0.6966280483300364
  (5, 482)	0.7174324792479668
  (6, 910)	1.0
  (7, 558)	0.5228597249372511
  (7, 711)	0.5056324788577862
  (7, 243)	0.686260522223647
  (8, 385)	0.9444650662151614
  (8, 258)	0.3286118359085547
  (9, 841)	0.26416331029925355
  (9, 721)	0.26463492384369514
  (9, 461)	0.261182836967107
  (9, 203)	0.26072698940692574
  (9, 426)	0.2613134568034284
  :	:
  (7517, 913)	0.30975310137371664
  (7518, 938)	0.5651920966008608
  (7518, 526)	0.26400321692479306
  (7518, 424)	0.26400321692479306
  (7518, 640)	0.27282130119818565
  (7518, 3)	0.4177815439363102
  (7518, 2)	0.4171032900734515
  (7518, 258)	0.

In [10]:
# Map every vocabulary term to its inverse-document-frequency weight.
idf = vect.idf_
dd = dict(zip(vect.get_feature_names_out(), idf))
# Terms ordered by ascending IDF: first has the lowest weight, last the highest.
terms_by_idf = sorted(dd, key=dd.get)

# Look the two extremes up by position rather than by hard-coded token, so the
# cell keeps working (no KeyError) when the corpus, and thus the vocabulary, changes.
print(terms_by_idf[0], terms_by_idf[-1])
print(dd[terms_by_idf[0]])
print(dd[terms_by_idf[-1]])

update zxnrm9
2.0620690081115702
9.232573093000276


In [11]:
# LSA: reduce the TF-IDF matrix to 10 components with a seeded, randomized
# truncated SVD, and keep the per-document component weights.
lsa_model = TruncatedSVD(
    n_components=10,
    algorithm='randomized',
    n_iter=10,
    random_state=42,
)
lsa_top = lsa_model.fit_transform(vect_text)

In [12]:
# Document-by-component weights from the LSA fit, followed by the matrix shape.
print(lsa_top, lsa_top.shape, sep="\n")

[[ 1.65999768e-02  9.99570406e-01 -3.16992185e-03 ...  5.72348639e-03
  -3.09401679e-06 -4.00539277e-03]
 [ 1.65999768e-02  9.99570406e-01 -3.16992185e-03 ...  5.72348639e-03
  -3.09401679e-06 -4.00539277e-03]
 [ 1.65999768e-02  9.99570406e-01 -3.16992185e-03 ...  5.72348639e-03
  -3.09401679e-06 -4.00539277e-03]
 ...
 [ 6.41197041e-02 -9.43506732e-04  3.53236600e-03 ... -2.30714138e-02
   6.77256083e-07  1.99293252e-01]
 [-3.48994792e-17 -7.64697186e-18  4.30582687e-17 ... -1.33157563e-07
   9.99999903e-01  9.37849111e-06]
 [ 9.93649491e-01 -1.69369505e-02 -1.01194131e-02 ...  7.34357119e-03
  -1.33778850e-06 -2.22471782e-03]]
(7522, 10)


In [13]:
# Component weights of the first document, scaled by 100 for readability.
first_doc_weights = lsa_top[0]
print("Document 0 :")
for topic_idx, weight in enumerate(first_doc_weights):
    print("Topic ", topic_idx, " : ", weight*100)

Document 0 :
Topic  0  :  1.6599976844997735
Topic  1  :  99.95704055306686
Topic  2  :  -0.3169921848347223
Topic  3  :  0.0051059943322054775
Topic  4  :  0.006928706006512238
Topic  5  :  -0.007288855245144336
Topic  6  :  -1.083122968753771
Topic  7  :  0.5723486389639196
Topic  8  :  -0.0003094016787536982
Topic  9  :  -0.4005392769700201


In [14]:
# Component matrix of the fitted LSA model: one row per topic, one column per
# vocabulary word.
lsa_components = lsa_model.components_
print(lsa_components.shape)  # (no_of_topics*no_of_words)
print(lsa_components)

(10, 1000)
[[ 4.86740195e-07  3.65086310e-06  5.86481307e-04 ...  1.09711500e-04
   2.27045181e-07  3.84114522e-06]
 [ 3.32531818e-07  5.90573313e-08  2.57403341e-04 ...  6.90701760e-06
   7.42587847e-08 -9.14475718e-08]
 [ 1.56098759e-04  1.29795810e-05  1.22173704e-01 ...  5.00570880e-05
   6.77139943e-06 -1.22822527e-07]
 ...
 [ 1.40166586e-04 -1.21074650e-05  4.65889537e-01 ...  1.68854097e-04
   1.54868083e-04  4.43553723e-06]
 [-2.57339609e-07  5.79153254e-08 -1.08927059e-05 ...  8.69754882e-07
  -7.40848667e-07 -2.67475048e-09]
 [ 2.61298879e-04  1.13223470e-04 -1.93439720e-01 ...  1.85679147e-03
   2.81403923e-04 -2.21777928e-06]]


In [15]:
# Vocabulary terms in column order of the TF-IDF matrix.
# get_feature_names() is deprecated/removed in recent scikit-learn releases;
# get_feature_names_out() is the supported replacement and is already used
# earlier in this notebook. It returns an array instead of a list, which zips
# with the component rows exactly the same way.
vocab = vect.get_feature_names_out()

# For every LSA topic, print the 10 terms with the largest component weight.
for i, comp in enumerate(lsa_model.components_):
    vocab_comp = zip(vocab, comp)
    sorted_words = sorted(vocab_comp, key=lambda x: x[1], reverse=True)[:10]
    print("Topic "+str(i)+": ")
    for t in sorted_words:
        print(t[0], end=" ")
    print("\n")

Topic 0: 
datasets file library note releases release tag github huggingface com 

Topic 1: 
commit initial file transformers data init 218e496519ff14b4bc69ea559616af6f2ef89e57 first parquet upload 

Topic 2: 
data upload git lfs parquet train 00494 00000 00001 test 

Topic 3: 
readme md update create py json qalb test test_ldkp delete 

Topic 4: 
json dataset_infos upload test update qalb csv py delete jsonl 

Topic 5: 
update py test_ldkp qalb data parquet test dataset metrics 00494 

Topic 6: 
data file 00000 00001 test json qalb commit validation added 

Topic 7: 
00000 00001 test csv validation upload git lfs delete jsonl 

Topic 8: 
huggingartists file 01233 train json git lfs parquet qalb dataset 

Topic 9: 
csv py create delete file upload test test_ldkp data jsonl 





In [16]:
# LDA configured for 10 topics with the 'online' learning method, a fixed seed
# and a single iteration (max_iter=1).
lda_model = LatentDirichletAllocation(
    n_components=10,
    learning_method='online',
    random_state=42,
    max_iter=1,
)

In [17]:
# Fit the LDA model and keep the per-document topic weights.
# NOTE(review): vect_text is the TF-IDF matrix built by TfidfVectorizer above;
# LDA is usually fit on raw term counts (e.g. CountVectorizer) — confirm that
# feeding TF-IDF weights here is intended.
lda_top=lda_model.fit_transform(vect_text)

In [18]:
# Shape of the document-by-topic matrix from the LDA fit, then the matrix itself.
print(lda_top.shape, lda_top, sep="\n")

(7522, 10)
[[0.04142398 0.04142398 0.04142398 ... 0.04142398 0.04142398 0.04142398]
 [0.04142398 0.04142398 0.04142398 ... 0.04142398 0.04142398 0.04142398]
 [0.04142398 0.04142398 0.04142398 ... 0.04142398 0.04142398 0.04142398]
 ...
 [0.0390533  0.03904573 0.64857814 ... 0.03904596 0.03904612 0.0390473 ]
 [0.05       0.05       0.05       ... 0.05       0.05       0.05      ]
 [0.02301723 0.02301546 0.02301623 ... 0.02301561 0.02301552 0.79285798]]


In [19]:
# Sanity check: the topic weights of the first document should add up to 1.
# ndarray.sum() replaces the manual loop that accumulated into a variable named
# `sum`, which shadowed the builtin sum() for the rest of the kernel session.
# (The summation order differs from a sequential loop, so the last printed
# floating-point digit may differ.)
print(lda_top[0].sum())

1.0


In [20]:
# Topic weights of the first document, expressed as percentages.
print("Document 0: ")
for topic_idx, share in enumerate(lda_top[0]):
    print("Topic ", topic_idx, ": ", share*100, "%")

Document 0: 
Topic  0 :  4.142398220802241 %
Topic  1 :  4.142398211989103 %
Topic  2 :  4.142398206803978 %
Topic  3 :  4.142398206682384 %
Topic  4 :  4.142398219405575 %
Topic  5 :  4.142407526460775 %
Topic  6 :  62.7184067418854 %
Topic  7 :  4.1423982068799265 %
Topic  8 :  4.142398206781031 %
Topic  9 :  4.142398252309597 %


In [21]:
# Topic-by-word component matrix of the fitted LDA model, then its shape.
print(lda_model.components_, lda_model.components_.shape, sep="\n")

[[0.9937703  0.10387214 0.10383334 ... 0.10446433 0.1035593  0.10403754]
 [0.10442454 0.10415555 0.1356207  ... 0.10392133 0.10412554 0.10514369]
 [0.10321015 0.10391832 0.10422119 ... 2.47841244 0.10368016 0.10381942]
 ...
 [0.1035873  0.10613124 0.1040642  ... 0.10396642 0.10315627 0.1247359 ]
 [0.10455492 0.11129556 0.1038151  ... 0.10424846 2.28564331 0.10384095]
 [0.10352771 0.10358448 0.10452014 ... 0.10415669 0.10323042 0.10341478]]
(10, 1000)


In [22]:
# For every LDA topic, print the 10 vocabulary terms with the largest weight.
for topic_idx, weights in enumerate(lda_model.components_):
    ranked = sorted(zip(vocab, weights), key=lambda pair: pair[1], reverse=True)
    print(f"Topic {topic_idx}: ")
    # Each term is followed by a single space, all on one line.
    print("".join(f"{term} " for term, _weight in ranked[:10]))
    print()

Topic 0: 
readme md update create jsonl adding first gz sentihood folder 

Topic 1: 
txt added loader 00011 00003 debugging upload valid 00005 fixed 

Topic 2: 
py 18 update rename metrics attempt image alffa_amharic batch13 scifi_tv_shows 

Topic 3: 
data 00000 parquet 00001 lfs git upload train metadata validation 

Topic 4: 
upload train git lfs parquet data 00494 json huggingartists dataset_infos 

Topic 5: 
dataset script card name init updated loading dummy info change 

Topic 6: 
commit initial test_ldkp version code transformers question recon bumbp datset 

Topic 7: 
sample flagged split path link download track file changed config 

Topic 8: 
csv delete upload test gitattributes import raw feature gec val 

Topic 9: 
datasets file tag huggingface github release http com note library 

