In [11]:
import pandas as pd
import re
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.decomposition import LatentDirichletAllocation
import lda
from nltk.tokenize import word_tokenize, RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import nltk
from tqdm import tqdm
import re
import numpy as np

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Arjit\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Arjit\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Arjit\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\Arjit\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [12]:
df = pd.read_csv('data_scrapped.csv', index_col=0)

In [13]:
df

Unnamed: 0,group,text
0,DeFimillion,⛏ To succeed where others have failed! To lead...
1,VerifiedCryptoNews,BITCOIN Update: BTCUSDT - H4 chart. Date: 29th...
2,universalcryptosignals,Follow us on tradingview for daily upto 10 fre...
3,altsignals,**You struggled with these fake and eye catchi...
4,mycryptopedia,**🤣 Meme of the Week 🤣** **👉TIP OF THE DAY**👈...
5,btcchamp,💎`STYLIKE by FASHION` `TV💎 📲A web3 fashion ap...
6,onwardbtc_official,https://youtu.be/eetgnDcHku8 https://youtu.be/...


In [14]:
ignore = set(stopwords.words('english'))
## load stopwords.txt and add to ignore
with open('stopwords.txt', 'r') as f:
    for line in f:
        ignore.add(line.strip())
        
tokenizer = RegexpTokenizer(r'\w+')  # remove punctuation and emojis from text
stemmer = WordNetLemmatizer()  # lemmatize words

In [15]:
data = df['text'].tolist()

In [16]:
# Preprocess the text data (can be improved)
text = []
for i in range(len(data)):
    stemmed = []
    words = tokenizer.tokenize(data[i])
    for word in tqdm(words):
        word = re.sub(r'\d+', '', word)
        word = word.strip()
        if word[:4] == 'box_' or word[:2] == '0x':
            continue
        if word not in ignore:
            stemmed.append(stemmer.lemmatize(word))
    text.append(' '.join(stemmed))


100%|██████████| 26845/26845 [00:01<00:00, 16567.10it/s]
100%|██████████| 181664/181664 [00:00<00:00, 192601.06it/s]
100%|██████████| 90492/90492 [00:00<00:00, 193307.60it/s]
100%|██████████| 167932/167932 [00:00<00:00, 183488.74it/s]
100%|██████████| 124026/124026 [00:00<00:00, 189306.79it/s]
100%|██████████| 47639/47639 [00:00<00:00, 185323.66it/s]
100%|██████████| 62334/62334 [00:00<00:00, 211248.03it/s]


In [17]:
# choosing N-grams from 1 to 2 i.e. unigrams and bigrams
vec = CountVectorizer(analyzer='word', ngram_range=(1, 2)) 
X = vec.fit_transform(text)


In [42]:
# 1 topic
model = LatentDirichletAllocation(n_components=1,learning_method='online') 
model.fit(X)


LatentDirichletAllocation(learning_method='online', n_components=1)

In [43]:
topic_word = model.exp_dirichlet_component_


In [44]:
n_top_words = 500 # Top 500 words to display for topic

In [45]:
vocab = vec.get_feature_names_out()
for i, topic_dist in enumerate(topic_word):
    topic_words = np.array(vocab)[np.argsort(topic_dist)][:-(n_top_words+1):-1]
    print(topic_words)


['profit' 'target' 'btc' 'binance' 'usdt' 'profit target' 'period'
 'minutes' 'profit period' 'hours' 'hours minutes' 'take' 'take profit'
 'target profit' 'bitcoin' 'period hours' 'futures' 'binance futures'
 'usdt take' 'we' 'bybit' 'market' 'crypto' 'bitmex' 'buy' 'the' 'price'
 'join' 'eth' 'trade' 'minutes binance' 'trading' 'marketcap' 'get' 'days'
 'days hours' 'signal' 'all' 'period days' 'support' 'volume' 'vip'
 'channel' 'sell' 'btc usdt' 'time' 'achieved' 'stop' 'target achieved'
 'tradingview' 'hit' 'achieved profit' 'all profit' 'loss' 'also' 'usd'
 'bybit usdt' 'bittrex' 'update' 'token' 'resistance' 'xrp' 'leverage'
 'this' 'one' 'short' 'io' 'ltc' 'term' 'post' 'good' 'long' 'bch' 'week'
 'verifiedcryptonews' 'month' 'exchange' 'eos' 'free' 'target target'
 'dominance' 'usdt all' 'level' 'it' 'btc usd' 'first' 'coin' 'still'
 'altsignals' 'stop loss' 'premium' 'kucoin' 'change' 'if' 'news' 'next'
 'ada' 'chart' 'today' 'last' 'buy sell' 'blockchain' 'go' 'old'
 'crypto

In [46]:
# sample conversation
convo = "Yea this is definitely the case. They only started their crypto division in 2017. So plenty of capital to bail out their subsidiary. Now all eyes on that wallet to see where the eth moves."

In [47]:
stemmed = []
words = tokenizer.tokenize(convo)
for word in tqdm(words):
    word = re.sub(r'\d+', '', word)
    word = word.strip()
    if word[:4] == 'box_' or word[:2] == '0x':
        continue
    if word not in ignore:
        stemmed.append(stemmer.lemmatize(word))


100%|██████████| 35/35 [00:00<00:00, 34663.67it/s]


In [48]:
for word in stemmed: # print extracted keywords
    for keyword in topic_words:
        if word == keyword:
            print(word)

crypto
capital
wallet
eth
