In [10]:
import pandas as pd
import re

In [3]:
tweets='Train_QuantumTunnel_Tweets.csv'
tweets=pd.read_csv(tweets,encoding='utf-8')

In [5]:
def mentions_hashtags_urls(tw):
    mnt = re.compile("@\w+(?:[-’]\w+)*|")
    hash = re.compile('#\w+(?:[-]\w+)*')
    urls = re.compile('http\S+')
    mention = " ".join(mnt.findall(tw))
    hashtag = " ".join(hash.findall(tw))
    link = " ".join(urls.findall(tw))
    return mention, hashtag, link

In [11]:
tweets['Mentions'],tweets['Hashtags'],tweets['URLs']=zip(*tweets['Tweet'].map(mentions_hashtags_urls))

In [14]:
tweets[['Mentions','URLs','Hashtags']].tail(3)

Unnamed: 0,Mentions,URLs,Hashtags
321,@R_Trotta ...,,
322,...,https://t.co/no4Usx6djV,#maths
323,...,http://t.co/fW7pSgTWGj,


In [16]:
import nltk
from nltk.tokenize.casual import TweetTokenizer

porter=nltk.PorterStemmer()

In [19]:
from nltk.corpus import stopwords
import string

stop_words=stopwords.words('english')
stop_words.extend(["i've"])

In [18]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\91821\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [22]:
def tw_preprocess(tw):
    tw = tw.lower()
    tw = re.sub("@\w+(?:[-’]\w+)*","", tw)
    tw = re.sub(r"\S*\d\S*","", tw)
    tw = re.sub("http\S+","", tw)
    tw = re.sub("[#|']", "", tw)
    tokens = TweetTokenizer().tokenize(tw)
    tokens = [t for t in tokens if t not in
    stop_words]
    tokens = [porter.stem(t) for t in tokens]
    tokens = [t for t in tokens if t not in
    string.punctuation]
    tokens = ' '.join(tokens)
    return tokens

In [23]:
tweets["Processed_Tweet"]=tweets["Tweet"].apply(tw_preprocess)

In [24]:
tweets["Processed_Tweet"].tail()

319                                   perhap peopl level
320    yay connect automat eduroam univers michigan g...
321    true mean would cinema arriv late also paid en...
322                 report card famou mathematician math
323         princeton guid linear model logist regress r
Name: Processed_Tweet, dtype: object

In [30]:
from sklearn.feature_extraction.text import CountVectorizer
no_features=1000

vectoriser=CountVectorizer(min_df=2,max_features=no_features)

In [27]:
%pip install scikit-learn

Note: you may need to restart the kernel to use updated packages.


In [32]:
tw_vectorizer=vectoriser.fit_transform(tweets["Processed_Tweet"])

In [37]:
tw_vectorizer_names=vectoriser.get_feature_names_out()

In [39]:
print(tw_vectorizer_names[:5])

['actual' 'ai' 'algorithm' 'alien' 'amaz']


In [40]:
from sklearn.decomposition import LatentDirichletAllocation

In [41]:
n_components = range(3, 8)
search_params = {"n_components": n_components,
"learning_decay": [0.6, 0.8, 1.0]}

In [42]:
lda = LatentDirichletAllocation(
max_iter=10,
learning_method="online",
random_state=0,
evaluate_every=-1,
learning_offset=50.0)

In [47]:
from sklearn.model_selection import GridSearchCV 
model = GridSearchCV(lda,param_grid=search_params,cv=15)


In [50]:
model.fit(tw_vectorizer) 
best_lda_model = model.best_estimator_


In [51]:
print("Best Model’s Params: ",model.best_params_)


Best Model’s Params:  {'learning_decay': 1.0, 'n_components': 3}
