In [1]:
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
from multiprocessing import Pool
import numpy as np

In [2]:
tweets = pd.read_csv('tweets.csv')

In [3]:
# fill empty cells with empty string
tweets = tweets.fillna('')

In [4]:
# lowercase and remove special characters
tweets['tweet'] = tweets['tweet'].apply(lambda x: ''.join([c.lower() for c in ' '.join(x.split()) if ord(c) < 128]))
tweets['accountDescription'] = tweets['accountDescription'].apply(lambda x: ''.join([c.lower() for c in ' '.join(x.split()) if ord(c) < 128]))
tweets['hashtags'] = tweets['hashtags'].str.lower().str.strip('][')


In [5]:
print(tweets[['tweet', 'hashtags']].head(5))

                                               tweet  \
0  rt @officialtravlad: $20,000 #crypto giveaway ...   
1  the nightly mint: daily nft recap https://t.co...   
2  @abnormal_crypto https://t.co/smntf0k9f0 join ...   
3  rt @ydragons_: its here and its ready for you!...   
4  @stepnofficial @crypto_birb how long does it t...   

                                            hashtags  
0                                           'crypto'  
1  'bitcoin', 'crypto', 'blockchain', 'eth', 'cry...  
2  'bitcoin', 'crypto', 'blockchain', 'eth', 'cry...  
3                                        'avalanche'  
4                                        'avalanche'  


In [6]:
# words we don't want neither in tweet nor hashtags
tweets_to_remove = [
    'giveaway', 
    'give away',
    'gift',
    'historic', 
    'giving',
    'avalanche', 
    'airdrop',
    'game', 
    'gaming', 
    'loto', 
    'lottery', 
    'winner', 
    'smartphone', 
    'cash', 
    'money', 
    'bone',
    'hours',
    'promote',
    'hrs',
    'cash',
    '48hrs',
    'idr',
    'xgem',
    'giezwacoin',
    'telegram',
    'whopping',
    'chat',
    'launch',
    'project',
    'passive',
    'token',
    'free',
    'account',
    'grab'
    ]

# words we don't want in account description
desc_to_remove = [
    'giveaway', 
    'give away',
    'gift',
    'avalanche', 
    'airdrop',
    'game', 
    'gaming', 
    'loto', 
    'lottery', 
    'winner', 
    'smartphone', 
    'bone',
    'promote',
    'hrs',
    '48hrs',
    'idr',
    'xgem',
    'giezwacoin',
    'whopping',
    'launch',
    'passive',
    'free'
    ]

In [7]:
print('Rows before cleaning: ' + str(tweets.shape[0]))
# clean from hastags col
tweets=tweets[~tweets.hashtags.str.contains('|'.join(tweets_to_remove), na=False)]
print('Rows after hashtags cleaning: ' + str(tweets.shape[0]))
# clean from tweet col
tweets=tweets[~tweets.tweet.str.contains('|'.join(tweets_to_remove))]
print('Rows after tweet cleaning: ' + str(tweets.shape[0]))
# clean from description col
tweets=tweets[~tweets.accountDescription.str.contains('|'.join(desc_to_remove))]
print('Rows after accountDescription cleaning: ' + str(tweets.shape[0]))


Rows before cleaning: 389989
Rows after hashtags cleaning: 281393
Rows after tweet cleaning: 177107
Rows after accountDescription cleaning: 167516


In [8]:
import re
# remove urls
tweets['tweet'] = tweets['tweet'].apply(lambda x: re.sub(r'http\S+', '', x))
# remove usernames
tweets['tweet'] = tweets['tweet'].apply(lambda x: (re.sub('@[^\s]+','',x)))
# remove # character
# tweets['tweet'] = tweets['tweet'].str.replace('#', '')

In [9]:
# strip whitespaces
tweets['tweet'] = tweets['tweet'].str.strip()
tweets = tweets[tweets['tweet']!='']
print('Rows after empty tweet cleaning: ' + str(tweets.shape[0]))

Rows after empty tweet cleaning: 156617


In [57]:
tweets.to_csv('tweets_no_scams.csv')

In [58]:
def porter_stemmer(sentence, cores=4):
    with Pool(processes=cores) as pool:
        stemmer = PorterStemmer()
        result = " ".join(pool.map(stemmer.stem, sentence.split(" ")))
    return result

vfunc = np.vectorize(porter_stemmer)
tweets['tweet_stemmer'] = vfunc(np.array(tweets['tweet']))

# tweets['tweet'] = tweets['tweet'].apply(porter_stemmer)
print(tweets['tweet_stemmer'].head(10))

1     the nightli mint: daili nft recap  #bitcoin #c...
10    when bitcoin at $500,000? novogratz say five y...
14    rt   we have our first #bitcoin royalty: princ...
16    whi vechain (vet) could be on the verg of a 40...
26    dogecoin price fall as india central bank ban ...
29    rt  stay tuned!  $flux #fluxnod #crypto #naa #...
30                                          awesom elon
31    bitcoin bullish signal: exchang reserv lose an...
33    rt  #crypto exchang which will be list $ape ( ...
39    what bitcoin role after end of petrodollar sys...
Name: tweet_stemmer, dtype: object


In [59]:
tweets.to_csv('tweets_stemmed.csv')

In [69]:
tweets['tweet_stemmer_hashtags'] = tweets['tweet_stemmer'] + ' ' + tweets['hashtags'].apply(lambda x: " ".join(["#"+c.strip() for c in x.replace("'","").split(',')]))
print(tweets['tweet_stemmer_hashtags'].head(10))

1     the nightli mint: daili nft recap  #bitcoin #c...
10    when bitcoin at $500,000? novogratz say five y...
14    rt   we have our first #bitcoin royalty: princ...
16    whi vechain (vet) could be on the verg of a 40...
26    dogecoin price fall as india central bank ban ...
29    rt  stay tuned!  $flux #fluxnod #crypto #naa #...
30           awesom elon #fluxnodes #crypto #naas #web3
31    bitcoin bullish signal: exchang reserv lose an...
33    rt  #crypto exchang which will be list $ape ( ...
39    what bitcoin role after end of petrodollar sys...
Name: tweet_stemmer_hashtags, dtype: object


In [70]:
tweets.to_csv('tweets_stemmed_hashtags.csv')

In [77]:
tweets['tweet_stemmer_hashtags'] = tweets['tweet_stemmer_hashtags'].replace(r'\s+', ' ', regex=True)
print(tweets['tweet_stemmer_hashtags'].head(5))

1     the nightli mint: daili nft recap #bitcoin #cr...
10    when bitcoin at $500,000? novogratz say five y...
14    rt we have our first #bitcoin royalty: princ p...
16    whi vechain (vet) could be on the verg of a 40...
26    dogecoin price fall as india central bank ban ...
Name: tweet_stemmer_hashtags, dtype: object


In [78]:
tweets.to_csv('tweets_stemmed_hashtags.csv')

In [82]:
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words = set(stopwords.words('english'))

tweets['tweet_stemmer_hashtags_no_stopwords'] = tweets['tweet_stemmer_hashtags'].apply(lambda x: " ".join([c for c in x.split(' ') if c not in stop_words]))

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/matthieu/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [83]:
print(tweets['tweet_stemmer_hashtags_no_stopwords'].head(5))

1     nightli mint: daili nft recap #bitcoin #crypto...
10    bitcoin $500,000? novogratz say five year #bit...
14    rt first #bitcoin royalty: princ philip yugosl...
16    whi vechain (vet) could verg 40% ralli #bitcoi...
26    dogecoin price fall india central bank ban cry...
Name: tweet_stemmer_hashtags_no_stopwords, dtype: object


In [84]:
tweets.to_csv('tweets_stemmed_hashtags_no_stopwords.csv')

## Vectorizing all words

In [86]:
from sklearn.feature_extraction.text import CountVectorizer

# vectorizing with frequency filter
#vectorizer = CountVectorizer(min_df=0.05, max_df=0.8)
vectorizer = CountVectorizer()
vect = vectorizer.fit(tweets["tweet_stemmer_hashtags_no_stopwords"])
n_features3 = len(vect.vocabulary_)
print("There are %s features in vocabulary" % n_features3)
X = vect.transform(tweets["tweet_stemmer_hashtags_no_stopwords"])

There are 25256 features in vocabulary


In [87]:
print(vect.vocabulary_)



In [88]:
print(X)

  (0, 3096)	1
  (0, 3104)	1
  (0, 4708)	2
  (0, 4887)	2
  (0, 7117)	2
  (0, 7208)	1
  (0, 7218)	1
  (0, 7329)	1
  (0, 7330)	1
  (0, 7613)	1
  (0, 9535)	2
  (0, 15403)	1
  (0, 16200)	1
  (0, 16463)	1
  (0, 18942)	1
  (1, 1)	1
  (1, 1729)	1
  (1, 3096)	1
  (1, 3104)	1
  (1, 4708)	3
  (1, 4887)	2
  (1, 7117)	2
  (1, 7208)	1
  (1, 7218)	1
  (1, 7329)	1
  :	:
  (156628, 7117)	1
  (156628, 14057)	1
  (156628, 14454)	1
  (156628, 15132)	1
  (156628, 15165)	1
  (156628, 18207)	1
  (156628, 20075)	1
  (156628, 20198)	1
  (156628, 20628)	1
  (156629, 2572)	1
  (156629, 3028)	1
  (156629, 5915)	1
  (156629, 6251)	1
  (156629, 7117)	1
  (156629, 8570)	1
  (156629, 8921)	1
  (156629, 10195)	1
  (156629, 10999)	1
  (156629, 11448)	1
  (156629, 11513)	2
  (156629, 12039)	1
  (156629, 14362)	1
  (156629, 16797)	1
  (156629, 19608)	1
  (156629, 20005)	1
