In [72]:
import re
from typing import List, Optional

import pandas as pd

In [73]:
# Read the tweet export and keep only its "tweet" column (a pandas Series).
df = pd.read_csv('./ye-tweets.csv')["tweet"]
df

0                      KANYE 2024 https://t.co/Zm2pKcn12t
1                      I VOTED 🇺🇸 https://t.co/hlgIJUST4x
2                    KANYE2020 🇺🇸 https://t.co/3kd8vrrHZQ
3                               🕊 https://t.co/tFqpKyQzkY
4       The first vote of my life         We are here ...
                              ...                        
1662    try to avoid any contractual situation where y...
1663    You have to protect your ability to create at ...
1664    As a creative your ideas are your strongest fo...
1665    often people working with the existing conscio...
1666    Some people have to work within the existing c...
Name: tweet, Length: 1667, dtype: object

In [74]:
# Module-level constants shared by the tokenisation cells.

# Punctuation characters. No cell visible here references this constant.
PUNCTUATIONS = r"""!()-[]{};:'"\,<>./?@#$%^&*_~"""

# Words dropped during tokenisation. Every entry is lowercase because tokens
# are lowercased before being compared against this list, and each word is
# listed exactly once.
STOPWORDS = [
    "the", "is", "are", "an", "be", "i", "of", "example",
    "both", "yeah", "in", "on", "at", "and", "because", "but",
    "been", "by", "a", "for", "from",
    "all", "as", "can", "can't", "about", "do", "even", "every"
]

In [84]:
# Pre-compiled patterns used by tokenize().
# Links: scheme://host.tld followed by at least one more URL character.
_URL_RE = re.compile(
    r'(http|ftp|https):\/\/([\w_-]+(?:(?:\.[\w_-]+)+))([\w.,@?^=%&:\/~+#-]*[\w@?^=%&\/~+#-])'
)
# @mentions anywhere in the text (not preceded by a word character, so the
# "@" inside something like an e-mail address is left alone).
_MENTION_RE = re.compile(r'(?<!\w)@\w+')
# Runs of word characters / "^" / "'" that contain at least one ASCII letter.
_TOKEN_RE = re.compile(r'[A-Za-z]+[\w^\']*|[\w^\']*[A-Za-z]+[\w^\']*')


def tokenize(
    text: str,
    stopwords: Optional[List[str]] = None,
    clean_token: Optional[List[str]] = None,
    corpus: Optional[List[str]] = None,
) -> List[str]:
    """Split ``text`` into lowercase tokens, dropping links, @mentions and stopwords.

    Args:
        text: Raw text to tokenize.
        stopwords: Words to leave out of the returned tokens. Compared
            case-insensitively. Defaults to the module-level ``STOPWORDS``,
            looked up at call time.
        clean_token: Optional accumulator. When given, the non-stopword tokens
            of ``text`` are appended to it and the list is sorted in place.
            When omitted, a fresh list is used for every call.
        corpus: Optional accumulator of distinct tokens (stopwords included).
            Tokens not yet present are appended and the list is sorted in
            place. When omitted, a fresh list is used for every call.

    Returns:
        ``clean_token`` (the accumulator that was passed in, or the fresh
        list), sorted. Always a list, even when ``text`` yields no tokens.
    """
    # Resolve defaults per call so no list is shared between calls.
    if stopwords is None:
        stopwords = STOPWORDS
    if clean_token is None:
        clean_token = []
    if corpus is None:
        corpus = []

    # Strip links first (a URL may itself contain "@"), then @mentions.
    without_links = _URL_RE.sub('', text)
    without_mentions = _MENTION_RE.sub('', without_links)

    # Keep only tokens with at least one letter; punctuation and emoji are
    # never matched by the token pattern, so no separate clean-up pass is needed.
    tokens = _TOKEN_RE.findall(without_mentions.lower())

    # Tokens are lowercase, so compare against lowercased stopwords.
    blocked = {word.lower() for word in stopwords}
    known = set(corpus)

    for token in tokens:
        if token not in blocked:
            clean_token.append(token)
        if token not in known:
            known.add(token)
            corpus.append(token)

    # Sort once, after every token of this text has been processed.
    clean_token.sort()
    corpus.sort()
    return clean_token

In [78]:
# Accumulators that tokenize() fills in place while the loop walks the tweets.
tokens = []  # non-stopword tokens appended by tokenize()
corpus = []  # vocabulary list: tokenize() only appends a token that is not already present

# The return value of tokenize() is ignored here; only the in-place updates matter.
for line in df:
    tokenize(line, stopwords=STOPWORDS, clean_token=tokens, corpus=corpus)

In [77]:
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.metrics.pairwise import cosine_similarity

# cv = CountVectorizer()
# cv_mat = cv.fit_transform(df)
# cv_mat.todense() # change into dense

# # Create word matrix
# df_cv_words = pd.DataFrame(cv_mat.todense(),columns=cv.get_feature_names_out())
# df_cv_words

In [79]:
# Count-vectorize step: display the tweet Series that the later cells iterate over.

df

0                      KANYE 2024 https://t.co/Zm2pKcn12t
1                      I VOTED 🇺🇸 https://t.co/hlgIJUST4x
2                    KANYE2020 🇺🇸 https://t.co/3kd8vrrHZQ
3                               🕊 https://t.co/tFqpKyQzkY
4       The first vote of my life         We are here ...
                              ...                        
1662    try to avoid any contractual situation where y...
1663    You have to protect your ability to create at ...
1664    As a creative your ideas are your strongest fo...
1665    often people working with the existing conscio...
1666    Some people have to work within the existing c...
Name: tweet, Length: 1667, dtype: object

In [87]:
from collections import Counter
from scipy.sparse import csr_matrix

def create_vocab(corpus):
    """Map every distinct word in ``corpus`` to a column index.

    Words are de-duplicated and sorted, then numbered 0..n-1, so the indices
    are always contiguous and ``len(vocab)`` equals the number of columns
    needed, even if ``corpus`` contains repeated words.

    Args:
        corpus: Iterable of words.

    Returns:
        dict mapping each word to its position in the sorted, distinct words.
    """
    return {word: index for index, word in enumerate(sorted(set(corpus)))}

# Build the word -> column-index lookup from the accumulated corpus.
vocab = create_vocab(corpus=corpus)
row, col, val = [], [], []

# Words of this length or shorter are left out of the matrix.
MAX_SKIPPED_WORD_LENGTH = 2

# Main loop: collect (row, col, count) triplets, one row per tweet.
for idx, sentence in enumerate(df):

    # Pass fresh lists so this tweet's counts never include tokens from
    # earlier calls; `or []` guards against a tokenizer result of None.
    sentence_tokens = tokenize(sentence.lower(), clean_token=[], corpus=[]) or []

    for word, count in Counter(sentence_tokens).items():
        if len(word) <= MAX_SKIPPED_WORD_LENGTH:
            continue

        # dict.get returns None for a word missing from the vocabulary;
        # skip it instead of comparing None with an integer.
        col_index = vocab.get(word)
        if col_index is None:
            continue

        row.append(idx)
        col.append(col_index)
        val.append(count)

# Assemble the sparse triplets, then densify for display.
output_mat = csr_matrix((val, (row, col)), shape=(len(df), len(vocab))).toarray()
output_mat

array([[0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       ...,
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0],
       [0, 0, 0, ..., 0, 0, 0]])

In [88]:
def word_counter(corpus):
    """Return a plain dict mapping each item of ``corpus`` to its occurrence count."""
    tally = Counter()
    tally.update(corpus)
    return dict(tally)

# NOTE(review): `corpus` is filled by tokenize(), which only appends a token that
# is not already in the list, so every count shown here is 1. For real word
# frequencies the `tokens` list is presumably the intended input; confirm.
word_counter(corpus=corpus)       

{'11am': 1,
 '13th': 1,
 '15th': 1,
 '16th': 1,
 '18th': 1,
 '1st': 1,
 '2000s': 1,
 '2020vision': 1,
 '2020visuon': 1,
 '20s': 1,
 '21st': 1,
 '22nd': 1,
 '25th': 1,
 '2chainz': 1,
 '2chnz': 1,
 '2gthr': 1,
 '2mrw': 1,
 '2nd': 1,
 '2pm': 1,
 '350s': 1,
 '40th': 1,
 '4th': 1,
 '60s': 1,
 '700s': 1,
 '7pm': 1,
 '7th': 1,
 '8pm': 1,
 '8th': 1,
 '90s': 1,
 'a': 1,
 'aaaaaaaaa': 1,
 'aaaaand': 1,
 'aaaand': 1,
 'abfalecbaldwin': 1,
 'ability': 1,
 'able': 1,
 'abolish': 1,
 'aborted': 1,
 'aborting': 1,
 'about': 1,
 'absence': 1,
 'absolutely': 1,
 'ac': 1,
 'academy': 1,
 'accept': 1,
 'acceptable': 1,
 'access': 1,
 'accessories': 1,
 'accomplished': 1,
 'accountable': 1,
 'achievement': 1,
 'acknowledge': 1,
 'acknowledging': 1,
 'acres': 1,
 'across': 1,
 'act': 1,
 'acting': 1,
 'actions': 1,
 'activity': 1,
 'actors': 1,
 'acts': 1,
 'actual': 1,
 'actually': 1,
 'ad': 1,
 'added': 1,
 'addicted': 1,
 'addictive': 1,
 'adidas': 1,
 'admiration': 1,
 'advanced': 1,
 'advances': 1,
 '