In [1]:
import pandas as pd
import numpy as np
import re
import nltk

In [2]:
# Load the raw tweet export. The file is read without a header row, so the
# six column labels are attached right after reading.
column_labels = ['sentiment', 'id', 'date', 'query', 'user', 'tweet']
data = pd.read_csv(
    "../Data/Twitter.csv",
    header=None,
    encoding='latin-1',
)
data.columns = column_labels

# Preview the first rows
data.head()

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [3]:
# 'query' only ever contains 'NO_QUERY', so the column is removed
data = data.drop('query', axis=1)

# Recode the positive label 4 as 1 (no labels between 0 and 4 occur),
# which leaves 0 = negative and 1 = positive
data['sentiment'] = data['sentiment'].replace({4: 1})

data.head()

Unnamed: 0,sentiment,id,date,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."


In [4]:
# Rebuild a parseable timestamp from the fixed-width raw string, e.g.
# "Mon Apr 06 22:19:45 PDT 2009" -> "06/Apr/2009, 22:19:45".
# The weekday name (chars 0-2) and the timezone abbreviation (chars 20-22)
# are not used, so the parsed value is a naive timestamp.
raw_date = data['date']
data['date_new'] = pd.to_datetime(
    raw_date.str[8:10] + "/" + raw_date.str[4:7] + "/" + raw_date.str[24:28]
    + ", " + raw_date.str[11:19],
    format="%d/%b/%Y, %H:%M:%S",
)

# Derive the calendar features through the .dt accessor instead of building a
# new DatetimeIndex for every single feature
parsed_date = data['date_new']
data['year'] = parsed_date.dt.year
data['month'] = parsed_date.dt.month
data['weekday'] = parsed_date.dt.weekday  # 0 = Monday ... 6 = Sunday
data['time'] = parsed_date.dt.time
data['hour'] = parsed_date.dt.hour

# Sort the hour of day into six four-hour groups
hour = data['hour']
conditions = [
    hour < 4,
    (hour >= 4) & (hour < 8),
    (hour >= 8) & (hour < 12),
    (hour >= 12) & (hour < 16),
    (hour >= 16) & (hour < 20),
    hour >= 20,
]

# Label assigned for each condition (same order as `conditions`)
values = ['0-4', '4-8', '8-12', '12-16', '16-20', '20-24']

# An explicit string default is required: np.select's implicit default is the
# integer 0, which recent NumPy releases refuse to combine with string choices
# (TypeError). 'unknown' is only used for rows that match no condition.
data['time_group'] = np.select(conditions, values, default='unknown')

# Replace the raw date string by the parsed timestamp
data = data.drop(columns='date').rename(columns={'date_new': 'date'})

data.head()

Unnamed: 0,sentiment,id,user,tweet,date,year,month,weekday,time,hour,time_group
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",2009-04-06 22:19:45,2009,4,0,22:19:45,22,20-24
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,2009-04-06 22:19:49,2009,4,0,22:19:49,22,20-24
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,2009-04-06 22:19:53,2009,4,0,22:19:53,22,20-24
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....",2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24


In [5]:
# Number of whitespace-separated words per tweet
data['word_count'] = data['tweet'].str.split().str.len()

# Boolean flags telling whether the raw tweet contains a given marker.
# Plain markers are matched literally (regex=False), which avoids the invalid
# escape sequences ('\.', '\?', '\@', ...) that non-raw pattern strings need
# and that newer Python versions warn about.
tweet_text = data['tweet']
data['dot_dot_dot'] = tweet_text.str.contains('...', regex=False)
data['exclamation_mark'] = tweet_text.str.contains('!', regex=False)
data['question_mark'] = tweet_text.str.contains('?', regex=False)
data['at_symbol'] = tweet_text.str.contains('@', regex=False)
data['link'] = tweet_text.str.contains('http', regex=False)
# Any one of the three currency signs; inside a character class '$' is literal
data['money'] = tweet_text.str.contains(r'[$€£]')
data['paragraph_symbol'] = tweet_text.str.contains('§', regex=False)
data['hashtag'] = tweet_text.str.contains('#', regex=False)

data.head()

Unnamed: 0,sentiment,id,user,tweet,date,year,month,weekday,time,hour,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",2009-04-06 22:19:45,2009,4,0,22:19:45,22,20-24,19,False,False,False,True,True,False,False,False
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,2009-04-06 22:19:49,2009,4,0,22:19:49,22,20-24,21,True,True,False,False,False,False,False,False
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,2009-04-06 22:19:53,2009,4,0,22:19:53,22,20-24,18,False,False,False,True,False,False,False,False
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24,10,False,False,False,False,False,False,False,False
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....",2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24,21,False,False,True,True,False,False,False,False


In [6]:
# Patterns for the parts that are stripped from a tweet before tokenizing:
# links ('https://', 'http://', 'www.'), tags ('@...'), hashtags ('#...') and
# every character that is neither a word character nor whitespace.
url_pattern_1 = r'https?://\S+'
url_pattern_2 = r'www\.\S+'
tag_pattern = r'@\S+'
hashtag_pattern = r'#\S+'
punctuation_pattern = r'[^\w\s]'

# Apply the removals in a fixed order. regex=True is passed explicitly: since
# pandas 2.0 str.replace treats the pattern as a literal string by default,
# which would silently leave all punctuation in place.
cleaned_tweet = data['tweet']
for noise_pattern in (url_pattern_1, url_pattern_2, tag_pattern,
                      hashtag_pattern, punctuation_pattern):
    cleaned_tweet = cleaned_tweet.str.replace(noise_pattern, '', regex=True)

# Trim leading/trailing whitespace and lowercase everything
cleaned_tweet = cleaned_tweet.str.strip().str.lower()

# Store the cleaned text right after the raw 'tweet' column. At this point the
# column still holds one string per tweet; it is split into tokens later.
data.insert(4, 'tweet_tokenized', cleaned_tweet)

data.head()


  data['tweet_tokenized'] = data['tweet_tokenized'].str.replace('[^\w\s]', '')


Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",awww thats a bummer you shoulda got david car...,2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,False,False,False,True,True,False,False,False
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he cant update his facebook by t...,2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,True,True,False,False,False,False,False,False
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball managed to sav...,2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,False,False,False,True,False,False,False,False
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,False,False,False,False,False,False,False,False
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....",no its not behaving at all im mad why am i her...,2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,False,False,True,True,False,False,False,False


Ich habe jetzt mal alle Satzzeichen entfernt, inklusive des Apostrophs (').

Dadurch sind Formen wie I'm und can't nicht mehr korrekt (sie werden zu im und cant). Ich bin mir aber nicht sicher, ob das so richtig ist.

Wenn wir die Apostrophe drin lassen wollen, müssen wir den Code zum Entfernen der Satzzeichen einfach so schreiben:

data['tweet_tokenized'] = data['tweet_tokenized'].str.replace(r"[^\w\s']", '', regex=True)

In [7]:
#Tokenize sentences based on non-alphanumeric characters
def tokenize(text):
    """Split ``text`` into a list of word tokens.

    The text is split on every run of non-word characters (regex ``\\W+``).
    Empty strings, which ``re.split`` produces for empty input or when the
    text starts or ends with a separator, are dropped, so an empty text
    yields an empty list instead of ``['']``.

    Args:
        text: The string to tokenize.

    Returns:
        List of non-empty tokens in their original order.
    """
    return [token for token in re.split(r'\W+', text) if token]

# Replace every cleaned tweet string by its list of tokens
data['tweet_tokenized'] = data['tweet_tokenized'].apply(tokenize)

data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, thats, a, bummer, you, shoulda, got, da...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,False,False,False,True,True,False,False,False
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, cant, update, his, faceb...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,True,True,False,False,False,False,False,False
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, manage...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,False,False,False,True,False,False,False,False
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,False,False,False,False,False,False,False,False
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[no, its, not, behaving, at, all, im, mad, why...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,False,False,True,True,False,False,False,False


In [8]:
# Build the list of English stopwords from NLTK. If the 'stopwords' corpus is
# not installed yet, it is downloaded once and the lookup is retried, so the
# cell also runs in a fresh environment without editing the code.
try:
    stopword = nltk.corpus.stopwords.words('english')
except LookupError:
    nltk.download('stopwords')
    stopword = nltk.corpus.stopwords.words('english')
stopword

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [9]:
# Hash-based copy of the stopword list: membership checks against a list scan
# the whole list for every token, a frozenset answers them in constant time.
_stopword_lookup = frozenset(stopword)

def remove_stopwords(tokenized_list):
    """Return the tokens of ``tokenized_list`` that are not stopwords.

    Args:
        tokenized_list: Iterable of tokens belonging to one tweet.

    Returns:
        New list with the remaining tokens in their original order.
    """
    return [word for word in tokenized_list if word not in _stopword_lookup]

# Keep the stopword-free token lists in a column of their own
data['tweet_no_stop'] = data['tweet_tokenized'].apply(remove_stopwords)
data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag,tweet_no_stop
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, thats, a, bummer, you, shoulda, got, da...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,19,False,False,False,True,True,False,False,False,"[awww, thats, bummer, shoulda, got, david, car..."
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, cant, update, his, faceb...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,21,True,True,False,False,False,False,False,False,"[upset, cant, update, facebook, texting, might..."
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, manage...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,18,False,False,False,True,False,False,False,False,"[dived, many, times, ball, managed, save, 50, ..."
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,10,False,False,False,False,False,False,False,False,"[whole, body, feels, itchy, like, fire]"
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[no, its, not, behaving, at, all, im, mad, why...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,21,False,False,True,True,False,False,False,False,"[behaving, im, mad, cant, see]"


In [10]:
# Needs the NLTK 'wordnet' corpus; run nltk.download('wordnet') once if it is missing
wnlemm = nltk.WordNetLemmatizer()

def lemmatizing(tokenized_text):
    """Lemmatize every token of one tweet.

    Each token is passed to ``wnlemm.lemmatize`` with its default arguments.

    Args:
        tokenized_text: Iterable of tokens (here: the stopword-free tokens).

    Returns:
        List with the lemmatized tokens in the same order.
    """
    return list(map(wnlemm.lemmatize, tokenized_text))

# Lemmatize the stopword-free token lists and store them in a new column
data['tweet_lemmatized'] = data['tweet_no_stop'].apply(lemmatizing)
data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag,tweet_no_stop,tweet_lemmatized
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, thats, a, bummer, you, shoulda, got, da...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,False,False,False,True,True,False,False,False,"[awww, thats, bummer, shoulda, got, david, car...","[awww, thats, bummer, shoulda, got, david, car..."
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, cant, update, his, faceb...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,True,True,False,False,False,False,False,False,"[upset, cant, update, facebook, texting, might...","[upset, cant, update, facebook, texting, might..."
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, manage...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,False,False,False,True,False,False,False,False,"[dived, many, times, ball, managed, save, 50, ...","[dived, many, time, ball, managed, save, 50, r..."
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,False,False,False,False,False,False,False,False,"[whole, body, feels, itchy, like, fire]","[whole, body, feel, itchy, like, fire]"
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[no, its, not, behaving, at, all, im, mad, why...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,False,False,True,True,False,False,False,False,"[behaving, im, mad, cant, see]","[behaving, im, mad, cant, see]"


Vectorization

Uni/Bigram

In [16]:
# The n-gram and tf-idf vectorizers expect one string per tweet, so every
# token list is glued back together with single spaces
data['tweet_lemm_string'] = data['tweet_lemmatized'].apply(' '.join)
data

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag,tweet_no_stop,tweet_lemmatized,tweet_lemm_string
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[awww, thats, a, bummer, you, shoulda, got, da...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,False,False,True,True,False,False,False,"[awww, thats, bummer, shoulda, got, david, car...","[awww, thats, bummer, shoulda, got, david, car...",awww thats bummer shoulda got david carr third...
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, cant, update, his, faceb...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,True,False,False,False,False,False,False,"[upset, cant, update, facebook, texting, might...","[upset, cant, update, facebook, texting, might...",upset cant update facebook texting might cry r...
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, manage...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,False,False,True,False,False,False,False,"[dived, many, times, ball, managed, save, 50, ...","[dived, many, time, ball, managed, save, 50, r...",dived many time ball managed save 50 rest go b...
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,False,False,False,False,False,False,False,"[whole, body, feels, itchy, like, fire]","[whole, body, feel, itchy, like, fire]",whole body feel itchy like fire
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[no, its, not, behaving, at, all, im, mad, why...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,False,True,True,False,False,False,False,"[behaving, im, mad, cant, see]","[behaving, im, mad, cant, see]",behaving im mad cant see
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,1,2193601966,AmandaMarie1028,Just woke up. Having no school is the best fee...,"[just, woke, up, having, no, school, is, the, ...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,False,False,False,False,False,False,False,"[woke, school, best, feeling, ever]","[woke, school, best, feeling, ever]",woke school best feeling ever
1599996,1,2193601969,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,"[thewdbcom, very, cool, to, hear, old, walt, i...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,True,False,False,True,False,False,False,"[thewdbcom, cool, hear, old, walt, interviews, â]","[thewdbcom, cool, hear, old, walt, interview, â]",thewdbcom cool hear old walt interview â
1599997,1,2193601991,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,"[are, you, ready, for, your, mojo, makeover, a...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,False,True,False,False,False,False,False,"[ready, mojo, makeover, ask, details]","[ready, mojo, makeover, ask, detail]",ready mojo makeover ask detail
1599998,1,2193602064,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, to, my, boo, of, alll,...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,True,False,False,False,False,False,False,"[happy, 38th, birthday, boo, alll, time, tupac...","[happy, 38th, birthday, boo, alll, time, tupac...",happy 38th birthday boo alll time tupac amaru ...


In [31]:
from sklearn.feature_extraction.text import CountVectorizer

# Vectorizer that counts unigrams and bigrams
ngram_vect = CountVectorizer(ngram_range=(1, 2))

# Learn the vocabulary and build the count matrix (one row per tweet, one
# column per unigram/bigram)
tweet_corpus = data['tweet_lemm_string']
ngram_matrix = ngram_vect.fit_transform(tweet_corpus)

# Shape of the matrix
print(ngram_matrix.shape)


(1600000, 4494693)


In [36]:
# Show the sparse-matrix summary (dimensions, dtype, number of stored elements)
ngram_matrix

<1600000x4494693 sparse matrix of type '<class 'numpy.int64'>'
	with 20869914 stored elements in Compressed Sparse Row format>

In [41]:
# Inspect the top-left 10 x 10 corner of the n-gram matrix as a dense,
# labelled table to get a feeling for its layout
ngram_matrix_sample = ngram_matrix[:10, :10].toarray()
feature_names = ngram_vect.get_feature_names_out()[:10]
ngram_matrix_sample_df = pd.DataFrame(data=ngram_matrix_sample, columns=feature_names)
print(ngram_matrix_sample_df)

   00  00 10  00 agent  00 amp  00 baby  00 disappointed  00 draw  \
0   0      0         0       0        0                0        0   
1   0      0         0       0        0                0        0   
2   0      0         0       0        0                0        0   
3   0      0         0       0        0                0        0   
4   0      0         0       0        0                0        0   
5   0      0         0       0        0                0        0   
6   0      0         0       0        0                0        0   
7   0      0         0       0        0                0        0   
8   0      0         0       0        0                0        0   
9   0      0         0       0        0                0        0   

   00 exception  00 follower  00 good  
0             0            0        0  
1             0            0        0  
2             0            0        0  
3             0            0        0  
4             0            0        0  


Vielleicht sollten wir auch Zahlen aus unserem Datensatz entfernen, da Tokens wie '00' sonst als Features im Vokabular landen.

In [32]:
# List the learned vocabulary; the printed output is abbreviated to its first and last entries
print(ngram_vect.get_feature_names_out()) #Print the names of the unigrams and bigrams


['00' '00 10' '00 agent' ... 'ûúøøù øøªûúù' 'ûúù' 'ûúù ù¾øøø³øù']


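Wir haben anscheinend noch seltsame Zeichen in unseren Tweets, wie die hier am Ende der Ausgabe (z. B. 'ûúù'). Vermutlich sind das falsch dekodierte Nicht-ASCII-Zeichen (Mojibake), weil die CSV-Datei mit encoding='latin-1' eingelesen wird – das sollten wir noch prüfen.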

Man müsste mal schauen, ob sich die verschiedenen Vektorisierungsansätze direkt in eine Pipeline einbauen lassen, damit man sie einfach vergleichen kann. Ansonsten muss man hier die Features aus unserem Datensatz an die Matrix joinen oder umgekehrt.

Tfidf

In [50]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorizer that weighs the count of each token by its frequency in the dataset
tfidf_vect = TfidfVectorizer()

# Learn the vocabulary and build the matrix with the tf-idf score of each
# token in each tweet
tweet_corpus = data['tweet_lemm_string']
tfidf_matrix = tfidf_vect.fit_transform(tweet_corpus)

# Shape of the matrix, followed by the learned vocabulary
print(tfidf_matrix.shape)
print(tfidf_vect.get_feature_names_out())

(1600000, 428706)
['00' '000' '0000' ... 'ûøùøø³øªù' 'ûúøøù' 'ûúù']


In [51]:
# Inspect the top-left 10 x 10 corner of the tf-idf matrix as a dense,
# labelled table to get a feeling for its layout
tfidf_matrix_sample = tfidf_matrix[:10, :10].toarray()
feature_names = tfidf_vect.get_feature_names_out()[:10]
tfidf_matrix_sample_df = pd.DataFrame(data=tfidf_matrix_sample, columns=feature_names)
print(tfidf_matrix_sample_df)

    00  000  0000  00000  0000000000  00000001  0000001  0000014  \
0  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
1  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
2  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
3  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
4  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
5  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
6  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
7  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
8  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   
9  0.0  0.0   0.0    0.0         0.0       0.0      0.0      0.0   

   000009260gbs  0000abcd  
0           0.0       0.0  
1           0.0       0.0  
2           0.0       0.0  
3           0.0       0.0  
4           0.0       0.0  
5           0.0       0.0  
6           0.0       0.0  
7           0.0       0.0  