In [7]:
%pip install nltk

Note: you may need to restart the kernel to use updated packages.


In [8]:
import pandas as pd
import numpy as np
import re
import nltk
from emot.emo_unicode import UNICODE_EMOJI, EMOTICONS_EMO

In [9]:
#Read csv file and name columns
data = pd.read_csv("../Data/Twitter.csv", header=None, encoding='latin-1')
data.columns = ['sentiment', 'id', 'date', 'query', 'user', 'tweet']

data.head()

Unnamed: 0,sentiment,id,date,query,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,NO_QUERY,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,NO_QUERY,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,NO_QUERY,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,NO_QUERY,Karoli,"@nationwideclass no, it's not behaving at all...."


In [10]:
#Drop the column 'query', as it only contains 'NO_QUERY'
data = data.drop(columns= 'query')

#Replace the 4 for a positive sentiment with a 1 for easier understanding (there are no numbers between 0 and 4)
data['sentiment'] = data['sentiment'].replace(4, 1)
#0 = negative, 1 = positive

data.head()

Unnamed: 0,sentiment,id,date,user,tweet
0,0,1467810369,Mon Apr 06 22:19:45 PDT 2009,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t..."
1,0,1467810672,Mon Apr 06 22:19:49 PDT 2009,scotthamilton,is upset that he can't update his Facebook by ...
2,0,1467810917,Mon Apr 06 22:19:53 PDT 2009,mattycus,@Kenichan I dived many times for the ball. Man...
3,0,1467811184,Mon Apr 06 22:19:57 PDT 2009,ElleCTF,my whole body feels itchy and like its on fire
4,0,1467811193,Mon Apr 06 22:19:57 PDT 2009,Karoli,"@nationwideclass no, it's not behaving at all...."


In [11]:
#Convert Date and time column into datetime format (By stripping day, month, year and time manually as strings and passing them into datetime function)
data['date_new'] = data['date'].str[8:10] + "/" + data['date'].str[4:7] + "/" + data['date'].str[24:28] + ", " + data['date'].str[11:19]
data['date_new'] = pd.to_datetime(data['date_new'], format="%d/%b/%Y, %H:%M:%S")

#extract DateTime information from column
data['year'] = pd.DatetimeIndex(data['date_new']).year
data['month'] = pd.DatetimeIndex(data['date_new']).month

#Weekday where 0 = Monday and 6 = Sunday
data['weekday'] = pd.DatetimeIndex(data['date_new']).weekday
data['time'] = pd.DatetimeIndex(data['date_new']).time
data['hour'] = pd.DatetimeIndex(data['date_new']).hour

# Extract Timezones
data['date'] = data['date'].astype('string')
data['timezone'] = data['date'].str[20:23]
data.head()

##sort time into groups
#create list of conditions (time groups)
conditions = [
    (data['hour'] < 4),
    (data['hour'] >= 4) & (data['hour'] < 8),
    (data['hour'] >= 8) & (data['hour'] < 12),
    (data['hour'] >= 12) & (data['hour'] < 16),
    (data['hour'] >= 16) & (data['hour'] < 20),
    (data['hour'] >= 20)
    ]

# create a list of the values we want to assign for each condition
values = ['0-4', '4-8', '8-12', '12-16', '16-20', '20-24']

# create a new column and use np.select to assign values to it using our lists as arguments
data['time_group'] = np.select(conditions, values)

#drop old date column
data.drop(['date', 'timezone'], axis = 1, inplace=True)
data.rename(columns={'date_new': 'date'}, inplace=True)

data.head()

Unnamed: 0,sentiment,id,user,tweet,date,year,month,weekday,time,hour,time_group
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",2009-04-06 22:19:45,2009,4,0,22:19:45,22,20-24
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,2009-04-06 22:19:49,2009,4,0,22:19:49,22,20-24
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,2009-04-06 22:19:53,2009,4,0,22:19:53,22,20-24
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....",2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24


In [12]:
#Count the number of words per tweet
data['word_count'] = data['tweet'].str.split().str.len()

#Check, if certain special characters occur in a tweet (one-hot encoded)
data['dot_dot_dot'] = data['tweet'].str.contains('\.\.\.')
data['exclamation_mark'] = data['tweet'].str.contains('!')
data['question_mark'] = data['tweet'].str.contains('\?')
data['at_symbol'] = data['tweet'].str.contains('\@')
data['link'] = data['tweet'].str.contains('http')
data['money'] = data['tweet'].str.contains('\$|\€|\£')
data['paragraph_symbol'] = data['tweet'].str.contains('\§')
data['hashtag'] = data['tweet'].str.contains('#')

data.head()

Unnamed: 0,sentiment,id,user,tweet,date,year,month,weekday,time,hour,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...",2009-04-06 22:19:45,2009,4,0,22:19:45,22,20-24,19,False,False,False,True,True,False,False,False
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,2009-04-06 22:19:49,2009,4,0,22:19:49,22,20-24,21,True,True,False,False,False,False,False,False
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,2009-04-06 22:19:53,2009,4,0,22:19:53,22,20-24,18,False,False,False,True,False,False,False,False
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24,10,False,False,False,False,False,False,False,False
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....",2009-04-06 22:19:57,2009,4,0,22:19:57,22,20-24,21,False,False,True,True,False,False,False,False


In [13]:
convert_dict = {'dot_dot_dot': 'int64',
                'exclamation_mark': 'int64',
                'question_mark': 'int64',
                'at_symbol': 'int64',
                'link': 'int64',
                'money': 'int64',
                'paragraph_symbol': 'int64',
                'hashtag': 'int64'
                }
data = data.astype(convert_dict)

In [14]:
#nltk.download('stopwords')
stopword = nltk.corpus.stopwords.words('english') #Create a list of english stopwords from nltk
stopword

['i',
 'me',
 'my',
 'myself',
 'we',
 'our',
 'ours',
 'ourselves',
 'you',
 "you're",
 "you've",
 "you'll",
 "you'd",
 'your',
 'yours',
 'yourself',
 'yourselves',
 'he',
 'him',
 'his',
 'himself',
 'she',
 "she's",
 'her',
 'hers',
 'herself',
 'it',
 "it's",
 'its',
 'itself',
 'they',
 'them',
 'their',
 'theirs',
 'themselves',
 'what',
 'which',
 'who',
 'whom',
 'this',
 'that',
 "that'll",
 'these',
 'those',
 'am',
 'is',
 'are',
 'was',
 'were',
 'be',
 'been',
 'being',
 'have',
 'has',
 'had',
 'having',
 'do',
 'does',
 'did',
 'doing',
 'a',
 'an',
 'the',
 'and',
 'but',
 'if',
 'or',
 'because',
 'as',
 'until',
 'while',
 'of',
 'at',
 'by',
 'for',
 'with',
 'about',
 'against',
 'between',
 'into',
 'through',
 'during',
 'before',
 'after',
 'above',
 'below',
 'to',
 'from',
 'up',
 'down',
 'in',
 'out',
 'on',
 'off',
 'over',
 'under',
 'again',
 'further',
 'then',
 'once',
 'here',
 'there',
 'when',
 'where',
 'why',
 'how',
 'all',
 'any',
 'both',
 'each

In [15]:
stopword_clean = [word.replace("'", "") for word in stopword]

In [16]:
#Define the typical pattern of links ('https://', 'http://', 'www.'), tags ('@') and hashtags ('#')
url_pattern_1 = r'https?://\S+'
url_pattern_2 = r'www\.\S+'
tag_pattern = r'@\S+'
hashtag_pattern = r'#\S+'

#Add a new column for the tokenized tweet and remove all links, tags and hashtags from the tweets
data.insert(4, 'tweet_tokenized', data['tweet'].apply(lambda x: re.sub(url_pattern_1, '', x)))
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: re.sub(url_pattern_2, '', x))
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: re.sub(tag_pattern, '', x))
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: re.sub(hashtag_pattern, '', x))


In [17]:
#Handling Emojis
# Function for converting emojis into word
def convert_emoticons(text):
    for emot in EMOTICONS_EMO:
        text = text.replace(emot, "".join(EMOTICONS_EMO[emot].replace(",","").replace(":","").split()))
    return text

# Apply Formula to tweets
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: convert_emoticons(x))
data.head(6)

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- Awww, that's a bummer. You shoulda got Da...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,0,0,0,1,1,0,0,0
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his Facebook by ...,2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,1,1,0,0,0,0,0,0
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,I dived many times for the ball. Managed to s...,2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,0,0,0,1,0,0,0,0
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,0,0,0,0,0,0,0,0
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,0,0,1,1,0,0,0,0
5,0,1467811372,joy_wolf,@Kwesidei not the whole crew,not the whole crew,2009-04-06 22:20:00,2009,4,0,22:20:00,...,20-24,5,0,0,0,1,0,0,0,0


In [18]:
#Remove all punctuation from the tweets
data['tweet_tokenized'] = data['tweet_tokenized'].str.replace('[^\w\s]', '')
data['tweet_tokenized'] = data['tweet_tokenized'].str.replace('_', '')

#Remove all whitespaces from the beginning or end of the tweets
data['tweet_tokenized'] = data['tweet_tokenized'].str.strip()

#Set all characters to lowercase
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: x.lower())

#Remove all numbers
data['tweet_tokenized'] = data['tweet_tokenized'].str.replace('\d+', '')

data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","- awww, that's a bummer. you shoulda got davi...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,0,0,0,1,1,0,0,0
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,is upset that he can't update his facebook by ...,2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,1,1,0,0,0,0,0,0
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,i dived many times for the ball. managed to sa...,2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,0,0,0,1,0,0,0,0
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,my whole body feels itchy and like its on fire,2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,0,0,0,0,0,0,0,0
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","no, it's not behaving at all. i'm mad. why am ...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,0,0,1,1,0,0,0,0


In [19]:
#Tokenize sentences based on non-alphanumeric characters (Leerstelle)
def tokenize(text):
    tokens = re.split('\W+', text)
    return tokens

data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: tokenize(x))

data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[, awww, that, s, a, bummer, you, shoulda, got...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,0,0,0,1,1,0,0,0
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[is, upset, that, he, can, t, update, his, fac...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,1,1,0,0,0,0,0,0
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[i, dived, many, times, for, the, ball, manage...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,0,0,0,1,0,0,0,0
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[my, whole, body, feels, itchy, and, like, its...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,0,0,0,0,0,0,0,0
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[no, it, s, not, behaving, at, all, i, m, mad,...",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,0,0,1,1,0,0,0,0


In [20]:
def remove_stopwords(tokenized_list): #Function to remove all stopword from our list of tokenized tweets
    text = [word for word in tokenized_list if word not in stopword_clean] #Write each word from our tokenized list into a new list, if it is not in the stopword list
    return text

#Create new column with tokenized tweets without stopwords
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: remove_stopwords(x)) 
data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[, awww, bummer, shoulda, got, david, carr, th...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,0,0,0,1,1,0,0,0
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, update, facebook, texting, might, cry,...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,1,1,0,0,0,0,0,0
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[dived, many, times, ball, managed, save, 50, ...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,0,0,0,1,0,0,0,0
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[whole, body, feels, itchy, like, fire]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,0,0,0,0,0,0,0,0
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[behaving, mad, see, ]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,0,0,1,1,0,0,0,0


In [21]:
#nltk.download('wordnet')
wnlemm = nltk.WordNetLemmatizer() 

def lemmatizing(tokenized_text): #Function to lemmatize all words in our tokenized tweets list without stopwords
    text = [wnlemm.lemmatize(word) for word in tokenized_text] #Lemmatize each word in our tokenized list and write it into a new list
    return text

#Create new column with lemmatized tweets from our tokenized tweets without stopwords
data['tweet_tokenized'] = data['tweet_tokenized'].apply(lambda x: lemmatizing(x))
data.head()

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,time_group,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[, awww, bummer, shoulda, got, david, carr, th...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,20-24,19,0,0,0,1,1,0,0,0
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, update, facebook, texting, might, cry,...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,20-24,21,1,1,0,0,0,0,0,0
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[dived, many, time, ball, managed, save, 50, r...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,20-24,18,0,0,0,1,0,0,0,0
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,10,0,0,0,0,0,0,0,0
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[behaving, mad, see, ]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,20-24,21,0,0,1,1,0,0,0,0


Cleaning

In [22]:
#N-gram vectorizing and tfidf need a list of strings passed to it, so we need to convert our list
data['tweet_tokenized_string'] = data['tweet_tokenized'].apply(lambda x: ' '.join(x)) #Join each word in our list with a space inbetween
data

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag,tweet_tokenized_string
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[, awww, bummer, shoulda, got, david, carr, th...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,19,0,0,0,1,1,0,0,0,awww bummer shoulda got david carr third day ...
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, update, facebook, texting, might, cry,...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,21,1,1,0,0,0,0,0,0,upset update facebook texting might cry result...
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[dived, many, time, ball, managed, save, 50, r...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,18,0,0,0,1,0,0,0,0,dived many time ball managed save 50 rest go b...
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,10,0,0,0,0,0,0,0,0,whole body feel itchy like fire
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[behaving, mad, see, ]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,21,0,0,1,1,0,0,0,0,behaving mad see
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,1,2193601966,AmandaMarie1028,Just woke up. Having no school is the best fee...,"[woke, school, best, feeling, ever]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,0,0,0,0,0,0,0,woke school best feeling ever
1599996,1,2193601969,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,"[thewdb, com, cool, hear, old, walt, interview...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,1,0,0,1,0,0,0,thewdb com cool hear old walt interview â
1599997,1,2193601991,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,"[ready, mojo, makeover, ask, detail]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,0,1,0,0,0,0,0,ready mojo makeover ask detail
1599998,1,2193602064,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, boo, alll, time, tupac...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,12,0,1,0,0,0,0,0,0,happy 38th birthday boo alll time tupac amaru ...


In [23]:
words = pd.DataFrame(data.tweet_tokenized_string.str.split(expand=True).stack().value_counts()).reset_index()
words.rename(columns={'index': 'word', 0: 'count'}, inplace=True)

In [24]:
words_to_split = words[words['count'] < 5]
words_to_split

Unnamed: 0,word,count
48626,hih,4
48627,euna,4
48628,deat,4
48629,hwz,4
48630,anyfin,4
...,...,...
269068,agingforwards,1
269069,fromme,1
269070,numbre,1
269071,deppresing,1


In [25]:
words_to_split_list = words_to_split['word'].tolist()
words_to_split_list

['hih',
 'euna',
 'deat',
 'hwz',
 'anyfin',
 'launchpad',
 'dlt',
 'kelci',
 'vapid',
 'earmuff',
 'itched',
 'berserker',
 'rumi',
 'commish',
 'napper',
 'crank2',
 'wonderul',
 'healthily',
 'compras',
 'sholat',
 'pyscho',
 'engish',
 '5hour',
 'ohgosh',
 'unles',
 'stevenage',
 'christinas',
 'relaxxx',
 'nlang',
 '2late',
 'homieee',
 'histology',
 'chlamydia',
 'nawws',
 'bergman',
 'gracia',
 'stumptown',
 'reverend',
 'sabes',
 'quotwinkorsmirkat',
 'fufilled',
 'trc',
 'ilysfm',
 'danelle',
 'crusted',
 'enacting',
 'conect',
 'goede',
 'fucksticks',
 'jamacia',
 'oowwww',
 'lovvveeee',
 'thinx',
 'sueno',
 'dufferin',
 'masonic',
 'pressin',
 '½we',
 'bennington',
 'masina',
 'yiayia',
 'j1',
 'urrrrgh',
 'followill',
 'monavie',
 'alices',
 'klatch',
 'demoscene',
 'abscence',
 'dona',
 'ohhkay',
 'bottleneck',
 'doohickey',
 'fugged',
 'yuppy',
 'oneal',
 'muwah',
 'souped',
 'jal',
 'scoreeeee',
 'qosh',
 'papou',
 'ohmahgah',
 '½ä',
 'condoning',
 'speculate',
 'nesn',


In [26]:
single_use_words = set(words_to_split_list) # Convert your list to a set for faster lookup

def remove_single_use_words(tweet):
    return [word for word in tweet if word not in single_use_words]

data['tweet_tokenized'] = data['tweet_tokenized'].apply(remove_single_use_words)

In [27]:
#N-gram vectorizing and tfidf need a list of strings passed to it, so we need to convert our list
data['tweet_tokenized_string'] = data['tweet_tokenized'].apply(lambda x: ' '.join(x)) #Join each word in our list with a space inbetween
data

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag,tweet_tokenized_string
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[, awww, bummer, shoulda, got, david, carr, th...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,19,0,0,0,1,1,0,0,0,awww bummer shoulda got david carr third day ...
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, update, facebook, texting, might, cry,...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,21,1,1,0,0,0,0,0,0,upset update facebook texting might cry result...
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[many, time, ball, managed, save, 50, rest, go...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,18,0,0,0,1,0,0,0,0,many time ball managed save 50 rest go bound
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,10,0,0,0,0,0,0,0,0,whole body feel itchy like fire
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[behaving, mad, see, ]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,21,0,0,1,1,0,0,0,0,behaving mad see
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1599995,1,2193601966,AmandaMarie1028,Just woke up. Having no school is the best fee...,"[woke, school, best, feeling, ever]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,0,0,0,0,0,0,0,woke school best feeling ever
1599996,1,2193601969,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,"[com, cool, hear, old, walt, interview, â, ]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,1,0,0,1,0,0,0,com cool hear old walt interview â
1599997,1,2193601991,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,"[ready, mojo, makeover, ask, detail]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,0,1,0,0,0,0,0,ready mojo makeover ask detail
1599998,1,2193602064,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, boo, alll, time, tupac...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,12,0,1,0,0,0,0,0,0,happy 38th birthday boo alll time tupac shakur


In [28]:
#Wenn wir wollen, können wir diese Tweets alle herausfiltern (Dafür einfach das False in True umwandeln)
data = data[data['tweet_tokenized_string'].str.contains('[a-zA-Z]')==True] # Alle Tweet herausfiltern die nur nicht normale Buchstaben enthalten

In [29]:
# Index löschen um ihn neu zu nummerieren da sich die Anzahl der Tweets geändert hat
data = data.reset_index()
data = data.drop(columns=['index'])
data

Unnamed: 0,sentiment,id,user,tweet,tweet_tokenized,date,year,month,weekday,time,...,word_count,dot_dot_dot,exclamation_mark,question_mark,at_symbol,link,money,paragraph_symbol,hashtag,tweet_tokenized_string
0,0,1467810369,_TheSpecialOne_,"@switchfoot http://twitpic.com/2y1zl - Awww, t...","[, awww, bummer, shoulda, got, david, carr, th...",2009-04-06 22:19:45,2009,4,0,22:19:45,...,19,0,0,0,1,1,0,0,0,awww bummer shoulda got david carr third day ...
1,0,1467810672,scotthamilton,is upset that he can't update his Facebook by ...,"[upset, update, facebook, texting, might, cry,...",2009-04-06 22:19:49,2009,4,0,22:19:49,...,21,1,1,0,0,0,0,0,0,upset update facebook texting might cry result...
2,0,1467810917,mattycus,@Kenichan I dived many times for the ball. Man...,"[many, time, ball, managed, save, 50, rest, go...",2009-04-06 22:19:53,2009,4,0,22:19:53,...,18,0,0,0,1,0,0,0,0,many time ball managed save 50 rest go bound
3,0,1467811184,ElleCTF,my whole body feels itchy and like its on fire,"[whole, body, feel, itchy, like, fire]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,10,0,0,0,0,0,0,0,0,whole body feel itchy like fire
4,0,1467811193,Karoli,"@nationwideclass no, it's not behaving at all....","[behaving, mad, see, ]",2009-04-06 22:19:57,2009,4,0,22:19:57,...,21,0,0,1,1,0,0,0,0,behaving mad see
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1588280,1,2193601966,AmandaMarie1028,Just woke up. Having no school is the best fee...,"[woke, school, best, feeling, ever]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,0,0,0,0,0,0,0,woke school best feeling ever
1588281,1,2193601969,TheWDBoards,TheWDB.com - Very cool to hear old Walt interv...,"[com, cool, hear, old, walt, interview, â, ]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,1,0,0,1,0,0,0,com cool hear old walt interview â
1588282,1,2193601991,bpbabe,Are you ready for your MoJo Makeover? Ask me f...,"[ready, mojo, makeover, ask, detail]",2009-06-16 08:40:49,2009,6,1,08:40:49,...,11,0,0,1,0,0,0,0,0,ready mojo makeover ask detail
1588283,1,2193602064,tinydiamondz,Happy 38th Birthday to my boo of alll time!!! ...,"[happy, 38th, birthday, boo, alll, time, tupac...",2009-06-16 08:40:49,2009,6,1,08:40:49,...,12,0,1,0,0,0,0,0,0,happy 38th birthday boo alll time tupac shakur


In [30]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1588285 entries, 0 to 1588284
Data columns (total 22 columns):
 #   Column                  Non-Null Count    Dtype         
---  ------                  --------------    -----         
 0   sentiment               1588285 non-null  int64         
 1   id                      1588285 non-null  int64         
 2   user                    1588285 non-null  object        
 3   tweet                   1588285 non-null  object        
 4   tweet_tokenized         1588285 non-null  object        
 5   date                    1588285 non-null  datetime64[ns]
 6   year                    1588285 non-null  int32         
 7   month                   1588285 non-null  int32         
 8   weekday                 1588285 non-null  int32         
 9   time                    1588285 non-null  object        
 10  hour                    1588285 non-null  int32         
 11  time_group              1588285 non-null  object        
 12  word_count    

In [31]:
data.drop(columns=['id', 'user', 'tweet', 'tweet_tokenized', 'date', 'year', 'month', 'time', 'time_group'], axis = 1, inplace = True)

In [28]:
""" data.to_pickle('data_cleaned') """

" data.to_pickle('data_cleaned') "

In [None]:
#%pip install keras

In [None]:
#%pip install tensorflow

In [32]:
#Split data in Dependent and indepentent variable
X = data.drop(columns=['sentiment'], axis = 1)
y= data['sentiment']

In [36]:
%pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.2.2-cp39-cp39-win_amd64.whl (8.4 MB)
Collecting threadpoolctl>=2.0.0 (from scikit-learn)
  Using cached threadpoolctl-3.1.0-py3-none-any.whl (14 kB)
Installing collected packages: threadpoolctl, scikit-learn
Successfully installed scikit-learn-1.2.2 threadpoolctl-3.1.0
Note: you may need to restart the kernel to use updated packages.


In [37]:
#Train/Test Split into 3 sets (Training, Validation and Test)
from sklearn.model_selection import train_test_split

X_train, X_validation_and_test, y_train, y_validation_and_test = train_test_split(X, y, test_size=.1, random_state=42)
X_validation, X_test, y_validation, y_test = train_test_split(X_validation_and_test, y_validation_and_test, test_size=.1, random_state = 42)

In [38]:
#See if the different test sets are evenly split between positive and negative

print ("Train set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_train),
    (len(X_train[y_train == 0]) / (len(X_train)*1.))*100,
    (len(X_train[y_train == 1]) / (len(X_train)*1.))*100))
print("Validation set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_validation),
    (len(X_validation[y_validation == 0]) / (len(X_validation)*1.))*100,
    (len(X_validation[y_validation == 1]) / (len(X_validation)*1.))*100))
print("Test set has total {0} entries with {1:.2f}% negative, {2:.2f}% positive".format(len(X_test),
    (len(X_test[y_test == 0]) / (len(X_test)*1.))*100,
    (len(X_test[y_test == 1]) / (len(X_test)*1.))*100))

Train set has total 1429456 entries with 50.05% negative, 49.95% positive
Validation set has total 142946 entries with 50.07% negative, 49.93% positive
Test set has total 15883 entries with 48.84% negative, 51.16% positive


In [39]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import confusion_matrix

def accuracy_summary(pipeline, x_train, y_train, x_test, y_test):

    sentiment_fit = pipeline.fit(x_train, y_train)
    y_pred = sentiment_fit.predict(x_test)
    accuracy = accuracy_score(y_test, y_pred)
    print("accuracy score: {0:.2f}%".format(accuracy*100))
    print("-"*80)
    return accuracy

In [37]:
#Add additional features to matrix build from Tweet info

""" import scipy
from scipy.sparse import hstack

additional_features_train = X_train[['weekday',
                            'hour',
                            'word_count',
                            'dot_dot_dot',
                            'exclamation_mark',
                            'question_mark',
                            'at_symbol',
                            'link',
                            'money',
                            'paragraph_symbol',
                            'hashtag'
                            ]]

# We convert the additional_features dataframe to a sparse matrix
additional_features_train_matrix = scipy.sparse.csr_matrix(additional_features_train.values)

# Then we horizontally stack the tf-idf matrix with the additional features
X_train_matrix = hstack([X_train_vect_matrix, additional_features_train_matrix]) """

" import scipy\nfrom scipy.sparse import hstack\n\nadditional_features_train = X_train[['weekday',\n                            'hour',\n                            'word_count',\n                            'dot_dot_dot',\n                            'exclamation_mark',\n                            'question_mark',\n                            'at_symbol',\n                            'link',\n                            'money',\n                            'paragraph_symbol',\n                            'hashtag'\n                            ]]\n\n# We convert the additional_features dataframe to a sparse matrix\nadditional_features_train_matrix = scipy.sparse.csr_matrix(additional_features_train.values)\n\n# Then we horizontally stack the tf-idf matrix with the additional features\nX_train_matrix = hstack([X_train_vect_matrix, additional_features_train_matrix]) "

In [40]:
%pip install scipy

Note: you may need to restart the kernel to use updated packages.


In [35]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import scipy
from scipy.sparse import hstack

tvec = TfidfVectorizer()
lr = LogisticRegression(random_state = 42)
n_features = np.arange(100000,200001,25000)

add_features_train = X_train[['weekday','hour','word_count','dot_dot_dot','exclamation_mark','question_mark','at_symbol','link','money','paragraph_symbol','hashtag']]
add_features_validation = X_validation[['weekday','hour','word_count','dot_dot_dot','exclamation_mark','question_mark','at_symbol','link','money','paragraph_symbol','hashtag']]
add_features_test = X_test[['weekday','hour','word_count','dot_dot_dot','exclamation_mark','question_mark','at_symbol','link','money','paragraph_symbol','hashtag']]


for n in n_features:

    #Initialise Vectoriser in loop
    tvec = TfidfVectorizer(max_features=n,ngram_range=(1, 3))

    #Vectorize String
    X_train_matrix = tvec.fit_transform(X_train['tweet_tokenized_string'])
    X_validation_matrix = tvec.transform(X_validation['tweet_tokenized_string'])
    X_test_matrix = tvec.transform(X_test['tweet_tokenized_string'])

    # We convert the additional_features dataframe to a sparse matrix
    add_features_train_matrix = scipy.sparse.csr_matrix(add_features_train.values)
    add_features_validation_matrix = scipy.sparse.csr_matrix(add_features_validation.values)
    add_features_test_matrix = scipy.sparse.csr_matrix(add_features_test.values)

    # Then we horizontally stack the tf-idf matrix with the additional features
    X_train_matrix = hstack([X_train_matrix, add_features_train_matrix])
    X_validation_matrix = hstack([X_validation_matrix, add_features_validation_matrix])    
    X_test_matrix = hstack([X_test_matrix, add_features_test_matrix])

    #fit(train) model
    lr.fit(X_train_matrix, y_train)
    y_pred = lr.predict(X_test_matrix)

    accuracy = accuracy_score(y_test, y_pred)
    print ("accuracy score: {0:.2f}%".format(accuracy*100))
    print ("-"*80)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score: 73.07%
--------------------------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score: 72.71%
--------------------------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score: 73.07%
--------------------------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


accuracy score: 73.39%
--------------------------------------------------------------------------------
accuracy score: 73.20%
--------------------------------------------------------------------------------


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [48]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
import scipy
from scipy.sparse import hstack


tvec = TfidfVectorizer(max_features=100000,ngram_range=(1, 3))

add_features_train = X_train[['weekday','hour','word_count','dot_dot_dot','exclamation_mark','question_mark','at_symbol','link','money','paragraph_symbol','hashtag']]
add_features_validation = X_validation[['weekday','hour','word_count','dot_dot_dot','exclamation_mark','question_mark','at_symbol','link','money','paragraph_symbol','hashtag']]
add_features_test = X_test[['weekday','hour','word_count','dot_dot_dot','exclamation_mark','question_mark','at_symbol','link','money','paragraph_symbol','hashtag']]


#Vectorize String
X_train_matrix = tvec.fit_transform(X_train['tweet_tokenized_string'])
X_validation_matrix = tvec.transform(X_validation['tweet_tokenized_string'])
X_test_matrix = tvec.transform(X_test['tweet_tokenized_string'])

# We convert the additional_features dataframe to a sparse matrix
#add_features_train_matrix = scipy.sparse.csr_matrix(add_features_train.values)
#add_features_validation_matrix = scipy.sparse.csr_matrix(add_features_validation.values)
#add_features_test_matrix = scipy.sparse.csr_matrix(add_features_test.values)

# Then we horizontally stack the tf-idf matrix with the additional features
#X_train_matrix = hstack([X_train_matrix, add_features_train_matrix])
#X_validation_matrix = hstack([X_validation_matrix, add_features_validation_matrix])    
#X_test_matrix = hstack([X_test_matrix, add_features_test_matrix])

In [2]:
%pip install --upgrade keras




In [44]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence

In [50]:
from keras.models import Sequential
from keras.layers import Dense, Dropout
from keras.layers import Flatten
from keras.layers import Embedding
from keras.preprocessing import sequence

#Create Generator (Split Data for training model into different batches so PC can handle it)
def batch_generator(X_data, y_data, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch // batch_size
    counter = 0
    index = np.arange(np.shape(y_data)[0])
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X_data[index_batch, :].toarray()
        X_batch = np.expand_dims(X_batch, axis=1)  # Add an extra dimension
        y_batch = y_data[y_data.index[index_batch]]
        counter += 1
        yield X_batch, y_batch
        if counter > number_of_batches:
            counter = 0

# Define simple Hyperparameters for model (Input Layer = 100,000, Hidden Layer = 64, Relu activation Function)
# 1 Output Layer with Sigmoid activation Function
# Optimization Algorithm = ADAM
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=100000))
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit_generator(
    generator=batch_generator(X_train_matrix, y_train, 32),
    epochs=5,
    validation_data=(X_validation_matrix, y_validation),
    steps_per_epoch=X_train_matrix.shape[0] // 32
)

Epoch 1/5


  model.fit_generator(


InvalidArgumentError: Graph execution error:

TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.
Traceback (most recent call last):

  File "c:\Users\ollin\Documents\repos\marketing_analytics_project-1\.conda\lib\site-packages\tensorflow\python\ops\script_ops.py", line 267, in __call__
    ret = func(*args)

  File "c:\Users\ollin\Documents\repos\marketing_analytics_project-1\.conda\lib\site-packages\tensorflow\python\autograph\impl\api.py", line 642, in wrapper
    return func(*args, **kwargs)

  File "c:\Users\ollin\Documents\repos\marketing_analytics_project-1\.conda\lib\site-packages\tensorflow\python\data\ops\from_generator_op.py", line 235, in generator_py_func
    raise TypeError(

TypeError: `generator` yielded an element of shape (32,) where an element of shape (None, None) was expected.


	 [[{{node PyFunc}}]]
	 [[IteratorGetNext]] [Op:__inference_train_function_3583]

In [53]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

#Pass Sparse into dense matrix
X_train_matrix = X_train_matrix.toarray()
X_validation_matrix = X_validation_matrix.toarray()
X_test_matrix = X_test_matrix.toarray()

model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train_matrix.shape[1]))
model.add(Dense(1, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

model.fit(X_train_matrix, y_train, epochs=5, batch_size=32, validation_data=(X_validation_matrix, y_validation))

loss, accuracy = model.evaluate(X_test_matrix, y_test)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

MemoryError: Unable to allocate 1.04 TiB for an array with shape (1429456, 100000) and data type float64

In [73]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define a generator to yield batches of sparse input data
def sparse_batch_generator(X_data, y_data, batch_size):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = samples_per_epoch // batch_size
    counter = 0
    index = np.arange(samples_per_epoch)
    while True:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X_data[index_batch].toarray()  # Convert SparseTensor to dense matrix
        X_batch = np.reshape(X_batch, (X_batch.shape[0], -1))
        y_batch = np.array(y_data.iloc[index_batch]).reshape(-1, 1)
        counter += 1
        yield (X_batch, y_batch)
        if counter >= number_of_batches:
            counter = 0



# Define the architecture of your neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train_matrix.shape[1]))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the sparse batch generator
model.fit(sparse_batch_generator(X_train_matrix, y_train, batch_size=32),
          steps_per_epoch=X_train_matrix.shape[0] // 32,
          epochs=5,
          validation_data=sparse_batch_generator(X_validation_matrix, y_validation, batch_size=32),
          validation_steps=X_validation_matrix.shape[0] // 32)

# Evaluate the model on the test data
loss, accuracy = model.evaluate(sparse_batch_generator(X_test_matrix, y_test, batch_size=32),
                                          steps=X_test_matrix.shape[0] // 32)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.5148494243621826
Test Accuracy: 0.7670741081237793


In [71]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense

# Define a generator to yield batches of sparse input data
def sparse_batch_generator(X_data, y_data, batch_size, num_batches):
    samples_per_epoch = X_data.shape[0]
    number_of_batches = min(samples_per_epoch // batch_size, num_batches)
    counter = 0
    index = np.arange(samples_per_epoch)
    while counter < number_of_batches:
        index_batch = index[batch_size * counter:batch_size * (counter + 1)]
        X_batch = X_data[index_batch].toarray()  # Convert SparseTensor to dense matrix
        X_batch = np.reshape(X_batch, (X_batch.shape[0], -1))
        y_batch = np.array(y_data.iloc[index_batch]).reshape(-1, 1)
        counter += 1
        yield (X_batch, y_batch)
        if counter >= number_of_batches:
            counter = 0


# Define the architecture of your neural network model
model = Sequential()
model.add(Dense(64, activation='relu', input_dim=X_train_matrix.shape[1]))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

# Train the model using the sparse batch generator
model.fit(sparse_batch_generator(X_train_matrix, y_train, batch_size=32, num_batches=1),
          steps_per_epoch=1,
          epochs=3,
          validation_data=sparse_batch_generator(X_validation_matrix, y_validation, batch_size=32, num_batches=1),
          validation_steps=X_validation_matrix.shape[0] // 32)

# Evaluate the model on the test data
loss, accuracy = model.evaluate(sparse_batch_generator(X_test_matrix, y_test, batch_size=32, num_batches=1),
                                          steps=X_test_matrix.shape[0] // 32)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


TypeError: sparse_batch_generator() missing 1 required positional argument: 'num_batches'

In [72]:
# Evaluate the model on the test data
loss, accuracy = model.evaluate(sparse_batch_generator(X_test_matrix, y_test, batch_size=32, num_batches=1),
                                          steps=X_test_matrix.shape[0] // 32)
print('Test Loss:', loss)
print('Test Accuracy:', accuracy)

Test Loss: 0.6915808320045471
Test Accuracy: 0.53125
