In [1]:
from pyprojroot import here
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords #To remove stopwords
import string #To remove punctuations
from nltk.tokenize import RegexpTokenizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.models import Model
from sklearn.ensemble import RandomForestClassifier
# from mlxtend.plotting import plot_confusion_matrix
from sklearn.model_selection import train_test_split
from keras.optimizers import RMSprop
import tensorflow as tf

In [2]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices("GPU")))

Num GPUs Available:  0


In [3]:
if tf.test.is_gpu_available():
    print("GPU is available")
else:
    print("GPU is NOT available")

Instructions for updating:
Use `tf.config.list_physical_devices('GPU')` instead.
GPU is NOT available


In [26]:
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\farza\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [27]:
def cleaning_stopwords(text, stopwords:set):
    return " ".join([word for word in str(text).split() if word not in stopwords])

def cleaning_email(data):
    return re.sub('@[^\s]+', ' ', data)

def cleaning_URLs(data):
    return re.sub('((www\.[^\s]+)|(https?://[^\s]+))',' ',data)

def cleaning_punctuations(text, english_punctuations):
    translator = str.maketrans('', '', english_punctuations)
    return text.translate(translator)

def cleaning_numbers(data):
    return re.sub('[0-9]+', '', data)

target: the polarity of the tweet (0 = negative, 2 = neutral, 4 = positive)

ids: The id of the tweet ( 2087)

date: the date of the tweet (Sat May 16 23:58:44 UTC 2009)

flag: The query (lyx). If there is no query, then this value is NO_QUERY.

user: the user that tweeted (robotickilldozr)

text: the text of the tweet (Lyx is cool)

In [28]:
column_names = ["target", "id", "date", "flag", "user", "text"]
df = pd.read_csv(here("data/Tweet/data.csv"), encoding='ISO-8859-1', names=column_names)
print(df.shape)
print(df.columns)

(1600000, 6)
Index(['target', 'id', 'date', 'flag', 'user', 'text'], dtype='object')


In [29]:
df["target"].unique()

array([0, 4], dtype=int64)

In [30]:
# selecting target and features
data = df[["target","text"]]

In [31]:
del df

In [32]:
data["target"].value_counts()

target
0    800000
4    800000
Name: count, dtype: int64

In [33]:
data['target'][data['target']==4]=1

You are setting values through chained assignment. Currently this works in certain cases, but when using Copy-on-Write (which will become the default behaviour in pandas 3.0) this will never work to update the original DataFrame or Series, because the intermediate object on which we are setting values will behave as a copy.
A typical example is when you are setting values in a column of a DataFrame, like:

df["col"][row_indexer] = value

Use `df.loc[row_indexer, "col"] = values` instead, to perform the assignment in a single step and ensure this keeps updating the original `df`.

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

  data['target'][data['target']==4]=1


In [34]:
data["target"].unique()

array([0, 1], dtype=int64)

Making Text Lowercase

In [35]:
data['text']=data['text'].str.lower()

Removing stop words

In [36]:
stopwords_list = stopwords.words('english')
stopwords = set(stopwords_list)
data['text'] = data['text'].apply(lambda text: cleaning_stopwords(text, stopwords=stopwords))

Removing Emails

In [37]:
data['text']= data['text'].apply(lambda x: cleaning_email(x))
data['text'].head()

0      http://twitpic.com/2y1zl - awww, that's bumm...
1    upset can't update facebook texting it... migh...
2      dived many times ball. managed save 50% rest...
3                     whole body feels itchy like fire
4      no, behaving all. i'm mad. here? can't see t...
Name: text, dtype: object

Removing URLs

In [38]:
data['text'] = data['text'].apply(lambda x: cleaning_URLs(x))
data['text'].head()

0        - awww, that's bummer. shoulda got david c...
1    upset can't update facebook texting it... migh...
2      dived many times ball. managed save 50% rest...
3                     whole body feels itchy like fire
4      no, behaving all. i'm mad. here? can't see t...
Name: text, dtype: object

Removing punctuations

In [39]:
english_punctuations = string.punctuation
print(english_punctuations)
data['text'] = data['text'].apply(lambda text: cleaning_punctuations(text, english_punctuations=english_punctuations))

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


Removing Numbers

In [40]:
data['text'] = data['text'].apply(lambda x: cleaning_numbers(x))
data['text'].head()

0         awww thats bummer shoulda got david carr ...
1    upset cant update facebook texting it might cr...
2      dived many times ball managed save  rest go ...
3                     whole body feels itchy like fire
4           no behaving all im mad here cant see there
Name: text, dtype: object

Applying tokenization

In [41]:
tokenizer = RegexpTokenizer(r'\w+')
data['text'] = data['text'].apply(tokenizer.tokenize)
data.head(5)

Unnamed: 0,target,text
0,0,"[awww, thats, bummer, shoulda, got, david, car..."
1,0,"[upset, cant, update, facebook, texting, it, m..."
2,0,"[dived, many, times, ball, managed, save, rest..."
3,0,"[whole, body, feels, itchy, like, fire]"
4,0,"[no, behaving, all, im, mad, here, cant, see, ..."


Applying lemmatizer

In [42]:
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
nltk.download('omw-1.4')
def lemmatizer_on_text(words):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(word) for word in words]
data['text'] = data['text'].apply(lambda x: lemmatizer_on_text(x))

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\farza\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\farza\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [43]:
data['text'].head()

0    [awww, thats, bummer, shoulda, got, david, car...
1    [upset, cant, update, facebook, texting, it, m...
2    [dived, many, time, ball, managed, save, rest,...
3               [whole, body, feel, itchy, like, fire]
4    [no, behaving, all, im, mad, here, cant, see, ...
Name: text, dtype: object

In [44]:
X=data.text
y=data.target

max_len (Maximum Sequence Length):

- max_len determines the maximum length of sequences after padding or truncating.
- You can start by examining the distribution of text lengths in your dataset. Choose a value that covers the majority of your texts without excessively truncating them.
- Setting max_len too small may result in information loss, while setting it too large may increase computational overhead.
- You may want to experiment with different values and evaluate their impact on model performance.
- A common approach is to set max_len to the length of the longest text in your dataset or to a value that covers a significant portion of your texts while being computationally feasible.


num_words (Maximum Number of Words):

- num_words determines the maximum number of unique words to be tokenized, based on word frequency.
- This parameter can help control the size of the vocabulary and reduce computational complexity.
- Start by considering the vocabulary size of your dataset. If your dataset contains a large number of unique words, you may need to set num_words higher to capture more of the vocabulary.
- Conversely, if your dataset is relatively small or if you want to limit the vocabulary size, you can set num_words lower.
- Keep in mind that setting num_words too low may result in the loss of less frequent words, which could be important for model performance.

In [45]:
all_words = [word for tweet in data['text'] for word in tweet]

# Extract unique words using set() and get the number of unique words
num_unique_words = len(set(all_words))
print("Number of unique words:", num_unique_words)
del all_words, num_unique_words

Number of unique words: 409603


In [46]:
max_len = 500 # determines the maximum length of sequences after padding or truncating. Setting max_len too small may result in information loss, while setting it too large may increase computational overhead.
tok = Tokenizer(num_words=2000)
tok.fit_on_texts(X)
sequences = tok.texts_to_sequences(X)
sequences_matrix = sequence.pad_sequences(sequences,maxlen=max_len)

In [47]:
sequences_matrix.shape

(1600000, 500)

In [48]:
X_train, X_test, Y_train, Y_test = train_test_split(sequences_matrix, y, test_size=0.3, random_state=42,stratify=y)
print("Shape of X_train: ", X_train.shape)
print("Shape of X_test: ", X_test.shape)

Shape of X_train:  (1120000, 500)
Shape of X_test:  (480000, 500)


In [57]:
def tensorflow_based_model(): 
    inputs = Input(name='inputs',shape=[max_len])
    layer = Embedding(2000,50)(inputs) 
    layer = LSTM(64)(layer) 
    layer = Dense(256,name='FC1')(layer) 
    layer = Activation('relu')(layer) 
    layer = Dropout(0.5)(layer) 
    layer = Dense(1,name='out_layer')(layer) 
    layer = Activation('sigmoid')(layer) 
    model = Model(inputs=inputs,outputs=layer) 
    return model 

In [58]:
model = tensorflow_based_model()
model.compile(loss='binary_crossentropy',optimizer=RMSprop(), metrics=['accuracy'])  
model.summary()

Model: "model_2"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 inputs (InputLayer)         [(None, 500)]             0         
                                                                 
 embedding_2 (Embedding)     (None, 500, 50)           100000    
                                                                 
 lstm_2 (LSTM)               (None, 64)                29440     
                                                                 
 FC1 (Dense)                 (None, 256)               16640     
                                                                 
 activation_4 (Activation)   (None, 256)               0         
                                                                 
 dropout_2 (Dropout)         (None, 256)               0         
                                                                 
 out_layer (Dense)           (None, 1)                 257 

In [59]:
history=model.fit(X_train,Y_train,batch_size=80,epochs=6, validation_split=0.1)# here we are starting the training of model by feeding the training data
print('Training finished!!')

Epoch 1/6
   95/12600 [..............................] - ETA: 1:05:52 - loss: 0.6344 - accuracy: 0.6305

KeyboardInterrupt: 