In [29]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [30]:
# Load the preprocessed tweet data; later cells read its 'tweet' and 'sentiment' columns.
csv_path = './preprocessed_twitter_data.csv'
df = pd.read_csv(csv_path)

In [31]:
# Take the first 500,000 rows whose sentiment label is 0 (the negative class).
is_negative = df['sentiment'] == 0
negative_tweets = df.loc[is_negative].head(500000)

# Print the subset's shape, then pandas' truncated text preview of it.
print(negative_tweets.shape, negative_tweets, sep='\n')

(500000, 3)
        Unnamed: 0                                              tweet   
0                0  upset updat facebook text might cri result sch...  \
1                1       dive mani time ball manag save rest go bound   
2                2                    whole bodi feel itchi like fire   
3                3                                      behav mad see   
4                4                                         whole crew   
...            ...                                                ...   
499995      499995                      idea use web p tweetdeck work   
499996      499996          ww work gain back probabl put chip anyway   
499997      499997                                   sorri mommi miss   
499998      499998     terribl headach last night weight decreas aaah   
499999      499999                                         cant sleep   

        sentiment  
0               0  
1               0  
2               0  
3               0  
4          

In [32]:
# Take the first 500,000 rows whose sentiment label is 1 (the positive class).
is_positive = df['sentiment'] == 1
positive_tweets = df.loc[is_positive].head(500000)

# Print the subset's shape, then pandas' truncated text preview of it.
print(positive_tweets.shape, positive_tweets, sep='\n')

(500000, 3)
         Unnamed: 0                                              tweet   
799999       799999                                  love u guy r best  \
800000       800000      im meet one besti tonight cant wait girl talk   
800001       800001  thank twitter add sunisa got meet hin show dc ...   
800002       800002  sick realli cheap hurt much eat real food plu ...   
800003       800003                                     effect everyon   
...             ...                                                ...   
1299994     1299994                      watch horton hear tasha apart   
1299995     1299995      r u friendfe got friend request u sure realli   
1299996     1299996  thank comment grace record toast ice americano...   
1299997     1299997  look launch uniqu art web store uniqu art shir...   
1299998     1299998  new blog potenti cake diaster ahhhh www mandic...   

         sentiment  
799999           1  
800000           1  
800001           1  
800002         

In [33]:
# Take the first 10,000 rows whose sentiment label is 2 (the neutral class).
is_neutral = df['sentiment'] == 2
neutral_tweets = df.loc[is_neutral].head(10000)

# Print the subset's shape, then pandas' truncated text preview of it.
print(neutral_tweets.shape, neutral_tweets, sep='\n')

(10000, 3)
         Unnamed: 0                                              tweet   
3123980     3123980                              top tablet damn right  \
3123981     3123981              cnbctv appl margin better expect aapl   
3123987     3123987  rt bought store pretti good logo match wait in...   
3124000     3124002  latest appl product lead effici iphon ipad plu...   
3124005     3124007                              rt thank think upgrad   
...             ...                                                ...   
3141591     3141673   better anyon els busi stock invest tradeidea gnu   
3141592     3141674  rt sound eerili like famou julian robertson un...   
3141593     3141675  arent extrem optimist yet market go spx spi iw...   
3141594     3141676  case covid florida past week still set open pa...   
3141595     3141677  investor take chanc biogen stock biib rhhbi nv...   

         sentiment  
3123980          2  
3123981          2  
3123987          2  
3124000         

In [34]:
from sklearn.model_selection import train_test_split

# Every class is split independently with the same settings:
# 70% train / 30% test, with a fixed seed so the split is repeatable.
split_options = dict(test_size=0.3, random_state=42)

# Negative class (label 0)
train_negative_tweets, test_negative_tweets = train_test_split(
    negative_tweets, **split_options
)

# Positive class (label 1)
train_positive_tweets, test_positive_tweets = train_test_split(
    positive_tweets, **split_options
)

# Neutral class (label 2)
train_neutral_tweets, test_neutral_tweets = train_test_split(
    neutral_tweets, **split_options
)

In [35]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

# Combine the per-class train splits into one frame and shuffle it (fixed seed)
# so the three sentiment classes are interleaved rather than stacked in blocks.
train_tweets = shuffle(
    pd.concat(
        [train_negative_tweets, train_positive_tweets, train_neutral_tweets],
        ignore_index=True,
    ),
    random_state=42,
)

# Same for the per-class test splits.
test_tweets = shuffle(
    pd.concat(
        [test_negative_tweets, test_positive_tweets, test_neutral_tweets],
        ignore_index=True,
    ),
    random_state=42,
)

# Drop rows whose 'tweet' value is NaN. This happens before texts and labels are
# extracted, so the two stay aligned row-for-row.
train_tweets = train_tweets.dropna(subset=['tweet'])
test_tweets = test_tweets.dropna(subset=['tweet'])

# Plain Python lists of tweet strings for the tokenizer.
tweet_train_reviews = train_tweets['tweet'].tolist()
tweet_test_reviews = test_tweets['tweet'].tolist()

# Integer sentiment labels (0 = negative, 1 = positive, 2 = neutral) for each split.
train_labels = train_tweets['sentiment']
test_labels = test_tweets['sentiment']

# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the vocabulary on the training texts only, so no test vocabulary leaks in.
tokenizer.fit_on_texts(tweet_train_reviews)

# Convert each text into a list of integer word indices.
train_sequences = tokenizer.texts_to_sequences(tweet_train_reviews)
test_sequences = tokenizer.texts_to_sequences(tweet_test_reviews)

# Pad every sequence to the longest one found in either split, so nothing is truncated.
# Both lists are walked lazily instead of being concatenated, which avoids building
# a temporary list holding every sequence just to compute a maximum.
maxlen = max(
    len(seq)
    for sequences in (train_sequences, test_sequences)
    for seq in sequences
)
print("Maxlen: " , maxlen)
train_sequences = pad_sequences(train_sequences, maxlen=maxlen)
test_sequences = pad_sequences(test_sequences, maxlen=maxlen)

Maxlen:  56


In [39]:
from tensorflow.keras.models import Sequential
# Dropout is used in the model below; without importing it the cell fails with
# NameError on a fresh kernel.
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout

# Relies on `tokenizer`, `train_sequences`, `test_sequences`, `train_labels` and
# `test_labels` produced by the tokenization/padding cell above.
vocab_size = len(tokenizer.word_index) + 1  # add 1 for the padding token
embedding_dim = 100  # adjust as needed


# Define the model: embedding -> two stacked LSTMs -> dense head with dropout.
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim))
# return_sequences=True so the second LSTM receives the full sequence of hidden states.
model.add(LSTM(units=128, dropout=0.2, recurrent_dropout=0.2, return_sequences=True))
model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(units=64, activation='relu'))
model.add(Dropout(0.5))
# Three output units, one per sentiment label (0, 1, 2).
model.add(Dense(units=3, activation='softmax'))

# Compile the model. The sparse loss takes the integer labels directly (no one-hot encoding).
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
# Train the model, holding out 20% of the training data for per-epoch validation.
model.fit(train_sequences, train_labels, epochs=5, batch_size=128, validation_split=0.2)
# Last expression of the cell: displays [test loss, test accuracy].
model.evaluate(test_sequences, test_labels)



Epoch 1/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 766ms/step - accuracy: 0.7021 - loss: 0.6022 - val_accuracy: 0.7714 - val_loss: 0.4803
Epoch 2/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m388s[0m 804ms/step - accuracy: 0.7952 - loss: 0.4469 - val_accuracy: 0.7729 - val_loss: 0.4790
Epoch 3/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m337s[0m 699ms/step - accuracy: 0.8154 - loss: 0.4094 - val_accuracy: 0.7656 - val_loss: 0.4946
Epoch 4/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m372s[0m 772ms/step - accuracy: 0.8289 - loss: 0.3802 - val_accuracy: 0.7665 - val_loss: 0.5073
Epoch 5/5
[1m482/482[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m373s[0m 773ms/step - accuracy: 0.8436 - loss: 0.3476 - val_accuracy: 0.7637 - val_loss: 0.5349
[1m9424/9424[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m154s[0m 16ms/step - accuracy: 0.7630 - loss: 0.5389


[0.5384401679039001, 0.7632735967636108]

In [42]:
# Persist the trained model to 'modelfinal.h5' (path relative to the working directory).
# NOTE(review): the trailing "model2.h5" looks like an alternate/earlier output name — confirm
# whether it is still needed. The fitted `tokenizer` is not saved in the visible cells; verify
# it is persisted elsewhere if this model is to be reused for inference.
model.save('modelfinal.h5') #model2.h5

