In [6]:
from sklearn.model_selection import train_test_split
import pandas as pd

In [7]:
# Load the preprocessed tweet corpus; the path is relative to the notebook's working directory.
csv_path = './preprocessed_twitter_data.csv'
df = pd.read_csv(csv_path)

In [8]:
# Keep the first 200,000 rows labelled 0 (taken in file order, not randomly sampled) and show them.
is_negative = df['sentiment'].eq(0)
negative_tweets = df.loc[is_negative].head(200000)
print(negative_tweets.shape)
print(negative_tweets)

(200000, 3)
        Unnamed: 0                                              tweet  \
0                0  upset updat facebook text might cri result sch...   
1                1       dive mani time ball manag save rest go bound   
2                2                    whole bodi feel itchi like fire   
3                3                                      behav mad see   
4                4                                         whole crew   
...            ...                                                ...   
199995      199995                                               work   
199996      199996      common crash find delet process ok eat memori   
199997      199997                    babi boy wear big boy underwear   
199998      199998                           fml forgot phone charger   
199999      199999  believ wait anoth month phone contract end bor...   

        sentiment  
0               0  
1               0  
2               0  
3               0  
4          

In [9]:
# Keep the first 200,000 rows labelled 1 (taken in file order, not randomly sampled) and show them.
is_positive = df['sentiment'].eq(1)
positive_tweets = df.loc[is_positive].head(200000)
print(positive_tweets.shape)
print(positive_tweets)

(200000, 3)
        Unnamed: 0                                              tweet  \
799999      799999                                  love u guy r best   
800000      800000      im meet one besti tonight cant wait girl talk   
800001      800001  thank twitter add sunisa got meet hin show dc ...   
800002      800002  sick realli cheap hurt much eat real food plu ...   
800003      800003                                     effect everyon   
...            ...                                                ...   
999994      999994                                         thank need   
999995      999995                                               mayb   
999996      999996                 hell window price rang unless free   
999997      999997            neah wish reminisc read post last tweet   
999998      999998  way rewatch sun goddess last night sasha amaz ...   

        sentiment  
799999          1  
800000          1  
800001          1  
800002          1  
800003     

In [10]:
# Keep the first 5,000 rows labelled 2 (taken in file order, not randomly sampled) and show them.
# This class is capped far lower than the 200,000 rows kept for labels 0 and 1.
is_neutral = df['sentiment'].eq(2)
neutral_tweets = df.loc[is_neutral].head(5000)
print(neutral_tweets.shape)
print(neutral_tweets)

(5000, 3)
         Unnamed: 0                                              tweet  \
3123980     3123980                              top tablet damn right   
3123981     3123981              cnbctv appl margin better expect aapl   
3123987     3123987  rt bought store pretti good logo match wait in...   
3124000     3124002  latest appl product lead effici iphon ipad plu...   
3124005     3124007                              rt thank think upgrad   
...             ...                                                ...   
3134336     3134418               rural land reform upset china villag   
3134337     3134419  scottish salmon produc say brexit cost million...   
3134338     3134420  senat approv defens polici bill includ trump s...   
3134339     3134421  sgx singapor exchang metropolitan area employ ...   
3134340     3134422            south africa plan duti poultri u brazil   

         sentiment  
3123980          2  
3123981          2  
3123987          2  
3124000          

In [11]:
from sklearn.model_selection import train_test_split

# Each sentiment class is split on its own with the same 70/30 ratio and a
# fixed seed, so every class keeps the same train/test proportion and the
# partition is repeatable.
split_options = {'test_size': 0.3, 'random_state': 42}

train_negative_tweets, test_negative_tweets = train_test_split(negative_tweets, **split_options)
train_positive_tweets, test_positive_tweets = train_test_split(positive_tweets, **split_options)
train_neutral_tweets, test_neutral_tweets = train_test_split(neutral_tweets, **split_options)

In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.utils import shuffle

# Combine the per-class train sets and shuffle so the classes are interleaved
train_tweets = shuffle(pd.concat([train_negative_tweets, train_positive_tweets, train_neutral_tweets], ignore_index=True), random_state=42)

# Combine the per-class test sets and shuffle the same way
test_tweets = shuffle(pd.concat([test_negative_tweets, test_positive_tweets, test_neutral_tweets], ignore_index=True), random_state=42)

# Drop rows whose 'tweet' column is NaN; texts and labels below are both taken
# from the filtered frames, so they stay aligned row for row
train_tweets = train_tweets.dropna(subset=['tweet'])
test_tweets = test_tweets.dropna(subset=['tweet'])

# Plain Python lists of tweet strings for the tokenizer
tweet_train_reviews = train_tweets['tweet'].tolist()
tweet_test_reviews = test_tweets['tweet'].tolist()

# Train labels and test labels of the tweets
train_labels = train_tweets['sentiment']
test_labels = test_tweets['sentiment']

# Initialize the tokenizer
tokenizer = Tokenizer()

# Fit the vocabulary on the training data only
tokenizer.fit_on_texts(tweet_train_reviews)

# Convert text to integer sequences using the training vocabulary
train_sequences = tokenizer.texts_to_sequences(tweet_train_reviews)
test_sequences = tokenizer.texts_to_sequences(tweet_test_reviews)

# Derive the padded length from the training sequences only, so the test set
# does not influence a preprocessing parameter (it also fixes the model's
# input length). Longer test sequences are truncated by pad_sequences, which
# is the same treatment any unseen text receives at prediction time.
maxlen = max(len(seq) for seq in train_sequences)
train_sequences = pad_sequences(train_sequences, maxlen=maxlen)
test_sequences = pad_sequences(test_sequences, maxlen=maxlen)




In [13]:
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

# Seed the Python, NumPy and TensorFlow generators so weight initialisation
# and batch shuffling are repeatable; the data split and shuffles earlier in
# the notebook already use a fixed seed of 42.
# NOTE(review): bit-identical results on GPU may need further determinism
# settings — confirm if exact reproducibility matters.
tf.keras.utils.set_random_seed(42)

# `tokenizer`, `maxlen`, `train_sequences`/`train_labels` and
# `test_sequences`/`test_labels` come from the tokenisation cell.
vocab_size = len(tokenizer.word_index) + 1  # add 1 for the padding token
embedding_dim = 100  # adjust as needed

# Embedding -> single LSTM layer -> 3-way softmax (one unit per sentiment label 0/1/2)
model = Sequential()
model.add(Embedding(input_dim=vocab_size, output_dim=embedding_dim, input_length=maxlen))
model.add(LSTM(units=100))
model.add(Dense(units=3, activation='softmax'))

# Labels are integer class ids, hence the sparse categorical loss
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

# NOTE(review): the sampling cells keep 200,000 rows each for labels 0 and 1
# but only 5,000 for label 2, so label 2 is heavily under-represented; passing
# `class_weight` to fit() may be worth trying — confirm against per-class metrics.
# Train the model, keeping the History object so the per-epoch metrics can be inspected later
history = model.fit(train_sequences, train_labels, epochs=5, batch_size=64, validation_split=0.2)

# Last expression of the cell: displays [loss, accuracy] on the held-out test set
model.evaluate(test_sequences, test_labels)




Epoch 1/5


Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


[0.6573936939239502, 0.747217059135437]

In [15]:
import json

# Persist the trained network; the `.h5` suffix selects Keras' HDF5 format and
# the file name is kept so the load cell continues to work unchanged.
model.save('sentiment_model.h5')

# The network alone cannot score new text: the prediction cell also needs the
# fitted tokenizer (word -> index mapping) and the padding length. Store both
# next to the model so a fresh kernel can rebuild them, e.g. with
# tensorflow.keras.preprocessing.text.tokenizer_from_json.
with open('sentiment_preprocessing.json', 'w', encoding='utf-8') as preprocessing_file:
    json.dump({'maxlen': int(maxlen), 'tokenizer_json': tokenizer.to_json()}, preprocessing_file)

  saving_api.save_model(


In [16]:
from tensorflow.keras.models import load_model

# Read the HDF5 file written by the save cell back into a separate model object
saved_model_path = 'sentiment_model.h5'
loaded_model = load_model(saved_model_path)


In [20]:
# Single hand-written example used to try out the reloaded model.
# NOTE(review): the training tweets printed earlier look stemmed and
# stop-word-filtered ("updat", "bodi", "itchi"), while this sentence is raw
# text — presumably it should go through the same preprocessing before being
# tokenized; confirm against the script that produced the CSV.
new_text_data = "Today is an average day. Nothing particularly exciting or noteworthy happened, but at least it wasn't a bad day either. #neutral #day"

# Encode with the tokenizer fitted on the training tweets, then pad to `maxlen`
new_text_sequences = pad_sequences(tokenizer.texts_to_sequences([new_text_data]), maxlen=maxlen)

# The softmax layer yields one row of three class scores for the single input
predictions = loaded_model.predict(new_text_sequences)

# Index of the highest score for that row; going by the sampling cells'
# variable names, 0 = negative, 1 = positive, 2 = neutral
predicted_class = predictions[0].argmax(axis=-1)

print(predicted_class)

0
