### **MOUNTING DRIVE TO ACCESS AND SAVE DATA**

In [1]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


### **IMPORTING RELEVANT MODULES**

In [2]:
import numpy as np
import pandas as pd
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.models import load_model
nltk.download('punkt')
nltk.download('stopwords')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

### **TOKENIZING AND PREPROCESSING TWEETS**

In [4]:
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    if type(text) != float:
        tokens = word_tokenize(text.lower())
        filtered_tokens = [word for word in tokens if word.isalnum() and word not in stop_words]
        return ' '.join(filtered_tokens)
    else:
        return str(text)

df = pd.read_csv('/content/drive/MyDrive/Colab Notebooks/extern.csv')

df['preprocessed_tweet'] = df['tweet'].apply(preprocess_text)
X = df['preprocessed_tweet'].values
y = df['value'].values

### **DIVIDING INTO TRAIN AND VALIDATION SET**

In [5]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

vocab_size = 50000  # Maximum number of words in the vocabulary
embedding_dim = 100
max_length = 50  # Maximum tweet length


tokenizer = Tokenizer(num_words=vocab_size, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

X_train_sequences = tokenizer.texts_to_sequences(X_train)
X_train_padded = pad_sequences(X_train_sequences, maxlen=max_length, padding='post', truncating='post')

X_test_sequences = tokenizer.texts_to_sequences(X_test)
X_test_padded = pad_sequences(X_test_sequences, maxlen=max_length, padding='post', truncating='post')

### **CREATION AND TESTING OF THE MODEL**

In [11]:
model1 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(64),
    Dense(1, activation='tanh')  # Output activation function can be changed based on your needs
])



model1.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model1.fit(X_train_padded, y_train, epochs=5,validation_data=(X_test_padded, y_test))

# Evaluate model1
loss, mae = model1.evaluate(X_test_padded, y_test)
print(f"Test Loss: {loss}, accuracy : {mae}")



Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.36389994621276855, Mean Absolute Error: 0.6230000257492065


In [17]:
model2 = Sequential([
    Embedding(vocab_size, embedding_dim, input_length=max_length),
    LSTM(128, return_sequences=True),
    LSTM(64),
    Dense(1, activation='tanh')  # Output activation function can be changed based on your needs
])

model2.compile(loss='mean_squared_error', optimizer='adam', metrics=['accuracy'])
model2.fit(X_train_padded, y_train, epochs=5,validation_data=(X_test_padded, y_test))

# Evaluate model2
loss, mae = model2.evaluate(X_test_padded, y_test)
print(f"Test Loss: {loss}, accuracy: {mae}")

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
Test Loss: 0.37243160605430603, Mean Absolute Error: 0.6244999766349792


### **SAVING THE MODEL**

In [18]:

model1.save('/content/drive/MyDrive/Colab Notebooks/model1_5epoch.h5')
print("Model1 saved successfully!")
model2.save('/content/drive/MyDrive/Colab Notebooks/model2_5epoch.h5')
print("Model2 saved successfully!")

Model1 saved successfully!
Model2 saved successfully!


### **MODEL LOADING AND TESTING**

In [27]:
loaded_model1 = load_model('/content/drive/MyDrive/Colab Notebooks/model1.h5')
loaded_model2 = load_model('/content/drive/MyDrive/Colab Notebooks/model2.h5')

def predict_sentiment(tweet,model):
    preprocessed_tweet = preprocess_text(tweet)
    tweet_sequence = tokenizer.texts_to_sequences([preprocessed_tweet])
    padded_sequence = pad_sequences(tweet_sequence, maxlen=max_length, padding='post', truncating='post')
    sentiment_prediction = model.predict(padded_sequence)[0][0]
    return sentiment_prediction

for i in range(0,3):
    user_tweet = input("Enter a tweet: ")
    sentiment_prediction = predict_sentiment(user_tweet,loaded_model1)
    print("Sentiment prediction for model 1 :", sentiment_prediction)
    sentiment_prediction = predict_sentiment(user_tweet,loaded_model2)
    print("Sentiment prediction for model 2 :", sentiment_prediction)


Enter a tweet: Earlier failed lawyer, now failed troll too!
Sentiment prediction for model 1 : -0.98257494
Sentiment prediction for model 2 : -0.9532739
Enter a tweet: One has to start somewhere. Nobody is giving or expecting charity. If you want, make concessions. One day maybe we can stand on our own. Do you know how India became leader in steel production??? In beginning making large scale steel was something similar. Nuclear as well.
Sentiment prediction for model 1 : 0.9445943
Sentiment prediction for model 2 : 0.8808789
Enter a tweet: Is chutiya ko kisne khula chod rakha hai
Sentiment prediction for model 1 : -0.09799191
Sentiment prediction for model 2 : -0.0049019014
