In [None]:
import pandas as pd

LOAD DATASET

In [None]:
# Read the raw Indigo Airlines review export into a DataFrame.
data = pd.read_csv("/content/Indigo_Airlines_Travel_Reviews.csv")

In [None]:
# Preview the first five rows of the freshly loaded frame.
data.head()

Unnamed: 0,Airline,Review Text
0,Indigo Airlines,The seat comfort was manageable for short-haul...
1,Indigo Airlines,Seats were cramped and lacked sufficient legro...
2,Indigo Airlines,Cabin crew seemed disinterested and were not v...
3,Indigo Airlines,Food and beverage options were extremely limit...
4,Indigo Airlines,The crew was friendly and assisted passengers ...


INSTALL REQUIRED LIBRARIES


In [None]:
# !pip install nltk
# !pip install gensim


DOWNLOAD STOPWORDS


In [None]:
import nltk
# Fetch NLTK's stop-word corpus (needed by stopwords.words below).
nltk.download("stopwords")


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.


True

CLEANING THE DATASET

In [None]:
import re
from nltk.corpus import stopwords

# English stop words as a set, for O(1) membership checks in clean_text.
stop_words = {*stopwords.words('english')}

def clean_text(text):
    """Normalise one review for tokenisation.

    The value is passed through ``str()`` (so a missing value becomes the
    literal text ``"nan"``), lowercased, stripped of every character that is
    neither a word character nor whitespace, and finally filtered against the
    module-level ``stop_words`` set.

    Note that punctuation is deleted rather than replaced by a space, so
    hyphenated words are fused ("short-haul" -> "shorthaul").

    Returns the surviving tokens joined by single spaces.
    """
    lowered = str(text).lower()
    without_punctuation = re.sub(r'[^\w\s]', '', lowered)
    return ' '.join(
        token for token in without_punctuation.split() if token not in stop_words
    )

# Store the normalised text next to the original and compare the two.
data['Cleaned_Review'] = data['Review Text'].map(clean_text)
data[['Review Text', 'Cleaned_Review']].head()


Unnamed: 0,Review Text,Cleaned_Review
0,The seat comfort was manageable for short-haul...,seat comfort manageable shorthaul flights
1,Seats were cramped and lacked sufficient legro...,seats cramped lacked sufficient legroom making...
2,Cabin crew seemed disinterested and were not v...,cabin crew seemed disinterested helpful flight
3,Food and beverage options were extremely limit...,food beverage options extremely limited flight
4,The crew was friendly and assisted passengers ...,crew friendly assisted passengers efficiently ...


ASSIGNING LABELS BASED ON KEYWORDS IN EACH REVIEW

In [None]:
def assign_sentiment(review):
    """Label a review by keyword lookup.

    Returns 0 (negative) if any negative keyword occurs in the review,
    otherwise 2 (positive) if any positive keyword occurs, otherwise
    1 (neutral). Matching is case-insensitive substring matching, and negative
    keywords are checked first, so "uncomfortable" is negative even though it
    contains "comfortable".

    Hyphenated keywords are also tried without the hyphen, because the
    notebook's clean_text() deletes punctuation ("well-maintained" becomes
    "wellmaintained") and the keyword would otherwise never match cleaned text.

    Non-string input (None, NaN from pandas) is treated as neutral instead of
    raising on ``.lower()``.
    """
    negative_keywords = ('cramped', 'not clean', 'delayed', 'rude', 'limited',
                         'uncomfortable', 'worst', 'fail', 'bad', 'dirty')
    positive_keywords = ('comfortable', 'well-maintained', 'friendly', 'smooth',
                         'pleasant', 'good', 'timely', 'hassle-free')

    if not isinstance(review, str):
        return 1  # Neutral: nothing to inspect

    text = review.lower()

    def mentions(keywords):
        # Try each keyword as written and with hyphens removed.
        return any(kw in text or kw.replace('-', '') in text for kw in keywords)

    if mentions(negative_keywords):
        return 0  # Negative
    if mentions(positive_keywords):
        return 2  # Positive
    return 1  # Neutral


In [None]:
# Label from the ORIGINAL review text, not the cleaned one: cleaning removes the
# stop word "not" and all hyphens, so keywords such as 'not clean',
# 'well-maintained' and 'hassle-free' cannot match cleaned text.
# astype(str) guards against missing reviews before the keyword lookup.
data['Sentiment'] = data['Review Text'].astype(str).apply(assign_sentiment)
data[['Cleaned_Review', 'Sentiment']].head()

Unnamed: 0,Cleaned_Review,Sentiment
0,seat comfort manageable shorthaul flights,1
1,seats cramped lacked sufficient legroom making...,0
2,cabin crew seemed disinterested helpful flight,1
3,food beverage options extremely limited flight,0
4,crew friendly assisted passengers efficiently ...,2


DROP THE COLUMNS

In [None]:
# Keep only the cleaned text and its label; `columns=` already implies axis=1.
data = data.drop(columns=['Review Text', 'Airline'])

In [None]:
# Show the first ten rows that remain after dropping the raw-text columns.
data.head(10)

Unnamed: 0,Cleaned_Review,Sentiment
0,seat comfort manageable shorthaul flights,1
1,seats cramped lacked sufficient legroom making...,0
2,cabin crew seemed disinterested helpful flight,1
3,food beverage options extremely limited flight,0
4,crew friendly assisted passengers efficiently ...,2
5,seating arrangement okay short domestic trips,1
6,cabin crew seemed disinterested helpful flight,1
7,seats stained tray tables clean upon boarding,1
8,seats clean wellmaintained sufficient space sh...,1
9,luggage took unusually long time arrive baggag...,1


In [None]:
from textblob import TextBlob

def get_sentiment_tb(text):
    """Map TextBlob polarity of ``text`` onto the three sentiment classes.

    Returns 1 (neutral) for missing text. Otherwise the polarity score decides:
    below -0.2 -> 0 (negative), above 0.2 -> 2 (positive), anything in
    between (bounds included) -> 1 (neutral).
    """
    if pd.isnull(text):
        return 1  # Neutral

    polarity = TextBlob(text).sentiment.polarity
    if polarity < -0.2:
        return 0  # Negative
    if polarity > 0.2:
        return 2  # Positive
    return 1  # Neutral

# Replaces the keyword-based labels in 'Sentiment' with TextBlob-derived ones.
data['Sentiment'] = data['Cleaned_Review'].map(get_sentiment_tb)


In [None]:
# !pip install torch torchvision torchaudio
# !pip install nltk scikit-learn


In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from nltk.tokenize import word_tokenize
from collections import Counter
import nltk
# Fetch NLTK's 'punkt' tokenizer data.
# NOTE(review): word_tokenize is imported above but never called in the visible
# cells (tokens come from str.split) — this download may be unnecessary; confirm.
nltk.download('punkt')


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

SPLITTING EACH SENTENCE INTO TOKENS

In [None]:
from collections import Counter

# Split every cleaned review on whitespace to get its token list.
data['tokens'] = data['Cleaned_Review'].map(str.split)

# One flat token stream for the whole corpus, in row order.
all_words = [token for row_tokens in data['tokens'] for token in row_tokens]

# Corpus-wide frequency table.
word_counts = Counter(all_words)

# Vocabulary ids: 0 and 1 are reserved for <PAD> and <UNK>; real words start
# at 2 and are numbered from most to least frequent.
vocab = {
    word: token_id
    for token_id, (word, _) in enumerate(word_counts.most_common(), start=2)
}
vocab['<PAD>'] = 0
vocab['<UNK>'] = 1


In [None]:
# Inspect the frame now that it carries the 'tokens' column.
data.head()

Unnamed: 0,Cleaned_Review,Sentiment,tokens
0,seat comfort manageable shorthaul flights,1,"[seat, comfort, manageable, shorthaul, flights]"
1,seats cramped lacked sufficient legroom making...,0,"[seats, cramped, lacked, sufficient, legroom, ..."
2,cabin crew seemed disinterested helpful flight,1,"[cabin, crew, seemed, disinterested, helpful, ..."
3,food beverage options extremely limited flight,1,"[food, beverage, options, extremely, limited, ..."
4,crew friendly assisted passengers efficiently ...,2,"[crew, friendly, assisted, passengers, efficie..."


In [None]:
# Distinct corpus words vs. vocabulary entries (the vocabulary adds <PAD> and <UNK>).
len(word_counts),len(vocab)

(179, 181)

In [None]:
def encode_word(tokens,vocab):
  """Translate a token sequence into vocabulary ids.

  Tokens missing from ``vocab`` are replaced by the id stored under '<UNK>'.
  Returns a new list with one id per input token.
  """
  ids = []
  for token in tokens:
    ids.append(vocab.get(token, vocab['<UNK>']))
  return ids
# Encode every token list with the shared vocabulary.
data['input_ids'] = data['tokens'].map(lambda row_tokens: encode_word(row_tokens, vocab))

In [None]:
# Spot-check the id encoding on the first two rows.
data.head(2)

Unnamed: 0,Cleaned_Review,Sentiment,tokens,input_ids
0,seat comfort manageable shorthaul flights,1,"[seat, comfort, manageable, shorthaul, flights]","[157, 158, 159, 160, 161]"
1,seats cramped lacked sufficient legroom making...,0,"[seats, cramped, lacked, sufficient, legroom, ...","[6, 162, 163, 46, 43, 19, 164, 165]"


In [None]:
import torch
from torch.nn.utils.rnn import pad_sequence
# Keep at most the first `max_len` ids of each review.
max_len = 8
# Ids are stored as torch.long tensors, one 1-D tensor per review.
data['tensor_inputs'] = data['input_ids'].map(
    lambda ids: torch.tensor(ids[:max_len], dtype=torch.long)
)

In [None]:
# Right-pad every review with id 0 (<PAD>) and stack them batch-first.
# pad_sequence pads to the longest sequence it is given, not to max_len itself.
sequences = data['tensor_inputs'].tolist()
padding = pad_sequence(sequences, batch_first=True, padding_value=0)

In [None]:
# Sanity check of the padded batch dimensions.
padding.shape

torch.Size([5000, 8])

In [None]:
# Class labels as an int64 tensor, as expected by CrossEntropyLoss targets.
labels = torch.tensor(data['Sentiment'], dtype=torch.long)

In [None]:
from sklearn.model_selection import train_test_split
# Hold out 20% of the reviews for validation; the seed keeps the split repeatable.
train_inputs, val_inputs, train_labels, val_labels = train_test_split(
    padding,
    labels,
    test_size=0.2,
    random_state=42,
)

In [None]:
from torch.utils.data import TensorDataset,DataLoader

In [None]:
# Pair inputs with labels for each split.
val_data = TensorDataset(val_inputs, val_labels)
train_data = TensorDataset(train_inputs, train_labels)

In [None]:
# Number of held-out validation labels.
val_labels.shape

torch.Size([1000])

In [None]:
# Inspect the frame with the encoded ids and per-row tensors attached.
data.head()

Unnamed: 0,Cleaned_Review,Sentiment,tokens,input_ids,tensor_inputs
0,seat comfort manageable shorthaul flights,1,"[seat, comfort, manageable, shorthaul, flights]","[157, 158, 159, 160, 161]","[tensor(157), tensor(158), tensor(159), tensor..."
1,seats cramped lacked sufficient legroom making...,0,"[seats, cramped, lacked, sufficient, legroom, ...","[6, 162, 163, 46, 43, 19, 164, 165]","[tensor(6), tensor(162), tensor(163), tensor(4..."
2,cabin crew seemed disinterested helpful flight,1,"[cabin, crew, seemed, disinterested, helpful, ...","[5, 8, 27, 84, 35, 2]","[tensor(5), tensor(8), tensor(27), tensor(84),..."
3,food beverage options extremely limited flight,1,"[food, beverage, options, extremely, limited, ...","[166, 167, 168, 169, 170, 2]","[tensor(166), tensor(167), tensor(168), tensor..."
4,crew friendly assisted passengers efficiently ...,2,"[crew, friendly, assisted, passengers, efficie...","[8, 122, 123, 124, 125, 36, 2]","[tensor(8), tensor(122), tensor(123), tensor(1..."


In [None]:
# data_set=data.drop(columns=['tensor_inputs'])

In [None]:
# data_set.head()

In [None]:
# Shuffled mini-batches of 32 for training. Despite its name, `train_dataset`
# is a DataLoader here.
train_dataset = DataLoader(
    train_data,
    batch_size=32,
    shuffle=True,
)

In [None]:

# for X_batch, y_batch in train_dataset:
#     print(X_batch.shape, y_batch.shape)

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim


class SequentialModel(nn.Module):
    """Three-layer fully connected classifier.

    Layout: input_size -> hidden_state -> hidden_state // 2 -> output_size,
    with a ReLU after each of the first two linear layers. The final layer
    returns raw scores with no softmax applied.
    """

    def __init__(self, input_size, hidden_state, output_size):
        super().__init__()
        half_hidden = hidden_state // 2
        self.fc1 = nn.Linear(input_size, hidden_state)
        self.relu = nn.ReLU()
        self.fc2 = nn.Linear(hidden_state, half_hidden)
        self.fc3 = nn.Linear(half_hidden, output_size)

    def forward(self, x):
        """Run ``x`` through both hidden layers and return the output scores."""
        hidden = self.relu(self.fc1(x))
        hidden = self.relu(self.fc2(hidden))
        return self.fc3(hidden)

# MLP over 8 input features (the padded id positions), 128 hidden units, 3 outputs.
model = SequentialModel(input_size=8, hidden_state=128, output_size=3)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 10

In [None]:
# Train the MLP. The loss printed after each epoch is that of the LAST batch only.
model.train()
for epoch in range(epochs):
    for i, (inputs, targets) in enumerate(train_dataset):
        inputs = inputs.float()    # nn.Linear needs floating-point input
        # Cast the batch targets. The original cast the unrelated global
        # `labels` tensor here and left `targets` untouched.
        targets = targets.long()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"{epoch}Batch [{i+1}], Loss: {loss.item():.4f}")


0Batch [125], Loss: 0.4390
1Batch [125], Loss: 0.1385
2Batch [125], Loss: 0.0862
3Batch [125], Loss: 0.0260
4Batch [125], Loss: 0.1988
5Batch [125], Loss: 0.1946
6Batch [125], Loss: 0.2573
7Batch [125], Loss: 0.1634
8Batch [125], Loss: 0.2148
9Batch [125], Loss: 0.1058


In [None]:
#  vocab_size=len(vocab),print(vocab_size)

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Evaluate the MLP on the held-out split.
model.eval()
val_inputs, val_outputs = val_data.tensors
val_inputs = val_inputs.float()
val_outputs = val_outputs.long()
with torch.no_grad():
    y_pred = torch.argmax(model(val_inputs), dim=1)
y_true = val_outputs.numpy()
y_pred = y_pred.numpy()

# Support-weighted averages across the classes.
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, average='weighted')
rec = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
print("Accuracy:", acc)
print("Precision:", prec)  # was computed but never reported
print("Recall:", rec)
print("F1 Score:", f1)

Accuracy: 0.889
Recall: 0.889
F1 Score: 0.8686018298948338


In [None]:
# Classify one already-cleaned sentence with the MLP.
text="seat comfort manageable shorthaul flights"
tokens=text.split()
input_ids=[vocab.get(word,vocab['<UNK>']) for word in tokens]
# The MLP takes exactly 8 features: pad short inputs AND truncate long ones
# (the original only padded, so a longer sentence would fail in the first layer).
input_ids=(input_ids+[vocab['<PAD>']]*8)[:8]
model=model.eval()
input_tensor=torch.tensor(input_ids,dtype=torch.long)
input_tensor=input_tensor.unsqueeze(0)   # add the batch dimension -> [1, 8]
input_tensor=input_tensor.float()        # nn.Linear needs floating-point input
with torch.no_grad():
  output=model(input_tensor)
  predicted_class=torch.argmax(output,dim=1).item()


In [None]:
# Predicted class id (0 = negative, 1 = neutral, 2 = positive).
print(predicted_class)

1


In [None]:
from torch.nn.utils.rnn import pad_sequence
from sklearn.model_selection import train_test_split
import torch

# Length of the longest encoded review; every sequence is padded up to it.
max_len = data['input_ids'].map(len).max()

# Right-pad each id list with <PAD> up to max_len.
padded_input_ids = data['input_ids'].apply(
    lambda ids: ids + [vocab['<PAD>']] * (max_len - len(ids))
)

# Integer id matrix and label vector.
X = torch.tensor(padded_input_ids.tolist())  # shape [num_reviews, max_len]
y = torch.tensor(data['Sentiment'].tolist())  # shape [num_reviews]
x_train, x_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

batch_size = 32
# NOTE(review): `train_dataset` was bound to a DataLoader earlier in the
# notebook; from here on it is a TensorDataset.
train_dataset = TensorDataset(x_train, y_train)
test_dataset = TensorDataset(x_test, y_test)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)

print("Padded input shape:",x_train.shape)
print("Labels shape:", y_test.shape)

Padded input shape: torch.Size([4000, 8])
Labels shape: torch.Size([1000])


In [None]:
#convert sentence to embedding (word2vec)
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip


--2025-07-23 14:44:49--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2025-07-23 14:44:49--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2025-07-23 14:44:49--  https://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


202

In [None]:
import numpy as np

embedding_dim = 50
# Defined here because the matrix below needs it; the original cell raised
# NameError because vocab_size was only assigned in a much later cell.
vocab_size = len(vocab)
embedding_matrix = np.zeros((vocab_size, embedding_dim))

glove_path = "/content/glove.6B.50d.txt"
glove = {}

# Each line of the GloVe text file is "<token> <v1> ... <v50>".
with open(glove_path, 'r', encoding='utf8') as f:
    for line in f:
        parts = line.strip().split()
        if not parts:
            continue  # tolerate blank lines
        word = parts[0]
        vector = np.array(parts[1:], dtype=np.float32)
        glove[word] = vector

for word, idx in vocab.items():
    if word == '<PAD>':
        # Leave the padding row all-zero rather than giving it random noise.
        continue
    if word in glove:
        embedding_matrix[idx] = glove[word]
    else:
        # Words without a GloVe vector (including <UNK>) get a random one.
        embedding_matrix[idx] = np.random.normal(scale=0.6, size=(embedding_dim,))


NameError: name 'vocab_size' is not defined

In [None]:
# Hand-written recurrent classifier that runs on top of the embedding matrix.
class GloveRNNClassifier(nn.Module):
    """Single-layer tanh RNN over frozen pre-trained embeddings.

    Args:
        vocab_size: number of rows requested for the embedding table.
        embedding_dim: width of each embedding vector.
        hidden_size: width of the recurrent state.
        output_size: number of output scores per sequence.
        embedding_matrix: array-like of pre-trained vectors; it replaces the
            embedding weights and is excluded from gradient updates.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_size, output_size, embedding_matrix):
        super().__init__()
        # Build the table, then swap in the pre-trained weights and freeze them.
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        pretrained = torch.tensor(embedding_matrix, dtype=torch.float32)
        self.embedding.weight = nn.Parameter(pretrained, requires_grad=False)

        self.hidden_size = hidden_size
        # Recurrent cell parameters: input-to-hidden, hidden-to-hidden, bias.
        self.wih = nn.Parameter(torch.randn(hidden_size, embedding_dim))
        self.whh = nn.Parameter(torch.randn(hidden_size, hidden_size))
        self.bih = nn.Parameter(torch.randn(hidden_size))

        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return scores of shape [batch, output_size] for token ids ``x`` of shape [batch, seq_len]."""
        embedded = self.embedding(x)  # [batch, seq_len, embedding_dim]
        batch_size, _, _ = embedded.size()
        state = torch.zeros(batch_size, self.hidden_size, device=embedded.device)

        # Walk the sequence one time step at a time, updating the hidden state.
        for step_input in embedded.unbind(dim=1):  # each: [batch, embedding_dim]
            state = torch.tanh(step_input @ self.wih.T + state @ self.whh.T + self.bih)

        # Only the final hidden state is classified.
        return self.fc(state)


hidden_size = 50
output_size = 3  # 0: Negative, 1: Neutral, 2: Positive
model = GloveRNNClassifier(
    vocab_size, embedding_dim, hidden_size, output_size, embedding_matrix
)
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=0.01)
epochs = 10

# The loss printed after each epoch is that of the last batch only.
for epoch in range(epochs):
    for i, (inputs, labels) in enumerate(train_loader):
        # Inputs stay integer token ids; only the targets are cast.
        labels = labels.long()
        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f"{epoch}Batch [{i+1}], Loss: {loss.item():.4f}")

In [None]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
# Evaluate the RNN on the held-out split.
model.eval()
with torch.no_grad():
    y_pred = torch.argmax(model(x_test), dim=1).numpy()

# Convert the labels only once: the original unconditional `.numpy()` call made
# this cell fail when re-run. y_test stays a NumPy array afterwards because the
# later LSTM evaluation cell reads it in that form.
if torch.is_tensor(y_test):
    y_test = y_test.numpy()

# Support-weighted averages across the classes.
acc = accuracy_score(y_test, y_pred)
prec = precision_score(y_test, y_pred, average='weighted')
rec = recall_score(y_test, y_pred, average='weighted')
f1 = f1_score(y_test, y_pred, average='weighted')
print("Accuracy:", acc)
print("Precision:", prec)  # was computed but never reported
print("Recall:", rec)
print("F1 Score:", f1)

using RNN

In [None]:
import torch

# --- Set your model and device ---
# Pick the GPU when one is visible, else the CPU. The model itself is not moved
# to this device here.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Switch the model to inference mode.
model.eval()

# --- Helper Function ---
def predict_sentiment(sentence, model, vocab, max_len=50):
    """Predict the sentiment label of one sentence.

    Args:
        sentence: raw text; it is lowercased and split on whitespace.
        model: module mapping a [1, max_len] tensor of token ids to [1, 3] scores.
        vocab: token -> id mapping containing '<UNK>' and '<PAD>'.
        max_len: length the id sequence is padded or truncated to. For
            predictions consistent with training, pass the sequence length the
            model was trained with.

    Returns:
        "Negative", "Neutral" or "Positive".
    """
    # Tokenize and encode; unknown tokens map to <UNK>.
    input_ids = [vocab.get(token, vocab['<UNK>']) for token in sentence.lower().split()]

    # Truncate, then right-pad with <PAD>, so the length is exactly max_len.
    input_ids = input_ids[:max_len]
    input_ids += [vocab['<PAD>']] * (max_len - len(input_ids))

    # Build the input on the device the model's weights live on. The original
    # used a notebook-global `device`, which mismatches a model left on the CPU
    # whenever CUDA is available.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")
    input_tensor = torch.tensor([input_ids], dtype=torch.long, device=target_device)  # [1, max_len]

    with torch.no_grad():
        output = model(input_tensor)  # [1, 3]
        predicted_label = torch.argmax(output, dim=1).item()

    label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
    return label_map[predicted_label]

# --- Example Usage ---
sentence = "seats lacked legroom  not comfortable and bad"
# Preprocess exactly like the training data (clean_text removes stop words and
# punctuation) and pad to the training sequence length instead of the helper's
# default of 50, so the RNN sees input shaped like what it was trained on.
predicted = predict_sentiment(clean_text(sentence), model, vocab, max_len=int(max_len))
print("Predicted Sentiment:", predicted)


In [None]:
# Number of vocabulary entries, including the <PAD> and <UNK> specials.
vocab_size = len(vocab)

In [None]:
!pip install gensim

In [None]:
import torch
import torch.nn as nn
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Step 1: Define LSTM Model
class LSTMSentimentClassifier(nn.Module):
    """LSTM classifier over frozen pre-trained embeddings.

    Args:
        embedding_dim: width of each embedding vector (LSTM input size).
        hidden_dim: width of the LSTM hidden state.
        output_size: number of output scores per sequence.
        pretrained_embeddings: optional 2-D array-like of shape
            [num_tokens, embedding_dim]. When omitted, the notebook-level
            ``embedding_matrix`` is used, so existing three-argument calls keep
            working. The table size is taken from the matrix itself, so the
            class no longer reads the global ``vocab_size``.

    Raises:
        ValueError: if the embedding matrix is not 2-D or its width differs
            from ``embedding_dim``.
    """

    def __init__(self, embedding_dim, hidden_dim, output_size, pretrained_embeddings=None):
        super().__init__()
        if pretrained_embeddings is None:
            pretrained_embeddings = embedding_matrix  # notebook-level default
        weights = torch.tensor(pretrained_embeddings, dtype=torch.float32)
        if weights.dim() != 2 or weights.size(1) != embedding_dim:
            raise ValueError(
                f"embedding matrix must have shape [num_tokens, {embedding_dim}], "
                f"got {tuple(weights.shape)}"
            )
        # Frozen lookup table initialised directly from the pre-trained vectors.
        self.embedding = nn.Embedding.from_pretrained(weights, freeze=True)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_size)

    def forward(self, x):
        """Return scores of shape [batch, output_size] for token ids ``x`` of shape [batch, seq_len]."""
        embedded = self.embedding(x)
        sequence_output, _ = self.lstm(embedded)
        # Classify from the LSTM output at the last time step.
        return self.fc(sequence_output[:, -1, :])
model_lstm=LSTMSentimentClassifier(embedding_dim,hidden_size,output_size)
criterion=nn.CrossEntropyLoss()
# Optimise the LSTM's own parameters. The original passed model.parameters()
# (the previously trained RNN), so the LSTM weights were never updated.
optimizer=optim.Adam(model_lstm.parameters(),lr=0.01)
epochs=10
model_lstm.train()
# `i` counts epochs here; the printed loss is that of the epoch's last batch.
for i in range(epochs):
    for inputs,labels in train_loader:
        outputs=model_lstm(inputs)
        loss=criterion(outputs,labels)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
    print(f"Batch [{i+1}], Loss: {loss.item():.4f}")



In [None]:
# Evaluate the LSTM on the held-out split. The original called model.eval(),
# which switched the RNN rather than the LSTM to inference mode.
model_lstm.eval()
with torch.no_grad():
    y_pred = torch.argmax(model_lstm(x_test), dim=1).numpy()

# Accept y_test as either a tensor or an already-converted NumPy array, so the
# cell does not depend on an earlier cell having converted it.
y_true = y_test.numpy() if torch.is_tensor(y_test) else y_test

# Support-weighted averages across the classes.
acc = accuracy_score(y_true, y_pred)
prec = precision_score(y_true, y_pred, average='weighted')
rec = recall_score(y_true, y_pred, average='weighted')
f1 = f1_score(y_true, y_pred, average='weighted')
print("Accuracy:", acc)
print("Precision:", prec)  # was computed but never reported
print("Recall:", rec)
print("F1 Score:", f1)

In [None]:
# Predict the sentiment of a single sentence with the LSTM.
sentence = "seat comfort manageable shorthaul flights"
tokens = sentence.lower().split()
input_ids = [vocab.get(token, vocab['<UNK>']) for token in tokens]

# Sequence length and target device for this prediction.
max_len = 8  # same length as the earlier padding
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Right-pad with <PAD> and cut to exactly max_len ids.
input_ids = (input_ids + [vocab['<PAD>']] * max_len)[:max_len]

input_tensor = torch.tensor([input_ids]).to(device)

# Model and input must sit on the same device; switch to inference mode.
model_lstm.to(device)
model_lstm.eval()

with torch.no_grad():
    output = model_lstm(input_tensor)
    predicted_label = torch.argmax(output, dim=1).item()

# Translate the class id into a readable label.
label_map = {0: "Negative", 1: "Neutral", 2: "Positive"}
predicted_sentiment = label_map[predicted_label]

print("Predicted Sentiment:", predicted_sentiment)