In [None]:
#Aaditya Varshney, B225
#Ram Srivastava, B224

In [None]:
import pandas as pd
import sqlite3
import re
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import classification_report
from sklearn.ensemble import VotingClassifier

# Load CSV file
df_csv=pd.read_csv("Tweets.csv")

# Load SQLite database
conn=sqlite3.connect("database.sqlite")
df_sqlite=pd.read_sql("SELECT * FROM Tweets;",conn)

# Combine datasets and remove duplicates
df_combined=pd.concat([df_csv,df_sqlite]).drop_duplicates().reset_index(drop=True)

# Keep only relevant columns
df_combined=df_combined[['text','airline_sentiment']].dropna()

# Preprocessing function
def preprocess_text(text):
    text=text.lower()
    text=re.sub(r'http\S+','',text)  # Remove URLs
    text=re.sub(r'@\w+', '',text)  # Remove mentions
    text=re.sub(r'[^a-z\s]','',text)  # Remove special characters
    return text

df_combined['cleaned_text']=df_combined['text'].astype(str).apply(preprocess_text)

# Encode sentiment labels
label_encoder=LabelEncoder()
df_combined['sentiment_encoded']=label_encoder.fit_transform(df_combined['airline_sentiment'])

# Split dataset
X_train,X_test,y_train,y_test=train_test_split(
    df_combined['cleaned_text'],df_combined['sentiment_encoded'],test_size=0.2,random_state=42
)

# Vectorize text using TF-IDF
vectorizer=TfidfVectorizer(max_features=7000)
X_train_tfidf=vectorizer.fit_transform(X_train)
X_test_tfidf=vectorizer.transform(X_test)

# Define models
rf_model=RandomForestClassifier(n_estimators=300,max_depth=50,random_state=42)
xgb_model=XGBClassifier(n_estimators=300,learning_rate=0.1,max_depth=6,subsample=0.8,colsample_bytree=0.8,use_label_encoder=False,eval_metric='mlogloss')

# Ensemble Voting Classifier
ensemble_model=VotingClassifier(estimators=[
    ('random_forest',rf_model),
    ('xgboost',xgb_model)
], voting='soft')

# Train ensemble model
ensemble_model.fit(X_train_tfidf,y_train)

# Evaluate model
y_pred=ensemble_model.predict(X_test_tfidf)
report=classification_report(y_test,y_pred,target_names=label_encoder.classes_)
print(report)

Parameters: { "use_label_encoder" } are not used.



              precision    recall  f1-score   support

    negative       0.84      0.98      0.90      3697
     neutral       0.85      0.57      0.68      1204
    positive       0.89      0.69      0.78       917

    accuracy                           0.85      5818
   macro avg       0.86      0.75      0.79      5818
weighted avg       0.85      0.85      0.84      5818



In [None]:
import pandas as pd
import sqlite3
import re
import numpy as np
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, Dropout, SpatialDropout1D
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# Load CSV file
df_csv = pd.read_csv("Tweets.csv")

# Load SQLite database
conn = sqlite3.connect("database.sqlite")
df_sqlite = pd.read_sql("SELECT * FROM Tweets;", conn)

# Combine datasets and remove duplicates
df_combined = pd.concat([df_csv, df_sqlite]).drop_duplicates().reset_index(drop=True)

# Keep only relevant columns
df_combined = df_combined[['text', 'airline_sentiment']].dropna()

# Preprocessing function
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    text = re.sub(r'[^a-z\s]', '', text)  # Remove special characters
    return text

df_combined['cleaned_text'] = df_combined['text'].astype(str).apply(preprocess_text)

# Encode sentiment labels
label_encoder = LabelEncoder()
df_combined['sentiment_encoded'] = label_encoder.fit_transform(df_combined['airline_sentiment'])

# Tokenization
max_words = 10000  # Vocabulary size
max_length = 100  # Max tweet length
tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(df_combined['cleaned_text'])
sequences = tokenizer.texts_to_sequences(df_combined['cleaned_text'])
padded_sequences = pad_sequences(sequences, maxlen=max_length, padding='post')

# Split dataset
X_train, X_test, y_train, y_test = train_test_split(padded_sequences, df_combined['sentiment_encoded'], test_size=0.2, random_state=42)

# Build LSTM model
model = Sequential([
    Embedding(input_dim=max_words, output_dim=128, input_length=max_length),
    SpatialDropout1D(0.2),
    Bidirectional(LSTM(64, return_sequences=True)),
    Bidirectional(LSTM(32)),
    Dropout(0.3),
    Dense(32, activation='relu'),
    Dense(3, activation='softmax')  # 3 sentiment classes
])

# Compile model
model.compile(loss='sparse_categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

# Train model
model.fit(X_train, y_train, epochs=5, batch_size=64, validation_data=(X_test, y_test))

# Evaluate model
loss, accuracy = model.evaluate(X_test, y_test)
print(f"Test Loss: {loss}, Test Accuracy: {accuracy}")

Epoch 1/5




[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m159s[0m 413ms/step - accuracy: 0.6854 - loss: 0.7539 - val_accuracy: 0.8340 - val_loss: 0.4319
Epoch 2/5
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m200s[0m 407ms/step - accuracy: 0.8636 - loss: 0.3763 - val_accuracy: 0.8771 - val_loss: 0.3565
Epoch 3/5
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m201s[0m 404ms/step - accuracy: 0.9146 - loss: 0.2472 - val_accuracy: 0.8964 - val_loss: 0.3247
Epoch 4/5
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m147s[0m 404ms/step - accuracy: 0.9448 - loss: 0.1746 - val_accuracy: 0.9103 - val_loss: 0.3135
Epoch 5/5
[1m364/364[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m204s[0m 409ms/step - accuracy: 0.9564 - loss: 0.1331 - val_accuracy: 0.9144 - val_loss: 0.3089
[1m182/182[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m11s[0m 59ms/step - accuracy: 0.9132 - loss: 0.3083
Test Loss: 0.3088552951812744, Test Accuracy: 0.9144035577774048
