# Twitter Sentiment Analysis using EDA and Deep Learning
# Author: [Your Name]
# Run this notebook in Google Colab with GPU runtime for best performance.


## Step 1: Install and Import Libraries
!pip install -q wordcloud tensorflow nltk emoji seaborn matplotlib sklearn

In [None]:
import re, string, numpy as np, pandas as pd, seaborn as sns, matplotlib.pyplot as plt
from wordcloud import WordCloud
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Bidirectional, LSTM, Dense, Dropout


## Step 2: Load Dataset
# Mount Google Drive inside the Colab VM so files stored there can be read
# through the local path /content/drive.
from google.colab import drive
drive.mount('/content/drive')
# CSV location on the mounted drive; edit this to match where the file lives.
DATA_PATH = '/content/drive/MyDrive/twitter_airline_sentiment.csv'  # Update path
# Load the whole CSV into a DataFrame; later steps read its 'text' and
# 'airline_sentiment' columns.
df = pd.read_csv(DATA_PATH)
# Preview the first rows (only rendered when this is the last expression of a cell).
df.head()

## Step 3: EDA and Preprocessing
# Remove exact duplicate rows in place before computing any statistics.
df.drop_duplicates(inplace=True)

# Character count of each tweet; values are cast to str first so that
# non-string cells still yield a length.
df['text_length'] = df['text'].astype(str).str.len()

# Bar chart of how many tweets fall into each sentiment class.
sentiment_ax = sns.countplot(data=df, x='airline_sentiment')
sentiment_ax.set_title('Sentiment Distribution')
plt.show()

## Text cleaning
# Download every NLTK resource that clean_text relies on:
#   - 'stopwords'          : English stop-word list
#   - 'punkt', 'punkt_tab' : tokenizer models for nltk.word_tokenize (newer
#                            NLTK releases look up 'punkt_tab', older ones 'punkt')
#   - 'wordnet', 'omw-1.4' : data used by WordNetLemmatizer
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet', 'omw-1.4'):
    nltk.download(resource)

# Built once here so clean_text does O(1) membership tests and reuses a single
# lemmatizer instance for every tweet.
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def clean_text(text):
    """Normalise a raw tweet into a space-joined string of lemmatised tokens.

    Steps, in order: lowercase; remove URLs beginning with ``http``; remove
    ``@mentions`` and ``#hashtags`` (the whole tag, not just the symbol);
    remove every character that is not ``a-z`` or whitespace; tokenise with
    ``nltk.word_tokenize``; drop tokens found in ``stop_words``; lemmatise the
    remaining tokens with ``lemmatizer``.

    Args:
        text: The raw tweet. Missing values (``None`` or float NaN, as produced
            by ``pd.read_csv`` for empty cells) are treated as empty text; any
            other non-string value is converted with ``str()``.

    Returns:
        The cleaned tokens joined by single spaces (``''`` if nothing is left).
    """
    # Guard against missing / non-string cells so that .lower() cannot raise.
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text).lower()

    text = re.sub(r'http\S+', '', text)          # URLs
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)   # user mentions
    text = re.sub(r'#[A-Za-z0-9_]+', '', text)   # hashtags
    text = re.sub(r'[^a-z\s]', '', text)         # digits, punctuation, emoji, ...

    tokens = nltk.word_tokenize(text)
    # Stop words are filtered before lemmatising, matching on the raw token.
    tokens = [lemmatizer.lemmatize(w) for w in tokens if w not in stop_words]
    return ' '.join(tokens)
# Run the cleaner over every tweet and keep the result in a new column.
raw_tweets = df['text']
df['clean_text'] = raw_tweets.map(clean_text)

## Step 4: Tokenization and Train-Test Split
# Encode the string sentiment labels as integer class ids.
le = LabelEncoder()
df['label'] = le.fit_transform(df['airline_sentiment'])
num_classes = len(le.classes_)

# Hold out 20% for testing. Stratifying on the label keeps the class
# proportions the same in both splits, which matters when classes are
# imbalanced.
X_train, X_test, y_train, y_test = train_test_split(
    df['clean_text'],
    df['label'],
    test_size=0.2,
    random_state=42,
    stratify=df['label'],
)

# Fit the vocabulary on the training texts only so no test-set words leak in;
# words outside the 20000 most frequent map to the '<OOV>' token.
tokenizer = Tokenizer(num_words=20000, oov_token='<OOV>')
tokenizer.fit_on_texts(X_train)

# Convert texts to integer sequences padded/truncated to a fixed length.
MAXLEN = 50
X_train_seq = pad_sequences(tokenizer.texts_to_sequences(X_train), maxlen=MAXLEN)
X_test_seq = pad_sequences(tokenizer.texts_to_sequences(X_test), maxlen=MAXLEN)

# One-hot encode the labels. num_classes is passed explicitly so train and
# test always have the same width, even if a class is absent from one split.
y_train = tf.keras.utils.to_categorical(y_train, num_classes=num_classes)
y_test = tf.keras.utils.to_categorical(y_test, num_classes=num_classes)

## Step 5: Build and Train Model
# Width of the one-hot label matrix; the output layer is sized from it so the
# model always matches the targets instead of relying on a hard-coded 3.
num_classes = y_train.shape[1]

model = Sequential([
    # 20000 matches the Tokenizer's num_words. The deprecated `input_length`
    # argument is omitted; the sequence length is taken from the input data.
    Embedding(20000, 128),
    # NOTE(review): per the Keras LSTM docs the fast cuDNN kernel requires
    # recurrent_dropout == 0, so this setting likely slows GPU training;
    # confirm the regularisation is worth the cost.
    Bidirectional(LSTM(64, dropout=0.2, recurrent_dropout=0.2)),
    Dense(64, activation='relu'),
    Dropout(0.3),
    Dense(num_classes, activation='softmax'),
])
# categorical_crossentropy pairs with the one-hot targets and softmax output.
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])
# The last 20% of the training arrays is held out for per-epoch validation.
history = model.fit(X_train_seq, y_train, validation_split=0.2, epochs=5, batch_size=32)

## Step 6: Evaluation
# Training vs. validation accuracy per epoch.
plt.plot(history.history['accuracy'], label='train')
plt.plot(history.history['val_accuracy'], label='val')
plt.legend()
plt.title('Accuracy vs Epochs')
plt.show()

# Collapse softmax outputs and one-hot targets back to integer class ids.
y_pred = np.argmax(model.predict(X_test_seq), axis=1)
y_true = np.argmax(y_test, axis=1)

# Pass the full list of class ids explicitly so the report and the matrix
# always cover every class (and line up with le.classes_) even if a class
# never appears in y_true or y_pred.
class_ids = list(range(len(le.classes_)))
print(classification_report(y_true, y_pred, labels=class_ids, target_names=le.classes_))

cm = confusion_matrix(y_true, y_pred, labels=class_ids)
# Label the axes with class names rather than bare integer ids.
sns.heatmap(cm, annot=True, fmt='d', cmap='Blues',
            xticklabels=le.classes_, yticklabels=le.classes_)
plt.xlabel('Predicted')
plt.ylabel('True')
plt.show()