In [1]:
!pip install transformers datasets torch


^C


In [2]:
!pip install transformers datasets torch nltk seaborn matplotlib wordcloud scikit-learn pandas numpy


^C


In [3]:
import os
import re
import warnings
warnings.filterwarnings("ignore")
from transformers import logging
logging.set_verbosity_error()

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
import nltk
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from datasets import Dataset
import transformers as hf_transformers

# Download stopwords
nltk.download('stopwords')
from nltk.corpus import stopwords

# 1. Load Dataset
# The CSV location can be overridden with the TWEETS_CSV environment variable so
# the notebook is not tied to one machine; the original local path stays the default.
DATA_PATH = os.environ.get("TWEETS_CSV", "C:/Users/Dell/Downloads/Tweets.csv")
df = pd.read_csv(DATA_PATH)

# Fail fast if a column used later in the notebook is absent.
missing_columns = {"text", "airline_sentiment", "tweet_created"} - set(df.columns)
assert not missing_columns, f"Tweets CSV is missing expected columns: {sorted(missing_columns)}"

print("✅ Dataset Shape:", df.shape)
print("✅ Columns:", df.columns)
print(df.head())

# 2. Basic Data Info
# Tally tweets per sentiment class once; the same tally feeds the printout and the chart.
counts = df['airline_sentiment'].value_counts()
print("\nSentiment Distribution:")
print(counts)

# Bar chart of the class tally
sentiment_ax = counts.plot.bar(figsize=(8, 5))
sentiment_ax.set_title("Sentiment Distribution")
sentiment_ax.set_xlabel("Sentiment")
sentiment_ax.set_ylabel("Count")
plt.show()

# 3. Preprocessing Function
# English stop words from NLTK, held in a set for the membership test in clean_tweet.
stop_words = {*stopwords.words('english')}

def clean_tweet(text, stopword_set=None):
    """Normalise a raw tweet into a lowercase, letters-only, stop-word-free string.

    Steps, in order: lowercase, remove URLs, replace @mentions with a
    placeholder, remove '#', remove every character that is not a-z or
    whitespace, then drop stop words and collapse runs of whitespace.

    Args:
        text: Raw tweet. Non-string values are converted with ``str``. ``None``
            and float NaN (how pandas represents a missing cell) are treated as
            empty text instead of turning into the literal tokens "none"/"nan".
        stopword_set: Optional collection of lowercase words to drop. When
            omitted, the module-level ``stop_words`` is used.

    Returns:
        The cleaned tweet as a single space-separated string; an empty string
        when the input is missing or nothing survives the cleaning.
    """
    # `text != text` is only true for NaN.
    if text is None or (isinstance(text, float) and text != text):
        return ""
    if stopword_set is None:
        stopword_set = stop_words

    text = str(text).lower()
    text = re.sub(r"http\S+", "", text)
    # The '@' of the placeholder is stripped by the letters-only filter below,
    # so every mention ends up as the plain token "user".
    text = re.sub(r"@\w+", "@user", text)
    text = re.sub(r"#", "", text)
    text = re.sub(r"[^a-z\s]", "", text)
    kept_words = [word for word in text.split() if word not in stopword_set]
    return " ".join(kept_words)

# Store the cleaned version of every tweet next to the raw text
df['clean_text'] = df['text'].map(clean_tweet)

# 4. WordCloud Visualization
# One cloud per sentiment class, built from all cleaned tweets carrying that label
for sentiment in df['airline_sentiment'].unique():
    in_class = df['airline_sentiment'] == sentiment
    words = " ".join(df.loc[in_class, 'clean_text'])
    if not words.strip():
        # No words left for this class after cleaning; skip it
        continue
    wc = WordCloud(background_color='white', width=600, height=400)
    wc.generate(words)
    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud - {sentiment}")
    plt.show()

# 5. Train-Test Split
# 80/20 hold-out, stratified on the sentiment label, with a fixed seed
X, y = df['clean_text'], df['airline_sentiment']
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y,
)

# 6. Baseline Model (TF-IDF + Logistic Regression)
# Unigram + bigram TF-IDF features, capped at 5000 terms; fitted on the training tweets only
vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=5000)
X_train_vec = vectorizer.fit_transform(X_train)
X_test_vec = vectorizer.transform(X_test)

# fit() returns the estimator itself, so clf is the fitted model
clf = LogisticRegression(max_iter=1000).fit(X_train_vec, y_train)
y_pred = clf.predict(X_test_vec)

print("\n✅ =========== Logistic Regression Results ==========")
print(classification_report(y_test, y_pred))

# Confusion matrix heatmap with the classifier's class names on both axes
cm = confusion_matrix(y_test, y_pred)
heatmap_ax = sns.heatmap(
    cm,
    annot=True,
    fmt="d",
    cmap="Blues",
    xticklabels=clf.classes_,
    yticklabels=clf.classes_,
)
heatmap_ax.set_xlabel("Predicted")
heatmap_ax.set_ylabel("True")
heatmap_ax.set_title("Confusion Matrix - Logistic Regression")
plt.show()

# 7. Transformer Model (DistilBERT)
model_name = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Hugging Face dataset holding only the cleaned text and its sentiment label
dataset = Dataset.from_pandas(df.loc[:, ['clean_text', 'airline_sentiment']])

# Sentiment names in sorted order define the integer class ids (and the reverse lookup)
unique_labels = sorted(df['airline_sentiment'].unique())
label2id = dict(zip(unique_labels, range(len(unique_labels))))
id2label = dict(enumerate(unique_labels))

def map_labels_batch(batch):
    """Convert a batch of sentiment names into integer class ids.

    Args:
        batch: Mapping with an 'airline_sentiment' entry holding the sentiment
            name of each example in the batch.

    Returns:
        A dict with a single 'label' list: the ``label2id`` id of every
        sentiment, in the same order as the input.
    """
    encoded = []
    for sentiment_name in batch['airline_sentiment']:
        encoded.append(label2id[sentiment_name])
    return {'label': encoded}

# Add the integer 'label' column and drop the string sentiment column it replaces
dataset = dataset.map(
    map_labels_batch,
    remove_columns=['airline_sentiment'],
    batched=True,
)

def tokenize(batch):
    """Tokenise the 'clean_text' entries of a batch with the module-level tokenizer.

    Every sequence is truncated and padded to exactly 64 tokens.

    Args:
        batch: Mapping with a 'clean_text' entry holding the texts to encode.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    encoding_options = dict(truncation=True, padding='max_length', max_length=64)
    return tokenizer(batch['clean_text'], **encoding_options)

dataset = dataset.map(tokenize, batched=True)

# Reuse the stratified split from step 5 instead of drawing a second, unstratified
# one: the baseline and DistilBERT are then trained and scored on exactly the same
# tweets, so their metrics are directly comparable.
# `dataset` was built from `df` row by row, so positions in `df` are positions in `dataset`.
train_rows = df.index.get_indexer(X_train.index).tolist()
test_rows = df.index.get_indexer(X_test.index).tolist()

# Kept as a mapping with 'train' / 'test' keys, like the result of train_test_split.
train_test = {'train': dataset.select(train_rows), 'test': dataset.select(test_rows)}
train_ds, test_ds = train_test['train'], train_test['test']

# Sequence-classification model loaded from the same checkpoint as the tokenizer,
# configured with one output per sentiment class and the id <-> name lookups
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    label2id=label2id,
    id2label=id2label,
    num_labels=len(label2id),
)

training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir="./results",
    logging_dir='./logs',
    logging_steps=50,
    save_total_limit=2,
    # What to run
    do_train=True,
    do_eval=True,
    # Optimisation settings
    num_train_epochs=2,
    learning_rate=2e-5,
    weight_decay=0.01,
    # Batch sizes and data loading
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    dataloader_pin_memory=False,  # NOTE(review): presumably off for CPU-only runs — confirm
)

def compute_metrics(eval_pred):
    """Compute accuracy and macro-averaged F1 from an evaluation result.

    Args:
        eval_pred: Pair ``(logits, labels)``; the predicted class of each
            example is the argmax of its logits along the last axis.

    Returns:
        Dict with keys 'accuracy' and 'f1' (macro average).
    """
    from sklearn.metrics import accuracy_score, f1_score

    scores, gold = eval_pred
    predicted = np.argmax(scores, axis=-1)

    results = {}
    results['accuracy'] = accuracy_score(gold, predicted)
    results['f1'] = f1_score(gold, predicted, average='macro')
    return results

# Bundle the model, its data and the metric callback into a Trainer
trainer = Trainer(
    model=model,
    tokenizer=tokenizer,
    args=training_args,
    compute_metrics=compute_metrics,
    train_dataset=train_ds,
    eval_dataset=test_ds,
)

# Train (can take time on CPU)
# Fine-tunes `model` on train_ds using the settings in training_args.
trainer.train()

# The header is printed before evaluate() runs, so any output produced during
# evaluation appears between the header and the metrics dict.
print("\n=== Transformer (BERT) Evaluation ===")
# Evaluates on the eval_dataset given to the Trainer (test_ds); the result
# includes the values returned by compute_metrics.
metrics = trainer.evaluate()
print(metrics)

# 8. Sentiment Trend Over Time
# Parse the creation timestamps; values that cannot be parsed are coerced rather than raising
df['date'] = pd.to_datetime(df['tweet_created'], errors='coerce')

# Tweets per calendar day and sentiment: days as rows, one column per sentiment,
# with 0 where a day has no tweets of a given sentiment
tweet_day = df['date'].dt.date
trend = (
    df.groupby([tweet_day, 'airline_sentiment'])
    .size()
    .unstack()
    .fillna(0)
)

trend_ax = trend.plot(figsize=(12, 6))
trend_ax.set_title("Sentiment Trend Over Time")
trend_ax.set_xlabel("Date")
trend_ax.set_ylabel("Tweet Count")
plt.show()


ModuleNotFoundError: No module named 'transformers'