In [None]:
#importing necessary libraries
import pandas as pd
import numpy as np
import re
import string
import matplotlib.pyplot as plt
import seaborn as sns
from wordcloud import WordCloud
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

# Fetch the NLTK data used later in the notebook: the stop-word lists
# (read through nltk.corpus.stopwords) and WordNet (used by WordNetLemmatizer).
nltk.download('stopwords')
nltk.download('wordnet')

In [None]:
# Upload Tweets.csv from the local machine, but only when it is not already in
# the working directory, so re-running the notebook does not prompt again.
# Delete the local copy first if a different file should be uploaded.
import os

if not os.path.exists("Tweets.csv"):
    from google.colab import files
    # Bind the result instead of leaving the call as the cell's last
    # expression: upload() returns the uploaded data, which would otherwise be
    # echoed into the cell output.
    uploaded = files.upload()

In [None]:
# Load the dataset
# Parse the tweets CSV (looked up relative to the working directory).
df = pd.read_csv(filepath_or_buffer="Tweets.csv")
dataset_shape = df.shape
print("Dataset Shape:", dataset_shape)
# Preview the first five rows as the cell output.
df.head(n=5)

In [None]:
# Data preprocessing
# Keep only the tweet text and its label, exposing the label as 'sentiment'.
data = df[['text', 'airline_sentiment']].rename(
    columns={'airline_sentiment': 'sentiment'}
)

# Show how many tweets fall into each sentiment class.
print("\nSentiment Distribution:")
sentiment_counts = data['sentiment'].value_counts()
print(sentiment_counts)

# Text cleaning function
def clean_text(text):
    """Normalize a raw tweet for downstream tokenization.

    Steps, in order: lowercase; drop @mentions; drop the '#' symbol (the tag
    word itself is kept); drop tokens starting with 'http'; drop ASCII digits;
    drop punctuation, including underscores; collapse whitespace runs.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            ``None`` and float NaN are treated as empty text.

    Returns:
        The cleaned string with no leading/trailing whitespace (may be empty).
    """
    # Missing values would otherwise be stringified into the bogus tokens
    # "none" / "nan" and end up in the vocabulary.
    if text is None or (isinstance(text, float) and text != text):
        return ''

    text = str(text).lower()                    # Lowercase
    text = re.sub(r'@\w+', '', text)            # Remove mentions
    text = re.sub(r'#', '', text)               # Remove the '#' marker only
    text = re.sub(r'http\S+', '', text)         # Remove URLs
    text = re.sub(r'[0-9]', '', text)           # Remove digits
    # \w also matches '_', so it has to be listed explicitly for the
    # punctuation strip to cover it.
    text = re.sub(r'[^\w\s]|_', '', text)       # Remove punctuation
    # Removed tokens leave gaps behind; squeeze them to single spaces.
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Store the normalized form of every tweet in a new 'clean_text' column.
data['clean_text'] = data['text'].map(clean_text)

In [None]:
# Lemmatization & stopword removal
lemmatizer = WordNetLemmatizer()
# A set gives constant-time membership tests; the stop-word check runs once
# per token of every tweet, so scanning a list each time is needlessly slow.
# NOTE(review): clean_text strips punctuation before this stage, so any
# stop word containing an apostrophe (e.g. "don't") cannot match the cleaned
# token ("dont") — confirm whether those forms should be filtered as well.
stop_words = set(stopwords.words('english'))

def preprocess_text(text):
    """Drop stop words from whitespace-separated text and lemmatize the rest.

    Relies on the module-level ``stop_words`` collection and ``lemmatizer``.

    Args:
        text: A string of whitespace-separated tokens.

    Returns:
        The surviving tokens, lemmatized and joined by single spaces.
    """
    kept_tokens = []
    for token in text.split():
        if token in stop_words:
            continue
        kept_tokens.append(lemmatizer.lemmatize(token))
    return ' '.join(kept_tokens)

# Overwrite 'clean_text' with its stop-word-free, lemmatized version.
data['clean_text'] = data['clean_text'].map(preprocess_text)

In [None]:
# EDA
# Bar chart of the number of tweets in each sentiment class.
sns.countplot(data=data, x='sentiment', palette='viridis')
plt.title('Sentiment Distribution')
plt.show()

# One word cloud per sentiment class, built from that class's cleaned tweets.
for sentiment in data['sentiment'].unique():
    class_mask = data['sentiment'] == sentiment
    class_corpus = ' '.join(data.loc[class_mask, 'clean_text'])
    cloud = WordCloud(width=800, height=400, background_color='white')
    wc = cloud.generate(class_corpus)

    plt.imshow(wc, interpolation='bilinear')
    plt.axis('off')
    plt.title(f"WordCloud for {sentiment.capitalize()} Sentiment")
    plt.show()

In [None]:
# Feature extraction
y = data['sentiment']

# Train-test split
# Split the cleaned text *before* vectorizing so the TF-IDF vocabulary and IDF
# weights are learned from the training tweets only. Fitting the vectorizer on
# the full corpus leaks test-set statistics into the features and can inflate
# the reported scores. The split parameters are unchanged.
text_train, text_test, y_train, y_test = train_test_split(
    data['clean_text'], y, test_size=0.2, random_state=42
)

tfidf = TfidfVectorizer(max_features=5000)
# Keep the matrices sparse: a dense tweets-by-5000 float array costs hundreds
# of megabytes while almost every entry is zero, and LogisticRegression
# accepts sparse input directly.
X_train = tfidf.fit_transform(text_train)
X_test = tfidf.transform(text_test)

# Features for the whole corpus, kept under the original name for any code
# that still refers to `X`.
X = tfidf.transform(data['clean_text'])


In [None]:
# Model training
# Fit a logistic-regression classifier (at most 200 solver iterations) on the
# training features, then label the held-out test set.
model = LogisticRegression(max_iter=200).fit(X_train, y_train)
y_pred = model.predict(X_test)

In [None]:
# Evaluation
print("Model Accuracy:", accuracy_score(y_test, y_pred))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
# Pin the row/column order to the classifier's own class list and print those
# names on the axes; without tick labels the heatmap only shows 0/1/2 and the
# reader has to guess which class each row and column stands for.
class_labels = model.classes_
plt.figure(figsize=(6,4))
sns.heatmap(
    confusion_matrix(y_test, y_pred, labels=class_labels),
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title("Confusion Matrix")
plt.xlabel("Predicted")
plt.ylabel("Actual")
plt.show()

In [None]:
#predict new sentences
def predict_sentiment(text):
    """Predict the sentiment label for one raw piece of text.

    The text is passed through clean_text and preprocess_text (the same two
    steps applied to the dataset), vectorized with the fitted module-level
    ``tfidf``, and classified by the module-level ``model``.

    Args:
        text: Raw input text.

    Returns:
        The first (only) element of the model's prediction for ``text``.
    """
    normalized = preprocess_text(clean_text(text))
    features = tfidf.transform([normalized]).toarray()
    return model.predict(features)[0]

# Test examples: hand-written sentences to run through predict_sentiment
sample_texts = [
    "I love this new flight experience!",
    "The service was terrible and the staff was rude.",
    "It was an average experience, nothing special.",
]

# Print each sentence next to the label predicted for it.
for tweet in sample_texts:
    predicted = predict_sentiment(tweet)
    print(f"Tweet: {tweet}\nPredicted Sentiment: {predicted}\n")

In [None]:
# Persist the trained classifier and the fitted vectorizer to the working
# directory so they can be reloaded together later.
import joblib

model_file = 'sentiment_model.pkl'
vectorizer_file = 'tfidf_vectorizer.pkl'
joblib.dump(model, model_file)
joblib.dump(tfidf, vectorizer_file)

In [None]:
from google.colab import files

# Hand both saved artifacts to the Colab download helper, model first.
for artifact in ('sentiment_model.pkl', 'tfidf_vectorizer.pkl'):
    files.download(artifact)

In [None]:
# Model performance metrics
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
import seaborn as sns
import matplotlib.pyplot as plt

# Predictions on the held-out test set
y_pred = model.predict(X_test)

# Metrics: precision, recall and F1 use the 'weighted' average across classes.
print("Model Evaluation Metrics:")
print("--------------------------")
print("Accuracy :", accuracy_score(y_test, y_pred))
print("Precision:", precision_score(y_test, y_pred, average='weighted'))
print("Recall   :", recall_score(y_test, y_pred, average='weighted'))
print("F1 Score :", f1_score(y_test, y_pred, average='weighted'))
print("\nClassification Report:\n", classification_report(y_test, y_pred))

# Confusion matrix
# Pin the row/column order to the classifier's own class list and print those
# names on the axes; without tick labels the heatmap only shows 0/1/2 and the
# reader has to guess which class each row and column stands for.
class_labels = model.classes_
cm = confusion_matrix(y_test, y_pred, labels=class_labels)
plt.figure(figsize=(6,4))
sns.heatmap(
    cm,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.title('Confusion Matrix')
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.show()