In [None]:
# Install necessary libraries
!pip install transformers torch pandas matplotlib seaborn scikit-learn Flask wordcloud


In [None]:
# Import libraries
import inspect
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from flask import Flask, request, jsonify
from sklearn.metrics import classification_report, accuracy_score, f1_score
from sklearn.model_selection import train_test_split
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from wordcloud import WordCloud


In [None]:
# Load the dataset (both CSV files are expected in the working directory)
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

# Initial exploration
print("Train Data Head:")
print(train_data.head())
print("\nTrain Data Info:")
# DataFrame.info() prints its summary itself and returns None, so it must not
# be wrapped in print() (that emitted a stray "None" line after the summary).
train_data.info()

# Check for missing values
print("\nMissing Values in Train Data:")
print(train_data.isnull().sum())


In [None]:
# Bar chart of how often each value of the label column occurs.
# NOTE(review): the column name reads like "who/what the emotion is directed
# at" rather than the emotion itself -- confirm it matches the chart title.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(x='emotion_in_tweet_is_directed_at', data=train_data, ax=ax)
ax.set_title('Distribution of Emotions')
plt.xticks(rotation=45)  # slant the category names so they do not overlap
plt.show()


In [None]:
# Character count of every tweet (values are stringified first, so non-string
# entries are measured by the length of their string form).
train_data['text_length'] = train_data['tweet_text'].map(lambda tweet: len(str(tweet)))

# Histogram (30 bins) of the tweet lengths with a KDE curve on top
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train_data['text_length'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Tweet Lengths')
ax.set_xlabel('Tweet Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Build one long string out of every tweet and render it as a word cloud
all_text = ' '.join(str(tweet) for tweet in train_data['tweet_text'])
cloud_builder = WordCloud(width=800, height=400, background_color='white')
wordcloud = cloud_builder.generate(all_text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the axes carry no information for an image
ax.set_title("Most Common Words in Tweets")
plt.show()


In [None]:
# Define text cleaning function
def clean_text(text):
    """Normalize a raw tweet into lowercase alphanumeric words.

    Steps: drop URLs and @mentions, turn every whitespace character into a
    plain space, remove all characters other than ASCII letters, digits and
    spaces, collapse repeated spaces, trim, and lowercase.

    Args:
        text: The raw tweet. Non-string values are converted with ``str``;
            missing values (``None``, NaN, ``pd.NA``) are treated as empty.

    Returns:
        str: The cleaned text, possibly empty.
    """
    if not isinstance(text, str):
        # A missing value must not be turned into the literal word "nan"/"none".
        if pd.api.types.is_scalar(text) and pd.isna(text):
            return ''
        text = str(text)
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    # Newlines/tabs would be deleted by the next substitution and fuse the
    # neighbouring words together, so convert them to spaces first.
    text = re.sub(r'\s', ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # Remove special characters
    text = re.sub(r' +', ' ', text).strip()  # Tidy gaps left by the removals
    return text.lower()

# Store a cleaned copy of every tweet alongside the raw text
train_data['cleaned_text'] = train_data['tweet_text'].map(clean_text)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode target labels
label_encoder = LabelEncoder()
# Rows with no label arrive as float NaN mixed in with the string labels; give
# them an explicit string class so every entry of label_encoder.classes_ is a
# str (mixed str/float input is rejected by some scikit-learn versions).
target_labels = train_data['emotion_in_tweet_is_directed_at'].fillna('unknown').astype(str)
train_data['target'] = label_encoder.fit_transform(target_labels)

# Split data into train and validation sets (80/20, fixed seed for repeatability)
X_train, X_val, y_train, y_val = train_test_split(
    train_data['cleaned_text'], train_data['target'], test_size=0.2, random_state=42)


In [None]:
# Pretrained checkpoint used for both the tokenizer and the classifier.
# The classification head is sized to the number of encoded label classes.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSequenceClassification.from_pretrained(
    model_name,
    num_labels=len(label_encoder.classes_),
)


In [None]:
# Tokenize both splits: pad within each split and cut anything past 128 tokens
train_encodings = tokenizer(
    [text for text in X_train],
    max_length=128,
    truncation=True,
    padding=True,
)
val_encodings = tokenizer(
    [text for text in X_val],
    max_length=128,
    truncation=True,
    padding=True,
)


In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset that pairs tokenizer encodings with labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable
            collection (anything exposing ``.items()``).
        labels: Indexable collection of labels, one per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        # The label collection defines how many samples exist.
        return len(self.labels)

    def __getitem__(self, idx):
        # One tensor per encoding field, plus the label under the 'labels' key.
        sample = {}
        for field, rows in self.encodings.items():
            sample[field] = torch.tensor(rows[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels for the Trainer; labels are converted
# to plain lists so they are indexed by position.
train_dataset = SentimentDataset(encodings=train_encodings, labels=list(y_train))
val_dataset = SentimentDataset(encodings=val_encodings, labels=list(y_val))


In [None]:
# The per-epoch evaluation keyword was renamed from `evaluation_strategy` to
# `eval_strategy` in newer transformers releases. The install cell does not pin
# a version, so use whichever spelling the installed TrainingArguments accepts.
_training_arg_names = inspect.signature(TrainingArguments.__init__).parameters
_eval_strategy_key = (
    'eval_strategy' if 'eval_strategy' in _training_arg_names else 'evaluation_strategy'
)

training_args = TrainingArguments(
    output_dir='./results',
    logging_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=3,  # Increased epochs for better learning
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none",  # Disable WandB logging
    **{_eval_strategy_key: 'epoch'},  # evaluate on the validation set every epoch
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [None]:
# Train the model: runs the Trainer configured above on train_dataset.
# Left as the cell's last bare expression so the notebook displays the value
# returned by train().
trainer.train()


In [None]:
# Make predictions on validation set
val_preds = trainer.predict(val_dataset)
val_preds_labels = np.argmax(val_preds.predictions, axis=1)

# Convert label_encoder.classes_ to strings
target_names = [str(class_name) for class_name in label_encoder.classes_]
# Every encoded class id, in the same order as target_names. Passing these
# explicitly keeps the report working when the validation split (or the
# predictions) happen not to contain every class; without `labels`,
# classification_report rejects a target_names list of a different length.
all_label_ids = np.arange(len(target_names))

# Print evaluation metrics with updated target names
print("Classification Report:")
print(classification_report(
    y_val,
    val_preds_labels,
    labels=all_label_ids,
    target_names=target_names,
    zero_division=0,  # classes with no samples/predictions score 0 without warnings
))


In [None]:
# Function to predict emotion and sentiment for a single text input
def predict_emotion_and_sentiment(text):
    """Predict the label of one text and map it to a coarse sentiment.

    Relies on the notebook-level ``clean_text``, ``tokenizer``, ``model`` and
    ``label_encoder``.

    Args:
        text: Raw input text; it is cleaned with ``clean_text`` first.

    Returns:
        tuple[str, str]: ``(emotion, sentiment)`` where ``emotion`` is the
        decoded class label and ``sentiment`` is ``"Positive"`` or
        ``"Negative"``.
    """
    # Clean and tokenize the text with the same length cap used for training
    cleaned_text = clean_text(text)
    inputs = tokenizer(
        cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=128)
    # After training the model may live on a GPU; the inputs must be on the
    # same device or the forward pass fails with a device-mismatch error.
    inputs = inputs.to(model.device)

    # Predict emotion (eval mode turns dropout off so results are deterministic)
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Decode the predicted label; str() guarantees a plain, JSON-friendly string
    emotion = str(label_encoder.inverse_transform([prediction])[0])

    # Determine general sentiment (positive or negative)
    # NOTE(review): these names must match entries of label_encoder.classes_,
    # otherwise every prediction is reported as "Negative" -- confirm.
    positive_emotions = ['joy', 'love', 'surprise']  # Adjust based on your emotion classes
    sentiment = "Positive" if emotion in positive_emotions else "Negative"

    return emotion, sentiment


In [None]:
# Example tweet used as a quick smoke test of the prediction helper
sample_text = "Gotta love this #SXSW Google Calendar featuring top parties/ show cases to check out."

# Run the helper and show the input next to both outputs, one per line
emotion, sentiment = predict_emotion_and_sentiment(sample_text)

print(
    f"Input: {sample_text}",
    f"Predicted Emotion: {emotion}",
    f"Sentiment: {sentiment}",
    sep="\n",
)


In [None]:
app = Flask(__name__)

@app.route('/predict', methods=['POST'])
def predict():
    """Serve predictions for ``POST /predict``.

    Expects a JSON object body with an optional ``text`` field (defaults to
    an empty string).

    Returns:
        A JSON response ``{"emotion": ..., "sentiment": ...}``, or a 400
        response with an ``error`` message when the body is missing, is not
        valid JSON, or is not a JSON object.
    """
    # silent=True yields None for a missing/malformed body instead of raising,
    # so bad requests get a clear 400 rather than an unhandled exception.
    data = request.get_json(silent=True)
    if not isinstance(data, dict):
        return jsonify({'error': 'Request body must be a JSON object.'}), 400
    text = data.get('text', '')
    emotion, sentiment = predict_emotion_and_sentiment(text)
    # str() keeps the label JSON-serializable whatever type the encoder returns
    return jsonify({'emotion': str(emotion), 'sentiment': sentiment})

if __name__ == '__main__':
    # NOTE(review): this starts Flask's built-in server on all network
    # interfaces and does not return until the server stops; when run inside a
    # notebook the cell stays busy. Confirm the 0.0.0.0 exposure is intended.
    app.run(host='0.0.0.0', port=5000)
