In [None]:
# Install necessary libraries
!pip install transformers torch pandas matplotlib seaborn scikit-learn Flask wordcloud textblob googletrans==4.0.0-rc1


In [None]:
# Import libraries
import inspect
import random
import re

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
from flask import Flask, request, jsonify
from googletrans import Translator
from sklearn.metrics import (
    accuracy_score,
    classification_report,
    f1_score,
    precision_score,
    recall_score,
)
from sklearn.model_selection import train_test_split
from textblob import TextBlob, Word
from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments
from wordcloud import WordCloud


In [None]:
# Load the training and test splits from CSV files in the working directory.
train_data = pd.read_csv('Train.csv')
test_data = pd.read_csv('Test.csv')

# Initial exploration: first rows, then the column / dtype / non-null summary.
print("Train Data Head:")
print(train_data.head())
print("\nTrain Data Info:")
# DataFrame.info() writes its report to stdout itself and returns None, so it
# must not be wrapped in print() (that would add a stray "None" line).
train_data.info()

# Count missing values per column.
print("\nMissing Values in Train Data:")
print(train_data.isnull().sum())


In [None]:
# Bar chart of how many training rows fall into each label value.
# NOTE(review): the column name reads like "what the emotion is directed at"
# rather than the emotion itself — confirm this is the intended label column.
fig, ax = plt.subplots(figsize=(12, 6))
sns.countplot(data=train_data, x='emotion_in_tweet_is_directed_at', ax=ax)
ax.set_title('Distribution of Emotions')
# Tilt the category names so long labels do not overlap.
ax.tick_params(axis='x', labelrotation=45)
plt.show()


In [None]:
# Character count of every tweet, stored as a new `text_length` column.
# Each cell is passed through str() first, so non-string cells are measured by
# the length of their string form instead of raising.
train_data['text_length'] = train_data['tweet_text'].map(lambda tweet: len(str(tweet)))

# Histogram (30 bins) of the lengths with a KDE curve overlaid.
fig, ax = plt.subplots(figsize=(12, 6))
sns.histplot(train_data['text_length'], bins=30, kde=True, ax=ax)
ax.set_title('Distribution of Tweet Lengths')
ax.set_xlabel('Tweet Length')
ax.set_ylabel('Frequency')
plt.show()


In [None]:
# Concatenate every tweet (stringified) into one space-separated corpus and
# render its most frequent words as a word cloud.
all_text = ' '.join(map(str, train_data['tweet_text']))
wordcloud = WordCloud(width=800, height=400, background_color='white')
wordcloud.generate(all_text)

fig, ax = plt.subplots(figsize=(12, 6))
ax.imshow(wordcloud, interpolation='bilinear')
ax.axis('off')  # the image carries no meaningful axes
ax.set_title("Most Common Words in Tweets")
plt.show()


In [None]:
# Balance the dataset by oversampling under-represented classes.
label_col = 'emotion_in_tweet_is_directed_at'

# Clean BEFORE resampling:
#  * rows without a tweet or a label cannot be used for training (and a NaN
#    tweet would break the str-based augmentation helpers later on);
#  * duplicate tweets are removed here, on the raw data. Dropping duplicates
#    after sampling with replacement would delete every oversampled copy and
#    leave the classes unbalanced again.
unique_tweets = (
    train_data
    .dropna(subset=['tweet_text', label_col])
    .drop_duplicates(subset=['tweet_text'])
)

class_counts = unique_tweets[label_col].value_counts()
max_class_count = class_counts.max()

# Bring every class up to the size of the largest one. All original rows are
# kept; only the shortfall is drawn with replacement. Pieces are collected in a
# list and concatenated once instead of growing a DataFrame inside the loop.
balanced_subsets = []
for emotion in class_counts.index:
    subset = unique_tweets[unique_tweets[label_col] == emotion]
    shortfall = max_class_count - len(subset)
    if shortfall > 0:
        extra = subset.sample(shortfall, replace=True, random_state=42)
        subset = pd.concat([subset, extra], axis=0)
    balanced_subsets.append(subset)

balanced_data = pd.concat(balanced_subsets, axis=0).reset_index(drop=True)
# NOTE(review): the repeated rows created here can end up on both sides of a
# later random train/validation split; consider oversampling only the training
# portion if validation scores look optimistic.
print("\nBalanced Data Distribution:\n", balanced_data[label_col].value_counts())


In [None]:
# Initialize Translator for back translation.
# One module-level googletrans client; back_translation() below uses it for
# both the English->French and the French->English call.
translator = Translator()

# Define functions for data augmentation
def synonym_replacement(text):
    """Return ``text`` with each word swapped for a WordNet synonym where one exists.

    The input is split on whitespace. For every word, the synsets reported by
    ``textblob.Word(word).synsets`` are scanned in order and the first lemma
    that differs from the word itself (case-insensitively) is used as the
    replacement; words without such a lemma are kept unchanged. The words are
    re-joined with single spaces.

    Args:
        text: Text to augment. Non-string values are converted with ``str()``.

    Returns:
        str: The augmented text; ``''`` when the input contains no words.

    Note:
        ``Word.synsets`` is backed by the WordNet corpus, which has to be
        available locally (TextBlob documents
        ``python -m textblob.download_corpora`` for this).
    """
    augmented_words = []
    for word in str(text).split():
        replacement = word
        # TextBlob objects have no `synonyms()` method; WordNet lookups live on
        # `Word`, whose `synsets` property returns a (possibly empty) list.
        for synset in Word(word).synsets:
            candidates = [
                lemma.replace('_', ' ')  # multi-word lemmas use underscores
                for lemma in synset.lemma_names()
                if lemma.lower() != word.lower()
            ]
            if candidates:
                replacement = candidates[0]
                break
        augmented_words.append(replacement)
    return ' '.join(augmented_words)

def back_translation(text):
    """Paraphrase ``text`` by translating it English -> French -> English.

    Both hops go through the module-level ``translator`` client.

    Args:
        text: English text to round-trip.

    Returns:
        The ``.text`` of the French -> English translation result.
    """
    to_french = translator.translate(text, src='en', dest='fr')
    back_to_english = translator.translate(to_french.text, src='fr', dest='en')
    return back_to_english.text

def random_insertion(text):
    """Return ``text`` with one to three of its own words re-inserted at random positions.

    The text is split on whitespace; between 1 and 3 times a word is picked at
    random from the current word list and inserted at a random index. The
    result is re-joined with single spaces. Uses the global ``random`` state.

    Args:
        text: Text to augment. Non-string values are converted with ``str()``.

    Returns:
        str: The augmented text, or ``''`` when the input contains no words
        (``random.choice`` cannot draw from an empty list).
    """
    words = str(text).split()
    if not words:
        return ''
    for _ in range(random.randint(1, 3)):  # Randomly add 1 to 3 words
        duplicate = random.choice(words)
        # randint is inclusive, so len(words) means "append at the end".
        words.insert(random.randint(0, len(words)), duplicate)
    return ' '.join(words)

# Create three augmented variants of every balanced row.
augmented_data = balanced_data.copy()
augmented_texts = []
augmented_labels = []

# Applied per tweet in this fixed order: synonym replacement, back
# translation, random insertion. Each variant inherits the source row's label.
augmenters = (synonym_replacement, back_translation, random_insertion)

for text, label in zip(balanced_data['tweet_text'],
                       balanced_data['emotion_in_tweet_is_directed_at']):
    for augment in augmenters:
        augmented_texts.append(augment(text))
        augmented_labels.append(label)

# Stack the augmented rows underneath the balanced originals. The augmented
# frame only has the text and label columns.
augmented_df = pd.DataFrame({
    'tweet_text': augmented_texts,
    'emotion_in_tweet_is_directed_at': augmented_labels,
})
final_data = pd.concat([balanced_data, augmented_df], ignore_index=True)


In [None]:
# Define text cleaning function
def clean_text(text):
    """Normalize a tweet for tokenization.

    Steps, in order: stringify, strip URLs (``http...``), strip ``@mentions``,
    turn every whitespace run (including newlines and tabs) into one space,
    drop all characters other than ASCII letters, digits and spaces, collapse
    the spaces left behind, trim, and lowercase.

    Args:
        text: Raw tweet. ``None`` and float NaN are treated as missing text;
            any other non-string value is converted with ``str()``.

    Returns:
        str: The cleaned, lowercased text (``''`` for missing input).
    """
    # Missing values would otherwise become the literal words "none" / "nan".
    if text is None or (isinstance(text, float) and text != text):
        return ''
    text = str(text)  # Ensure input is a string
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)  # Remove mentions
    # Convert newlines/tabs to spaces first: the character filter below keeps
    # only ' ' as whitespace and would otherwise glue adjacent words together.
    text = re.sub(r'\s+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9 ]+', '', text)  # Remove special characters
    text = re.sub(r' +', ' ', text).strip()  # Collapse gaps left by removals
    return text.lower()

# Run every raw tweet through clean_text and keep the result in a new
# `cleaned_text` column.
final_data['cleaned_text'] = final_data['tweet_text'].map(clean_text)


In [None]:
from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integer ids (stored in a new `target` column).
label_encoder = LabelEncoder()
final_data['target'] = label_encoder.fit_transform(final_data['emotion_in_tweet_is_directed_at'])

# Hold out 20% of the rows for validation. `stratify` keeps the per-class
# proportions the same in both parts, so the balance built earlier is not
# undone by an unlucky random draw.
# NOTE(review): final_data holds augmented variants of the same tweets, and a
# row-level split can place a tweet and its variants on opposite sides —
# confirm whether that leakage is acceptable for the reported metrics.
X_train, X_val, y_train, y_val = train_test_split(
    final_data['cleaned_text'],
    final_data['target'],
    test_size=0.2,
    random_state=42,
    stratify=final_data['target'],
)


In [None]:
# Pretrained checkpoint shared by the tokenizer and the classifier.
model_name = 'bert-base-uncased'
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The classification head gets one output per class seen by the label encoder.
num_labels = len(label_encoder.classes_)
model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=num_labels)


In [None]:
# Tokenize both splits with identical settings: truncate to at most 128 tokens
# and pad the batch.
tokenizer_kwargs = {'truncation': True, 'padding': True, 'max_length': 128}
train_encodings = tokenizer(list(X_train), **tokenizer_kwargs)
val_encodings = tokenizer(list(X_val), **tokenizer_kwargs)


In [None]:
class SentimentDataset(torch.utils.data.Dataset):
    """Map-style dataset pairing tokenizer encodings with integer labels.

    Args:
        encodings: Mapping from field name to a per-sample indexable sequence
            (anything exposing ``.items()``).
        labels: Indexable sequence with one label per sample.
    """

    def __init__(self, encodings, labels):
        self.encodings = encodings
        self.labels = labels

    def __len__(self):
        """Number of samples, taken from the label sequence."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return sample ``idx`` as a dict of tensors, with the label under ``'labels'``."""
        sample = {}
        for field, values in self.encodings.items():
            sample[field] = torch.tensor(values[idx])
        sample['labels'] = torch.tensor(self.labels[idx])
        return sample

# Wrap each split's encodings and labels for the Trainer. The label Series are
# converted to plain lists so they are indexed by position, not by index label.
train_dataset = SentimentDataset(encodings=train_encodings, labels=list(y_train))
val_dataset = SentimentDataset(encodings=val_encodings, labels=list(y_val))


In [None]:
# Newer transformers releases renamed the `evaluation_strategy` argument to
# `eval_strategy`, and the old spelling is rejected once it is removed. The
# notebook installs transformers unpinned, so pick whichever keyword the
# installed TrainingArguments actually accepts.
eval_strategy_key = (
    'eval_strategy'
    if 'eval_strategy' in inspect.signature(TrainingArguments.__init__).parameters
    else 'evaluation_strategy'
)

# Evaluate, log and checkpoint once per epoch.
training_args = TrainingArguments(
    output_dir='./results',
    logging_strategy='epoch',
    save_strategy='epoch',
    per_device_train_batch_size=8,
    per_device_eval_batch_size=8,
    num_train_epochs=5,  # Increased epochs for better learning
    weight_decay=0.01,
    logging_dir='./logs',
    report_to="none",  # Disable WandB logging
    **{eval_strategy_key: 'epoch'},
)

# No compute_metrics is supplied, so per-epoch evaluation reports loss only.
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=val_dataset,
)


In [None]:
# Train the model.
# Runs fine-tuning with the `trainer` built above (its model, training
# arguments, and train/validation datasets). As the last expression of the
# cell, the value returned by train() is displayed as the cell output.
trainer.train()


In [None]:
# Make predictions on the validation set; the predicted class is the index of
# the largest logit in each row.
val_preds = trainer.predict(val_dataset)
val_preds_labels = np.argmax(val_preds.predictions, axis=1)

# Human-readable class names, in the label encoder's id order.
target_names = [str(class_name) for class_name in label_encoder.classes_]
# Pass the full id range explicitly: without `labels`, classification_report
# infers the classes from the data and raises when a class is absent from both
# y_val and the predictions (its count then differs from len(target_names)).
class_ids = list(range(len(target_names)))

# Print classification report for detailed metrics by class
print("Classification Report:")
print(classification_report(
    y_val,
    val_preds_labels,
    labels=class_ids,
    target_names=target_names,
    zero_division=0,  # score undefined ratios as 0 without emitting warnings
))

# Calculate overall precision, recall, and F1 score (support-weighted averages
# of the per-class values). precision_score / recall_score come from
# sklearn.metrics alongside f1_score.
precision = precision_score(y_val, val_preds_labels, average='weighted', zero_division=0)
recall = recall_score(y_val, val_preds_labels, average='weighted', zero_division=0)
f1 = f1_score(y_val, val_preds_labels, average='weighted', zero_division=0)

print("\nOverall Metrics:")
print(f"Precision: {precision:.2f}")
print(f"Recall: {recall:.2f}")
print(f"F1 Score: {f1:.2f}")

# Explanation of Metrics:
# - Precision: of the tweets predicted as a class, the share that truly belong to it.
# - Recall: of the tweets that truly belong to a class, the share the model found.
# - F1 Score: harmonic mean of precision and recall, a single number balancing both.


In [None]:
# Function to predict emotion and sentiment for a single text input
def predict_emotion_and_sentiment(text, positive_emotions=('joy', 'love', 'surprise')):
    """Classify one piece of text and derive a coarse sentiment from the class.

    The text is cleaned with ``clean_text``, tokenized with the module-level
    ``tokenizer`` and scored by the module-level ``model``; the arg-max class
    id is mapped back to its label through ``label_encoder``.

    Args:
        text: Raw input text.
        positive_emotions: Labels that count as positive sentiment. The
            default keeps the notebook's original placeholder list — adjust it
            to the label set the model was actually trained on.

    Returns:
        tuple: ``(emotion, sentiment)`` where ``emotion`` is the decoded label
        and ``sentiment`` is ``"Positive"`` if that label is in
        ``positive_emotions``, otherwise ``"Negative"``.
    """
    # Clean and tokenize the text (same 128-token cap as used for training).
    cleaned_text = clean_text(text)
    inputs = tokenizer(cleaned_text, return_tensors="pt", truncation=True, padding=True, max_length=128)

    # The Trainer may have moved the model to an accelerator; the inputs have
    # to live on the same device or the forward pass raises a device mismatch.
    device = next(model.parameters()).device
    inputs = {name: tensor.to(device) for name, tensor in inputs.items()}

    # Predict emotion. eval() switches off dropout so repeated calls on the
    # same text give the same answer.
    model.eval()
    with torch.no_grad():
        outputs = model(**inputs)
    prediction = torch.argmax(outputs.logits, dim=1).item()

    # Decode the predicted label to emotion
    emotion = label_encoder.inverse_transform([prediction])[0]

    # Determine general sentiment (positive or negative)
    sentiment = "Positive" if emotion in positive_emotions else "Negative"

    return emotion, sentiment


In [None]:
# Smoke test: classify one example tweet and show the result.
sample_text = "Gotta love this #SXSW Google Calendar featuring top parties/ show cases to check out."

emotion, sentiment = predict_emotion_and_sentiment(sample_text)

print(
    f"Input: {sample_text}",
    f"Predicted Emotion: {emotion}",
    f"Sentiment: {sentiment}",
    sep="\n",
)


In [None]:
# Minimal HTTP wrapper around predict_emotion_and_sentiment.
app = Flask(__name__)


@app.route('/predict', methods=['POST'])
def predict():
    """POST /predict — classify the ``text`` field of a JSON request body.

    Request body: a JSON object such as ``{"text": "..."}``.

    Returns:
        200 with ``{"emotion": ..., "sentiment": ...}`` on success, or 400
        with ``{"error": ...}`` when the body is not a JSON object or ``text``
        is missing, not a string, or blank.
    """
    # silent=True yields None instead of raising for a missing or malformed
    # JSON body, so bad requests get a clear 400 rather than an unhandled error.
    data = request.get_json(silent=True)
    text = data.get('text') if isinstance(data, dict) else None
    if not isinstance(text, str) or not text.strip():
        return jsonify({'error': "Request body must be a JSON object with a non-empty string 'text' field."}), 400

    emotion, sentiment = predict_emotion_and_sentiment(text)
    # str() guards against label types the JSON encoder cannot serialize.
    return jsonify({'emotion': str(emotion), 'sentiment': sentiment})


# NOTE(review): inside a notebook kernel __name__ is also '__main__', so
# running this cell starts the server and blocks until it is interrupted.
if __name__ == '__main__':
    # Bind to all interfaces so the service is reachable from outside a container.
    app.run(host='0.0.0.0', port=5000)


In [None]:
# Use an official Python runtime as a parent image
# NOTE(review): Python 3.8 is past upstream end-of-life — confirm the unpinned
# requirements still resolve on it, or move to a newer base image.
FROM python:3.8-slim

# Set the working directory in the container
WORKDIR /app

# Copy only the dependency manifest first, so the slow install layer below is
# rebuilt only when requirements.txt changes, not on every source edit.
COPY requirements.txt /app/requirements.txt

# Install dependencies
RUN pip install --no-cache-dir -r requirements.txt

# Copy the rest of the current directory contents into the container at /app
COPY . /app

# Expose the port that Flask will run on
EXPOSE 5000

# Run the Flask application
CMD ["python", "app.py"]


In [None]:
# Package list — presumably the requirements.txt consumed by the Dockerfile's
# `pip install -r requirements.txt` step (confirm the file name).
# Only googletrans carries a version pin; every other package is unpinned.
transformers
torch
Flask
pandas
matplotlib
seaborn
scikit-learn
wordcloud
textblob
googletrans==4.0.0-rc1
