In [3]:
!pip install emoji

Collecting emoji
  Downloading emoji-2.12.1-py3-none-any.whl (431 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m431.4/431.4 kB[0m [31m7.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: emoji
Successfully installed emoji-2.12.1


In [15]:
import re
import string

import emoji
import nltk
import numpy as np
import pandas as pd
from nltk.corpus import stopwords
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import Embedding, LSTM, Dense, Dropout
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.preprocessing.text import Tokenizer

# Fetch the NLTK resources requested by this notebook, in the same order as before
for nltk_resource in ('punkt', 'stopwords'):
    nltk.download(nltk_resource)

# Read the three CSV files; each one is parsed as tab-separated UTF-8 text
train_df, dev_df, test_df = (
    pd.read_csv(csv_path, sep="\t", encoding='utf-8')
    for csv_path in ('TEC.csv', 'goemotions_full.csv', 'semeval2018-task1-emoc.csv')
)

# Preprocess data
def preprocess_data(df, text_column="Tweet"):
    """Add a ``clean`` column of normalised tokens built from ``df[text_column]``.

    Each text goes through these steps, in order:

    1. emojis are replaced by their textual names (``emoji.demojize``);
    2. anything matching ``http\\S+`` is removed;
    3. the text is lower-cased and tokenised with ``nltk.word_tokenize``;
    4. English stopwords are dropped;
    5. punctuation characters and newlines are stripped from every token;
    6. tokens that end up empty are discarded.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to process. It is modified in place: a ``clean`` column holding
        one list of tokens per row is added (or overwritten).
    text_column : str, default "Tweet"
        Name of the column that holds the raw text.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object. When ``text_column`` is absent a message is
        printed and ``df`` is returned untouched (no ``clean`` column).
    """
    if text_column not in df.columns:
        print(f"Column '{text_column}' not found in DataFrame")
        return df

    stop_words = set(stopwords.words('english'))
    # re.escape keeps "\", "]", "^" and "-" literal inside the character class;
    # the unescaped form silently failed to strip backslashes.
    strip_chars = re.compile("[" + re.escape(string.punctuation) + r"\n]")

    def _clean_text(text):
        """Turn one raw text into its list of cleaned tokens."""
        text = emoji.demojize(text)
        text = re.sub(r"http\S+", "", text)
        tokens = nltk.word_tokenize(text.lower())
        # Stopwords are filtered before punctuation is stripped, as before.
        stripped = (
            strip_chars.sub("", token) for token in tokens if token not in stop_words
        )
        return [token for token in stripped if token.strip()]

    # Missing values become empty strings so every row reaches demojize as str.
    texts = df[text_column].fillna("").astype(str)
    df["clean"] = texts.apply(_clean_text)
    return df

# Apply preprocessing (adds a tokenised 'clean' column when the text column exists)
train_df = preprocess_data(train_df)
dev_df = preprocess_data(dev_df)
test_df = preprocess_data(test_df)

# Emotion label columns. This single list fixes the column order of the
# training targets, the size of the output layer and the prediction file header.
emotion_labels = ["anger", "anticipation", "disgust", "fear", "joy", "love", "optimism", "pessimism", "sadness", "surprise", "trust"]

# 'clean' is needed in all three frames: train/dev feed the model, test feeds
# the predictions.
if all('clean' in frame.columns for frame in (train_df, dev_df, test_df)):
    # Fail early with a clear message instead of silently training on a
    # partial or misordered label set.
    for frame_name, frame in (("train_df", train_df), ("dev_df", dev_df)):
        missing_labels = [label for label in emotion_labels if label not in frame.columns]
        if missing_labels:
            raise ValueError(f"{frame_name} is missing emotion columns: {missing_labels}")

    # Combine train and development data for training
    train_texts = train_df['clean'].tolist() + dev_df['clean'].tolist()
    train_labels = train_df[emotion_labels].values.tolist() + \
                   dev_df[emotion_labels].values.tolist()

    # Tokenization
    max_words = 10000
    tokenizer = Tokenizer(num_words=max_words, oov_token='<OOV>')
    tokenizer.fit_on_texts(train_texts)
    X_train = tokenizer.texts_to_sequences(train_texts)
    X_test = tokenizer.texts_to_sequences(test_df['clean'].tolist())

    # Padding sequences
    maxlen = 100
    X_train = pad_sequences(X_train, padding='post', maxlen=maxlen)
    X_test = pad_sequences(X_test, padding='post', maxlen=maxlen)

    # The label columns are used directly as the target matrix, one column per
    # emotion. MultiLabelBinarizer is not applicable here: it treats every row
    # as a *set of labels*, so it would emit one column per distinct cell value.
    # NOTE(review): assumes the emotion columns hold 0/1 indicators - confirm
    # against the CSV schema.
    y_train = np.asarray(train_labels, dtype="float32")

    # Define the model
    model = Sequential()
    model.add(Embedding(input_dim=max_words, output_dim=128, input_length=maxlen))
    model.add(LSTM(units=64, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(len(emotion_labels), activation='sigmoid'))  # one sigmoid output per emotion
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])

    # Train the model; restore_best_weights makes the predictions below come
    # from the epoch with the lowest validation loss, not the last epoch run.
    callbacks = [EarlyStopping(patience=3, monitor='val_loss', restore_best_weights=True)]
    history = model.fit(X_train, y_train, epochs=10, batch_size=64, validation_split=0.1, callbacks=callbacks)

    # Predict per-emotion scores for the test data
    predictions = model.predict(X_test)

    # Save the predictions, one column per emotion, to a CSV file
    predicted_df = pd.DataFrame(predictions, columns=emotion_labels)
    predicted_df.to_csv('predicted_emotions.csv', index=False)
else:
    print("Column 'clean' not found in DataFrame")


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


Column 'Tweet' not found in DataFrame
Column 'Tweet' not found in DataFrame
Column 'Tweet' not found in DataFrame
Column 'clean' not found in DataFrame


In [20]:
import pandas as pd
import matplotlib.pyplot as plt

# Read the training CSV again (tab-separated, UTF-8)
traindf = pd.read_csv('TEC.csv', sep="\t", encoding='utf-8')

# Sentiment columns whose totals are charted below
sentiments = [
    "anger", "anticipation", "disgust", "fear", "joy", "love",
    "optimism", "pessimism", "sadness", "surprise", "trust",
]

# Guard: the chart is only drawn when none of the expected columns is absent
absent_columns = [name for name in sentiments if name not in traindf.columns]
if absent_columns:
    print("Sentiment columns not found in the DataFrame.")
else:
    # Sum each sentiment column to get its frequency
    sentiment_counts = traindf[sentiments].sum()

    # Bar chart of the per-category totals
    plt.figure(figsize=(10, 6))
    sentiment_counts.plot(color='skyblue', kind='bar')
    plt.title('Frequency of Sentiment Categories')
    plt.ylabel('Frequency')
    plt.xlabel('Sentiment Category')
    plt.xticks(rotation=45)
    plt.show()


Sentiment columns not found in the DataFrame.
