# Expression Recognition

# 1. Importing, exploring and cleaning ISEAR dataset

In [None]:
!git clone https://github.com/Bhattars1/MSc-Project-Expression-Recognition-

fatal: destination path 'MSc-Project-Expression-Recognition-' already exists and is not an empty directory.


In [None]:
# Import function
import pandas as pd
def upload_file(path):
  """Read an Excel workbook into a DataFrame and print a quick overview.

  Args:
    path: Location of the workbook; it is read with the ``openpyxl`` engine.

  Returns:
    The loaded ``pandas.DataFrame``.
  """
  df = pd.read_excel(path, engine='openpyxl')
  print(df.head())
  # DataFrame.info() writes its summary itself and returns None, so it is
  # called directly; wrapping it in print() would append a stray "None" line.
  df.info()
  return df

In [None]:
# Absolute path (under /content) to the workbook inside the cloned repository.
file_path = '/content/MSc-Project-Expression-Recognition-/DATA.xlsx'
# Load the workbook; upload_file also prints the first rows and a column summary.
df = upload_file(file_path)

       ID  CITY  COUN  SUBJ  SEX  AGE  RELI  PRAC  FOCC  MOCC  ...  SELF  \
0  101032     1    10    32    2   21     2     2     7     1  ...     3   
1  101040     1    10    40    2   18     2     2     7     7  ...     1   
2  261050     1    26    50    1   23     4     1     9     1  ...     0   
3  101039     1    10    39    1   19     2     2     8     1  ...     0   
4  101038     1    10    38    2   18     2     2     7     6  ...     0   

   RELA  VERBAL  NEUTRO    MYKEY  Field3  Field2  Field1  \
0     0       3       0  1010323       2       2   anger   
1     0       0       1  1010403       4       3   anger   
2     0       0       1  2610503       4       3   anger   
3     1       3       1  1010393       4       4   anger   
4     1       3       2  1010383       3       4   anger   

                                                 SIT  STATE  
0  When a boy tried to fool me so he would be OK ...      1  
1  I felt anger when I saw that I was being misle...      

In [None]:
# Filter unnecessary columns
def filter_data(df, selected_columns):
  """Restrict a DataFrame to the requested columns and preview the result.

  Args:
    df: Source DataFrame.
    selected_columns: Column labels to keep, in the order they should appear.

  Returns:
    The column subset of ``df``; its first three rows are printed as a preview.
  """
  subset = df[selected_columns]
  preview = subset.head(3)
  print(preview)
  return subset

In [None]:
# Columns kept for the analysis. "SIT" is placed last because
# check_unique_values() skips the final entry (it slices selected_columns[:-1]).
selected_columns = ["COUN", "SEX", "RELI", "PRAC", "FIEL", "Field1", "SIT"]
# Build the reduced frame; filter_data also prints a three-row preview.
df_selected = filter_data(df, selected_columns)

   COUN  SEX  RELI  PRAC  FIEL Field1  \
0    10    2     2     2     2  anger   
1    10    2     2     2     9  anger   
2    26    1     4     1     1  anger   

                                                 SIT  
0  When a boy tried to fool me so he would be OK ...  
1  I felt anger when I saw that I was being misle...  
2  Once a friend had pushed me and I had fallen o...  


In [None]:
# Exploring the data
def check_unique_values(df, selected_columns):
  """Print the number of distinct values, and the values themselves, per column.

  Args:
    df: DataFrame containing the columns to inspect.
    selected_columns: Column labels; the final entry is skipped.
  """
  columns_to_report = selected_columns[:-1]
  for column in columns_to_report:
    distinct = set(df[column])
    count_line = f"There are {len(distinct)} unique values in {column}"
    values_line = f"They are listed below:\n{list(distinct)}\n"
    print(count_line)
    print(values_line)
check_unique_values(df_selected, selected_columns)

There are 16 unique values in COUN
They are listed below:
[33, 1, 2, 4, 8, 9, 10, 12, 16, 17, 19, 20, 22, 26, 27, 30]

There are 3 unique values in SEX
They are listed below:
[0, 1, 2]

There are 9 unique values in RELI
They are listed below:
[0, 1, 2, 3, 4, 5, 6, 7, 8]

There are 3 unique values in PRAC
They are listed below:
[0, 1, 2]

There are 10 unique values in FIEL
They are listed below:
[0, 1, 2, 3, 4, 5, 6, 7, 8, 9]

There are 7 unique values in Field1
They are listed below:
['sadness', 'guilt', 'joy', 'disgust', 'fear', 'shame', 'anger']



In [None]:
# Extracting data and labels
def data_and_labels(df, data, labels):
  """Pull the text column and the label column out of a DataFrame as lists.

  Args:
    df: DataFrame holding both columns.
    data: Name of the column containing the input texts.
    labels: Name of the column containing the target labels.

  Returns:
    A ``(texts, targets)`` tuple of plain Python lists.
  """
  return tuple(df[column].tolist() for column in (data, labels))

# "SIT" is the text column and "Field1" the emotion-label column of df_selected.
data_column = "SIT"
label_column = "Field1"
# `data` and `labels` are plain lists read by later cells.
data, labels = data_and_labels(df_selected, data_column, label_column)

In [None]:
len(labels), len(data)

(7666, 7666)

In [None]:
# Checking if the dataset is balanced or not
def label_distribution(data):
  """Print how many times each label occurs in ``data``.

  The counts are taken from the ``data`` argument itself, not from a
  notebook-level variable. Labels are reported in order of first appearance,
  one ``"<label> : <count>"`` line each.

  Args:
    data: Iterable of hashable labels.
  """
  from collections import Counter

  # A single counting pass replaces one list.count() scan per distinct label.
  for label, count in Counter(data).items():
    print(f"{label} : {count}")
label_distribution(labels)

sadness : 1096
guilt : 1093
joy : 1094
disgust : 1096
fear : 1095
shame : 1096
anger : 1096


# 2. Preprocessing

In [None]:
import torch
from transformers import BertTokenizer, BertModel

# Load pre-trained BERT model and tokenizer ('bert-base-uncased' checkpoint).
# Both names are read as notebook-level globals by pipeline() and
# generate_bert_embeddings(). Note that a later cell rebinds `model` to a
# Word2Vec model and then deletes it, so the BERT helpers must run before that.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertModel.from_pretrained('bert-base-uncased')

In [None]:
# Import stopwords
import nltk
nltk.download("stopwords")
import nltk
nltk.download('punkt')
import spacy
from nltk.corpus import stopwords
# English stop-word list from NLTK.
stop_words = stopwords.words("english")
from spacy.lang.en.stop_words import STOP_WORDS
# NOTE: this rebinds `stopwords` from the NLTK corpus reader imported above to
# a plain set holding the union of the NLTK and spaCy stop words; the later
# `final_stop_words` computation relies on it being that set.
stopwords = set(stop_words).union(STOP_WORDS)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [None]:
# Define exception stopwords
# Words that must NOT be treated as stop words (negations, contrast words and
# similar terms that carry sentiment); they are subtracted from the stop-word set.
exceptionStopWords = {
    'again',
    'against',
    'ain',
    'almost',
    'among',
    'amongst',
    'amount',
    'anyhow',
    'anyway',
    'aren',
    'aren’t',
    'below',
    'bottom',
    'but',
    'cannot',
    'couldn',
    'couldn’t',
    'didn',
    'didn’t',
    'doesn',
    'doesn’t',
    'don',
    'don’t',
    'done',
    'down',
    'except',
    'few',
    'hadn',
    'hadn’t',
    'hasn',
    'hasn’t',
    'haven',
    'haven’t',
    'however',
    'isn',
    'isn’t',
    'least',
    'mightn’t',
    'mustn',
    'mustn’t',
    'needn',
    'needn’t',
    'neither',
    'never',
    'nevertheless',
    'no',
    'nobody',
    'none',
    'noone',
    'nor',
    'not',
    'nothing',
    'should',
    'should’ve',
    'shouldn',
    'shouldn’t',
    'too',
    'top',
    'up',
    'wasn',
    'wasn’t',
    'well',
    'weren',
    'weren’t',
    'won',
    'won’t',
    'wouldn',
    'wouldn’t',
}
# The contractions above use the typographic apostrophe (’), while NLTK's
# stop-word list spells them with the straight one ('). Add the straight
# variants too, so the set difference really removes them from the stop words.
exceptionStopWords |= {word.replace('’', "'") for word in exceptionStopWords}
final_stop_words = stopwords-exceptionStopWords

In [None]:
# Import lemmatization library
import spacy
nlp = spacy.load("en_core_web_sm", disable=["parser", "ner"])

In [None]:
# Define function for stopwords
def remove_stopwords(data):
  """Return the tokens of ``data`` that are not in the global ``final_stop_words``.

  Args:
    data: Iterable of tokens.

  Returns:
    A new list with the surviving tokens in their original order.
  """
  kept = []
  for token in data:
    if token in final_stop_words:
      continue
    kept.append(token)
  return kept

In [None]:
# Define lemmatization function
def lemmatization(tokens):
    """Lemmatize tokens with the notebook-level spaCy pipeline ``nlp``.

    The tokens are joined with single spaces, run through ``nlp`` as one text,
    and the ``lemma_`` of every resulting spaCy token is collected.

    Args:
        tokens: Iterable of string tokens.

    Returns:
        List of lemma strings, one per token produced by ``nlp``.
    """
    joined_text = " ".join(tokens)
    lemmas = []
    for spacy_token in nlp(joined_text):
        lemmas.append(spacy_token.lemma_)
    return lemmas

In [None]:
# Define the pipeline function
import tqdm
def _merge_wordpieces(pieces):
    """Glue WordPiece continuation pieces ('##xyz') back onto the preceding piece."""
    words = []
    for piece in pieces:
        if piece.startswith("##") and words:
            words[-1] += piece[2:]
        else:
            words.append(piece)
    return words


def pipeline(reviews):
    """Clean a list of texts: tokenize, drop stop words, lemmatize, re-join.

    Each text is tokenized with the notebook-level BERT ``tokenizer``. The
    resulting sub-word pieces are merged back into whole words before stop-word
    removal and lemmatization, so that no '##' continuation fragments end up in
    the reconstructed sentence and stop words are matched against full words.

    Args:
        reviews: Iterable of text strings (pass a list, not a single string).

    Returns:
        List of processed strings, one per input text, tokens joined by spaces.
    """
    processed_reviews = []
    for review in tqdm.tqdm(reviews, desc="Processing reviews"):
        pieces = tokenizer.tokenize(review)   # BERT WordPiece tokens
        tokens = _merge_wordpieces(pieces)    # Rebuild whole words from '##' pieces
        tokens = remove_stopwords(tokens)     # Remove stopwords
        tokens = lemmatization(tokens)        # Perform lemmatization
        processed_reviews.append(" ".join(tokens))  # Reconstruct the sentence
    return processed_reviews

In [None]:
# Function to generate BERT embeddings for a list of sentences in batches
def generate_bert_embeddings(sentences, batch_size=16):
    """Encode sentences with the notebook-level BERT ``tokenizer`` and ``model``.

    Sentences are processed in batches under ``torch.no_grad()``; for every
    sentence the last hidden state at position 0 (the [CLS] token) is kept.

    Args:
        sentences: Sequence of strings (must support ``len`` and slicing).
        batch_size: Number of sentences encoded per forward pass.

    Returns:
        A tensor with one row per sentence. For empty input an empty tensor of
        shape ``(0, model.config.hidden_size)`` is returned.
    """
    if len(sentences) == 0:
        # torch.cat raises on an empty list, so return an empty result instead.
        return torch.empty((0, model.config.hidden_size))

    embeddings_list = []
    for i in tqdm.tqdm(range(0, len(sentences), batch_size), desc="Generating BERT embeddings"):
        batch = sentences[i:i + batch_size]
        inputs = tokenizer(batch, return_tensors='pt', padding=True, truncation=True, max_length=512)
        with torch.no_grad():
            outputs = model(**inputs)
        # Extract embeddings of the [CLS] token (index 0) for each sentence
        cls_embeddings = outputs.last_hidden_state[:, 0, :]
        embeddings_list.append(cls_embeddings)
    return torch.cat(embeddings_list)

In [None]:
# Example data
# Two toy sentences used to exercise the preprocessing and embedding helpers.
sentences = [
    "I love programming and coding.",
    "This is an example sentence for BERT embeddings."
]

# Process and generate embeddings
# (these assignments to `processed_data` / `data_embeddings` are overwritten
# further down, where the same helpers are run on the full `data` list).
processed_data = pipeline(sentences)
data_embeddings = generate_bert_embeddings(processed_data)

Processing reviews:   0%|          | 0/2 [00:00<?, ?it/s]

Generating BERT embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

In [None]:
# Passing a small sample of the data through the pipeline as a quick check.
# pipeline() expects a list of texts; mapping it over `data` item by item would
# hand it single strings, which it would then iterate character by character
# (and open one progress bar per text).
processed_sample = pipeline(data[:5])

In [None]:
processed_data = pipeline(data)

Processing reviews:   0%|          | 0/7666 [00:00<?, ?it/s]

In [None]:
data_embeddings = generate_bert_embeddings(processed_data)

Generating BERT embeddings:   0%|          | 0/480 [00:00<?, ?it/s]

In [None]:
import numpy as np
from sklearn.metrics.pairwise import cosine_similarity

# Function to compute cosine similarity between two sentences
def compute_similarity(embedding1, embedding2):
    """Cosine similarity between the first rows of two embedding tensors.

    Both tensors are converted to NumPy arrays and passed to
    ``cosine_similarity``; element ``[0][0]`` of the resulting matrix is returned.

    Args:
        embedding1: Tensor for the first sentence (a 2-D slice in the cell that calls this).
        embedding2: Tensor for the second sentence, same layout.

    Returns:
        The similarity score of the first row of each input.
    """
    first_array, second_array = embedding1.numpy(), embedding2.numpy()
    pairwise = cosine_similarity(first_array, second_array)
    return pairwise[0][0]

# Example sentences
sentence1 = "The weather today is beautiful and sunny."
sentence2 = "The economy is in a severe recession."

# Process the sentences
processed_sentences = pipeline([sentence1, sentence2])

# Generate BERT embeddings
embeddings = generate_bert_embeddings(processed_sentences)

# Extract embeddings for the sentences
# (slicing with 0:1 / 1:2 instead of indexing keeps each result two-dimensional)
embedding1 = embeddings[0:1]  # First sentence
embedding2 = embeddings[1:2]  # Second sentence

# Compute similarity
similarity_score = compute_similarity(embedding1, embedding2)

print(f"Similarity score between the sentences: {similarity_score}")

Processing reviews:   0%|          | 0/2 [00:00<?, ?it/s]

Generating BERT embeddings:   0%|          | 0/1 [00:00<?, ?it/s]

Similarity score between the sentences: 0.739437460899353


In [None]:
len(data_embeddings)

7666

In [None]:
# Import embedding library
from gensim.models import Word2Vec
embedding_dimension = 100

# Training the Word2Vec embedding on the processed data.
# Word2Vec expects an iterable of token lists, whereas `processed_data` holds
# one space-joined string per text; passing the strings directly would make
# the model treat every character as a "word". Split them back into tokens.
tokenized_corpus = [sentence.split() for sentence in processed_data]
# NOTE: this rebinds `model`, which until now referred to the BERT model.
model = Word2Vec(tokenized_corpus, vector_size = embedding_dimension, window=6, min_count = 3, workers=4)

In [None]:
model.sg

0

In [None]:
# Creating the word vectors and deleting the model.
# `del model` removes the name entirely, so generate_bert_embeddings(), which
# reads the global `model`, cannot be called after this cell has run.
word_vectors = model.wv
del model
# Number of words in the trained vocabulary.
len(word_vectors.key_to_index)

2669

In [None]:
word_vectors.similarity("good", "bad")

0.99715894