In [1]:
import numpy as np
import re
import pandas as pd

import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn import svm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Clean text data
nltk.download('stopwords')
nltk.download('wordnet')
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\nadee\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
train_data = pd.read_csv("../../Data/data_processed_train.csv")
test_data = pd.read_csv("../../Data/data_processed_test.csv")
val_data = pd.read_csv("../../Data/data_processed_validate.csv")

In [3]:
def clean_text(text):
    # Convert text to lowercase
    text = text.lower()
    # # Remove urls
    # text = re.sub(r'http\S+', '', text)
    # # Remove mentions
    # text = re.sub(r'@\S+', '', text)
    # # Remove numbers
    # text = re.sub(r'\d+', '', text)
    # # Remove punctuation
    # text = re.sub(r'[^\w\s]', '', text)
    # # Remove stopwords and lemmatize
    # # text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

train_data['text'] = train_data['text'].apply(clean_text)
test_data['text'] = test_data['text'].apply(clean_text)
val_data['text'] = val_data['text'].apply(clean_text)

In [4]:
train_data.head()
test_data.head()
val_data.head()

Unnamed: 0,target,text
0,-1,fuck creep need someone protect me
1,-1,jeesh go sjbc ride cant there
2,-1,rehearse today ny again kinda cold crave pancake
3,1,cant wait 2mz worra lush half term fairplayys xx
4,1,never happenunless plane go


In [5]:
print(train_data.shape)
print(test_data.shape)
print(val_data.shape)

(10000, 2)
(5000, 2)
(2000, 2)


In [6]:
print(train_data.isnull().sum())
print(test_data.isnull().sum())
print(val_data.isnull().sum())

target    0
text      0
dtype: int64
target    0
text      0
dtype: int64
target    0
text      0
dtype: int64


In [7]:
train_data = train_data.dropna()
test_data = test_data.dropna()
val_data = val_data.dropna()

In [8]:
print(train_data.shape)
print(test_data.shape)
print(val_data.shape)

(10000, 2)
(5000, 2)
(2000, 2)


In [9]:
# Balancing train data
counts = train_data['target'].value_counts()
min_count = counts.min()

train_data = pd.concat([train_data[train_data['target'] == t].sample(min_count) for t in counts.index])

In [10]:
# balancing validation dataset
counts = val_data['target'].value_counts()
min_count = counts.min()

val_data = pd.concat([val_data[val_data['target'] == t].sample(min_count) for t in counts.index])

In [11]:
# balancing Test dataset
counts = test_data['target'].value_counts()
min_count = counts.min()

test_data = pd.concat([test_data[test_data['target'] == t].sample(min_count) for t in counts.index])

In [12]:
print(train_data.shape)
print(test_data.shape)
print(val_data.shape)

(9972, 2)
(4958, 2)
(1976, 2)


In [13]:
print(train_data['target'].value_counts())
print(test_data['target'].value_counts())
print(val_data['target'].value_counts())

-1    4986
 1    4986
Name: target, dtype: int64
-1    2479
 1    2479
Name: target, dtype: int64
-1    988
 1    988
Name: target, dtype: int64


Predictions Using CountVectorizer

In [14]:
from sklearn.svm import SVC
import time
start_time = time.time()
# Create document term matrix using CountVectorizer
count_vectorizer = CountVectorizer()
X_train_count = count_vectorizer.fit_transform(train_data['text'])

X_test_count = count_vectorizer.transform(test_data['text'])

# Define the pipeline for CountVectorizer
pipeline_count = Pipeline([
    ('clf', SVC(kernel='linear'))
    
])

# Fit the model using CountVectorizer
pipeline_count.fit(X_train_count, train_data['target'])
end_time = time.time()
print("Time taken to train the model: ", end_time - start_time, " seconds")

start_time = time.time()
y_pred_count = pipeline_count.predict(X_test_count)
end_time = time.time()
print("Time taken to test the model: ", end_time - start_time, " seconds")
f1_score_count = f1_score(test_data['target'], y_pred_count)

# Print the confusion matrix with labels
cm = confusion_matrix(test_data['target'], y_pred_count)
print("Confusion Matrix:")
print(cm)

# Print the f1 score for CountVectorizer and TfidfVectorizer
print("f1 score using CountVectorizer: ", f1_score_count)


Time taken to train the model:  8.840705633163452  seconds
Time taken to test the model:  1.5774345397949219  seconds
Confusion Matrix:
[[1749  730]
 [ 725 1754]]
f1 score using CountVectorizer:  0.7068305460407012


Predictions Using TfidfVectorizer

In [15]:
import time
from sklearn.svm import SVC

start_time = time.time()
# Create document term matrix using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer()
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])

# Define the pipeline for TfidfVectorizer
pipeline_tfidf = Pipeline([
    ('clf', SVC())
])

# Fit the model using TfidfVectorizer
pipeline_tfidf.fit(X_train_tfidf, train_data['target'])
end_time = time.time()
print("Time taken to train the model: ", end_time - start_time, " seconds")

start_time = time.time()
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])
y_pred_tfidf = pipeline_tfidf.predict(X_test_tfidf)
end_time = time.time()
print("Time taken to test the model: ", end_time - start_time, " seconds")

f1_score_tfidf = f1_score(test_data['target'], y_pred_tfidf)

cm = confusion_matrix(test_data['target'], y_pred_tfidf)
print("Confusion Matrix:")
print(cm)

print("f1 score using TfidfVectorizer: ", f1_score_tfidf)


Time taken to train the model:  7.61755895614624  seconds
Time taken to test the model:  2.3287155628204346  seconds
Confusion Matrix:
[[1812  667]
 [ 645 1834]]
f1 score using TfidfVectorizer:  0.7365461847389558


**Hyper parameter tunning for Countvectorizer**

In [16]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, f1_score,accuracy_score

# Create document term matrix using CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,1))

# Define the pipeline for CountVectorizer with hyperparameters to tune
pipeline_count = Pipeline([
    ('vect', count_vectorizer),
    ('clf', SVC(kernel='linear', C=1.0, max_iter=1000, probability=True))
])

# Define hyperparameters to tune for CountVectorizer
param_grid_count = {    
    'clf__C': [ 1],
    'clf__kernel': ['linear'],
    'clf__degree': [2],
    'clf__gamma': [ 0.1],
}

# Perform hyperparameter tuning using GridSearchCV with the validation set
grid_search_count = GridSearchCV(pipeline_count, param_grid=param_grid_count, cv=5)
grid_search_count.fit(val_data['text'], val_data['target'])

for i in range(len(grid_search_count.cv_results_['params'])):
    print(grid_search_count.cv_results_['params'][i], " -> ", grid_search_count.cv_results_['mean_test_score'][i])

# Print the best hyperparameters for CountVectorizer
print("Best hyperparameters for CountVectorizer: ", grid_search_count.best_params_)

# Make predictions using the best CountVectorizer model on the validation set
best_count_model = grid_search_count.best_estimator_
print(best_count_model)

y_pred = best_count_model.predict(val_data['text'])

accuracy = accuracy_score(val_data['target'], y_pred)

# Print the accuracy of the best CountVectorizer model on the validation set
print("Accuracy of the best CountVectorizer model on the validation set: ", accuracy)




{'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 0.1, 'clf__kernel': 'linear'}  ->  0.6776345735839407
Best hyperparameters for CountVectorizer:  {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 0.1, 'clf__kernel': 'linear'}
Pipeline(steps=[('vect', CountVectorizer()),
                ('clf',
                 SVC(C=1, degree=2, gamma=0.1, kernel='linear', max_iter=1000,
                     probability=True))])
Accuracy of the best CountVectorizer model on the validation set:  0.978744939271255




In [17]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.metrics import accuracy_score,classification_report



# Drop any rows that contain NaN values
train_data.dropna(inplace=True)

# Split the dataset into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(train_data['text'], train_data['target'], test_size=0.2, random_state=42)

# Convert the preprocessed text data into feature vectors using TF-IDF vectorization
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train an SVM model on the training set
model = svm.SVC(kernel='linear')
model.fit(X_train, y_train)

# Evaluate the model on the testing set and print the results

y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
print('Accuracy:', accuracy)

report = classification_report(y_test, y_pred, digits=4)
print('Classification report:')
print(report)

Accuracy: 0.7288220551378446
Classification report:
              precision    recall  f1-score   support

          -1     0.7444    0.7144    0.7291      1019
           1     0.7139    0.7439    0.7285       976

    accuracy                         0.7288      1995
   macro avg     0.7291    0.7291    0.7288      1995
weighted avg     0.7294    0.7288    0.7288      1995



Best Hyperparameter:-----



In [18]:
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, precision_score, recall_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
import numpy as np
import time

# Define hyperparameters to tune for SVM
param_grid_svm = {
    'clf__C': [5],
    'clf__kernel': ['linear','rbf'],
    'clf__degree': [2],
    'clf__gamma': [ 0.1],
}

# Create document term matrix using CountVectorizer
count_vectorizer = CountVectorizer(ngram_range=(1,1))

# Define the pipeline for SVM
pipeline_svm = Pipeline([
    ('vect', count_vectorizer),
    ('clf', SVC(kernel='linear', C=1.0, max_iter=1000, probability=True))
])

# Perform hyperparameter tuning using GridSearchCV with the validation set
grid_search_svm = GridSearchCV(pipeline_svm, param_grid=param_grid_svm, cv=3)
grid_search_svm.fit(train_data['text'], train_data['target'])

# Print the best hyperparameters for SVM
print("Best hyperparameters for SVM: ", grid_search_svm.best_params_)

# Fit the model using SVM
start_time = time.time()
svm_model = SVC(C=grid_search_svm.best_params_['clf__C'], kernel=grid_search_svm.best_params_['clf__kernel'], gamma=grid_search_svm.best_params_['clf__gamma'])
svm_model.fit(X_train_count, train_data['target'])
end_time = time.time()
print("Time taken to train the model: ", end_time - start_time, " seconds")

# Fit the CountVectorizer on the training data
X_train_count = count_vectorizer.fit_transform(train_data['text'])

# Make predictions using the best SVM model on the test set
start_time = time.time()
X_test_count = count_vectorizer.transform(test_data['text'])
y_pred_svm = svm_model.predict(X_test_count)
end_time = time.time()
print("Time taken to test the model: ", end_time - start_time, " seconds")

# Generate classification report for SVM
report_svm = classification_report(test_data['target'], y_pred_svm)
print("Classification report using SVM:\n", report_svm)

# Print the accuracy of the best SVM model on the test set
accuracy_svm = accuracy_score(test_data['target'], y_pred_svm)
print("Accuracy of the best SVM model on the test set: ", accuracy_svm)

# Print the f1 score for SVM
f1_score_svm = f1_score(test_data['target'], y_pred_svm)
print("f1 score using SVM: ", f1_score_svm)

# Print the confusion matrix for SVM
cm_svm = confusion_matrix(test_data['target'], y_pred_svm)
print("Confusion Matrix:\n", cm_svm)

# Print the precision and recall for SVM
precision_svm = precision_score(test_data['target'], y_pred_svm)
recall_svm = recall_score(test_data['target'], y_pred_svm)






Best hyperparameters for SVM:  {'clf__C': 5, 'clf__degree': 2, 'clf__gamma': 0.1, 'clf__kernel': 'rbf'}
Time taken to train the model:  14.466709613800049  seconds
Time taken to test the model:  2.1842148303985596  seconds
Classification report using SVM:
               precision    recall  f1-score   support

          -1       0.73      0.72      0.73      2479
           1       0.72      0.74      0.73      2479

    accuracy                           0.73      4958
   macro avg       0.73      0.73      0.73      4958
weighted avg       0.73      0.73      0.73      4958

Accuracy of the best SVM model on the test set:  0.7287212585720049
f1 score using SVM:  0.7316975862756833
Confusion Matrix:
 [[1779  700]
 [ 645 1834]]


TfiDf Vectorizer with best hyperparameters

In [19]:
start_time = time.time()
# Create document term matrix using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])

# Define hyperparameters to tune for SVM
param_grid_svm = {
    'clf__C': [1],
    'clf__kernel': ['linear','rbf'],
    'clf__degree': [2],
    'clf__gamma': ['scale'],
}

# Define the pipeline for SVM with TfidfVectorizer
pipeline_tfidf_svm_best = Pipeline([
    ('vect', tfidf_vectorizer),
    ('clf', SVC())
])

# Perform hyperparameter tuning using GridSearchCV with the validation set
grid_search_svm_tfidf = GridSearchCV(pipeline_tfidf_svm_best, param_grid=param_grid_svm, cv=3)
grid_search_svm_tfidf.fit(train_data['text'], train_data['target'])

# Print the best hyperparameters for SVM with TfidfVectorizer
print("Best hyperparameters for SVM with TfidfVectorizer: ", grid_search_svm_tfidf.best_params_)

# Fit the model using SVM with TfidfVectorizer
svm_model_tfidf = SVC(C=grid_search_svm_tfidf.best_params_['clf__C'], kernel=grid_search_svm_tfidf.best_params_['clf__kernel'], gamma=grid_search_svm_tfidf.best_params_['clf__gamma'])
svm_model_tfidf.fit(X_train_tfidf, train_data['target'])
end_time = time.time()
print("Time taken to train the model: ", end_time - start_time, " seconds")

start_time = time.time()
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])
# Predict using the test data
y_pred_svm_tfidf = svm_model_tfidf.predict(X_test_tfidf)
end_time = time.time()
print("Time taken to test the model: ", end_time - start_time, " seconds")

# Generate classification report for SVM with TfidfVectorizer
report_svm_tfidf = classification_report(test_data['target'], y_pred_svm_tfidf)
print("Classification report using SVM with TfidfVectorizer:\n", report_svm_tfidf)

# Print the accuracy of the best SVM model on the test set
accuracy_svm_tfidf = accuracy_score(test_data['target'], y_pred_svm_tfidf)
print("Accuracy of the best SVM model with TfidfVectorizer on the test set: ", accuracy_svm_tfidf)

# Print the f1 score for SVM with TfidfVectorizer
f1_score_svm_tfidf = f1_score(test_data['target'], y_pred_svm_tfidf)
print("f1 score using SVM with TfidfVectorizer: ", f1_score_svm_tfidf)

# Print the confusion matrix for SVM with TfidfVectorizer
cm_svm_tfidf = confusion_matrix(test_data['target'], y_pred_svm_tfidf)
print("Confusion Matrix:\n", cm_svm_tfidf)

# Print the precision and recall for SVM with TfidfVectorizer
precision_svm_tfidf = precision_score(test_data['target'], y_pred_svm_tfidf)
recall_svm_tfidf = recall_score(test_data['target'], y_pred_svm_tfidf)
print("Precision using SVM with TfidfVectorizer: ", precision_svm_tfidf)
print("Recall using SVM with TfidfVectorizer: ", recall_svm_tfidf)


Best hyperparameters for SVM with TfidfVectorizer:  {'clf__C': 1, 'clf__degree': 2, 'clf__gamma': 'scale', 'clf__kernel': 'rbf'}
Time taken to train the model:  34.624948024749756  seconds
Time taken to test the model:  2.314976930618286  seconds
Classification report using SVM with TfidfVectorizer:
               precision    recall  f1-score   support

          -1       0.74      0.73      0.73      2479
           1       0.73      0.74      0.74      2479

    accuracy                           0.74      4958
   macro avg       0.74      0.74      0.74      4958
weighted avg       0.74      0.74      0.74      4958

Accuracy of the best SVM model with TfidfVectorizer on the test set:  0.7353771682129892
f1 score using SVM with TfidfVectorizer:  0.7365461847389558
Confusion Matrix:
 [[1812  667]
 [ 645 1834]]
Precision using SVM with TfidfVectorizer:  0.7333066773290684
Recall using SVM with TfidfVectorizer:  0.7398144413069786


In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, confusion_matrix, f1_score

# Create document term matrix using TfidfVectorizer
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1,1))
X_train_tfidf = tfidf_vectorizer.fit_transform(train_data['text'])
X_test_tfidf = tfidf_vectorizer.transform(test_data['text'])

# Define hyperparameters to tune for SVM
param_grid_svm = {
    'clf__C': [1],
    'clf__kernel': ['linear','rbf'],
    'clf__degree': [2],
    'clf__gamma': ['scale'],
}


# Define the pipeline for TfidfVectorizer with SVM classifier
pipeline_tfidf_svm_best = Pipeline([
    ('clf', SVC(kernel='linear', C=1.0, max_iter=1000, probability=True))
])

# Fit the model using TfidfVectorizer with SVM classifier
pipeline_tfidf_svm_best.fit(X_train_tfidf, train_data['target'])

# Predict using the test data
y_pred_tfidf = pipeline_tfidf_svm_best.predict(X_test_tfidf)

# Generate classification report for TfidfVectorizer with SVM classifier
report_tfidf_svm = classification_report(test_data['target'], y_pred_tfidf)
print("Classification report using TfidfVectorizer with SVM classifier:\n", report_tfidf_svm)

# Generate confusion matrix for TfidfVectorizer with SVM classifier
cm = confusion_matrix(test_data['target'], y_pred_tfidf)
print("Confusion Matrix:")
print(cm)

# Calculate f1 score
f1_score_tfidf_svm = f1_score(test_data['target'], y_pred_tfidf)

# Print the f1 score
print("f1 score using TfidfVectorizer with SVM classifier: ", f1_score_tfidf_svm)



Classification report using TfidfVectorizer with SVM classifier:
               precision    recall  f1-score   support

          -1       0.66      0.65      0.66      2479
           1       0.66      0.67      0.66      2479

    accuracy                           0.66      4958
   macro avg       0.66      0.66      0.66      4958
weighted avg       0.66      0.66      0.66      4958

Confusion Matrix:
[[1609  870]
 [ 824 1655]]
f1 score using TfidfVectorizer with SVM classifier:  0.6614708233413269


**Measuring Bias**

In [21]:
data=pd.read_csv("../../Data/EEC/Equity-Evaluation-Corpus.csv")

In [22]:
data['Sentence'].nunique()

8640

In [23]:
data.isnull().sum()

ID                 0
Sentence           0
Template           0
Person             0
Gender             0
Race            2880
Emotion          240
Emotion word     240
dtype: int64

In [24]:
data.columns

Index(['ID', 'Sentence', 'Template', 'Person', 'Gender', 'Race', 'Emotion',
       'Emotion word'],
      dtype='object')

In [25]:
data.head(10)

Unnamed: 0,ID,Sentence,Template,Person,Gender,Race,Emotion,Emotion word
0,2018-En-mystery-05498,Alonzo feels angry.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,angry
1,2018-En-mystery-11722,Alonzo feels furious.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,furious
2,2018-En-mystery-11364,Alonzo feels irritated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,irritated
3,2018-En-mystery-14320,Alonzo feels enraged.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,enraged
4,2018-En-mystery-14114,Alonzo feels annoyed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,anger,annoyed
5,2018-En-mystery-09419,Alonzo feels sad.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,sad
6,2018-En-mystery-16791,Alonzo feels depressed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,depressed
7,2018-En-mystery-10775,Alonzo feels devastated.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,devastated
8,2018-En-mystery-00419,Alonzo feels miserable.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,miserable
9,2018-En-mystery-11781,Alonzo feels disappointed.,<person subject> feels <emotion word>.,Alonzo,male,African-American,sadness,disappointed


In [26]:
data.shape

(8640, 8)

In [27]:
def clean_text(text):
#     # Convert text to lowercase
    text = text.lower()
#     # Remove urls
#     text = re.sub(r'http\S+', '', text)
#     # Remove mentions
#     text = re.sub(r'@\S+', '', text)
#     # Remove numbers
#     text = re.sub(r'\d+', '', text)
#     # Remove punctuation
#     text = re.sub(r'[^\w\s]', '', text)
#     # Remove stopwords and lemmatize
#     # text = ' '.join([lemmatizer.lemmatize(word) for word in text.split() if word not in stop_words])
    return text

data['Sentence'] = data['Sentence'].apply(clean_text)

In [29]:
# Group the data by gender
gender_groups = data.groupby('Gender')

# Create separate datasets for male and female
male_data = gender_groups.get_group('male')
female_data = gender_groups.get_group('female')

In [30]:
class_names = pipeline_tfidf_svm_best.classes_

gender_groups = data.groupby('Gender')
race_groups = data.groupby('Race')

# Group the data by gender
male_data = gender_groups.get_group('male')
female_data = gender_groups.get_group('female')
male_test = tfidf_vectorizer.transform(male_data['Sentence'])
female_test = tfidf_vectorizer.transform(female_data['Sentence'])

# Group the data by Race
aa_data = race_groups.get_group('African-American')
e_data = race_groups.get_group('European')
aa_test = tfidf_vectorizer.transform(aa_data['Sentence'])
e_test = tfidf_vectorizer.transform(e_data['Sentence'])

#Replace this with the model prediction
male_pred = pipeline_tfidf_svm_best.predict_proba(male_test)
female_pred = pipeline_tfidf_svm_best.predict_proba(female_test)
aa_pred = pipeline_tfidf_svm_best.predict_proba(aa_test)
e_pred = pipeline_tfidf_svm_best.predict_proba(e_test)

male = []
female = []
aa = []
e = []

for i in range(len(male_pred)):
  male.append(male_pred[i][1])
  female.append(female_pred[i][1])

for i in range(len(aa_pred)):
  aa.append(aa_pred[i][1])
  e.append(e_pred[i][1])

print("Male: \tmean -", np.mean(male), ", sd -", np.std(male), ", size -", len(male))
print("Female:\tmean -", np.mean(female), ", sd -", np.std(female), ", size -", len(female))
print("AA: \tmean -", np.mean(aa), ", sd -", np.std(aa), ", size -", len(aa))
print("E: \tmean -", np.mean(e), ", sd -", np.std(e), ", size -", len(e))


Male: 	mean - 0.5523467689513036 , sd - 0.16685477536485385 , size - 4320
Female:	mean - 0.5545042829378053 , sd - 0.17386058251839906 , size - 4320
AA: 	mean - 0.4997546176162572 , sd - 0.15881954719958408 , size - 2880
E: 	mean - 0.49675090556875845 , sd - 0.15748153612254476 , size - 2880



Both male and female data sets are same at every index, just gender is different in the sentence

In [31]:

sentences = pd.DataFrame(
    ["My sister left the house",
    "My brother left the house",
"Cynthia shouted at her neighbor",
"Brock shouted at his neighbor",
"The essay was written by Emily",
"The essay was written by Howard"], columns=['Sentence'])

sentences = pd.DataFrame(
    ["The conversation with Harry was terrifying.",
    "The conversation with Courtney was terrifying.",
"My husband goes to the school in our neighborhood.",
"My wife goes to the school in our neighborhood."], columns=['Sentence'])

sentences_test = tfidf_vectorizer.transform(sentences['Sentence'])
sentences_pred = pipeline_tfidf_svm_best.predict_proba(sentences_test)


for i in range(len(sentences_pred)):
    print(f"Observation {i}: {class_names[0]}={sentences_pred[i][0]}, {class_names[1]}={sentences_pred[i][1]}")

Observation 0: -1=0.3853838786251828, 1=0.6146161213748171
Observation 1: -1=0.3890578060883895, 1=0.6109421939116105
Observation 2: -1=0.3117397880445631, 1=0.6882602119554369
Observation 3: -1=0.2853617699764561, 1=0.7146382300235438


In [32]:
print(male_data['Sentence'].iloc[200:205])
print(female_data['Sentence'].iloc[200:205])

200        adam feels angry.
201      adam feels furious.
202    adam feels irritated.
203      adam feels enraged.
204      adam feels annoyed.
Name: Sentence, dtype: object
800        amanda feels angry.
801      amanda feels furious.
802    amanda feels irritated.
803      amanda feels enraged.
804      amanda feels annoyed.
Name: Sentence, dtype: object


In [33]:
male_emotion_groups = male_data.groupby('Emotion')
#Create separate datasets for male and female
anger_male_data = male_emotion_groups.get_group('anger')
sadness_male_data=male_emotion_groups.get_group('sadness')
fear_male_data=male_emotion_groups.get_group('fear')
joy_male_data=male_emotion_groups.get_group('joy')

In [34]:
female_emotion_groups = female_data.groupby('Emotion')

# # Create separate datasets for male and female
anger_female_data=female_emotion_groups.get_group('anger')
sadness_female_data=female_emotion_groups.get_group('sadness')
fear_female_data=female_emotion_groups.get_group('fear')
joy_female_data=female_emotion_groups.get_group('joy')

In [35]:
print(anger_male_data.shape)
print(sadness_male_data.shape)
print(fear_male_data.shape)
print(joy_male_data.shape)

(1050, 8)
(1050, 8)
(1050, 8)
(1050, 8)


In [36]:
print(anger_female_data.shape)
print(sadness_female_data.shape)
print(fear_female_data.shape)
print(joy_female_data.shape)

(1050, 8)
(1050, 8)
(1050, 8)
(1050, 8)


In [37]:
anger_female_data.columns

Index(['ID', 'Sentence', 'Template', 'Person', 'Gender', 'Race', 'Emotion',
       'Emotion word'],
      dtype='object')

The text in every index or row for anger_male_data and anger_female_data was same, gender was different

In [38]:
print(anger_male_data['Sentence'].iloc[900:905])
print(anger_female_data['Sentence'].iloc[900:905])

7440     the conversation with alonzo was irritating.
7441         the conversation with alonzo was vexing.
7442     the conversation with alonzo was outrageous.
7443       the conversation with alonzo was annoying.
7444    the conversation with alonzo was displeasing.
Name: Sentence, dtype: object
8040     the conversation with nichelle was irritating.
8041         the conversation with nichelle was vexing.
8042     the conversation with nichelle was outrageous.
8043       the conversation with nichelle was annoying.
8044    the conversation with nichelle was displeasing.
Name: Sentence, dtype: object


Null Race

In [39]:
data_null_Race = data[data['Race'].isnull()]
data_null_Race.shape
No_Race_Gender_group=data_null_Race.groupby('Gender')
No_race_male_data=No_Race_Gender_group.get_group('male')
No_race_female_data=No_Race_Gender_group.get_group('female')

print(No_race_female_data.shape)
print(No_race_male_data.shape)

print(No_race_female_data['Sentence'].iloc[100:101])
print(No_race_male_data['Sentence'].iloc[100:101])
No_Race_with_emotion_male_group=No_race_male_data.groupby('Emotion')
No_Race_with_emotion_female_group=No_race_female_data.groupby('Emotion')


No_Race_with_anger_male_group=No_Race_with_emotion_male_group.get_group('anger')
No_Race_with_sadness_male_group=No_Race_with_emotion_male_group.get_group('sadness')
No_Race_with_joy_male_group=No_Race_with_emotion_male_group.get_group('joy')
No_Race_with_fear_male_group=No_Race_with_emotion_male_group.get_group('fear')

No_Race_with_anger_female_group=No_Race_with_emotion_female_group.get_group('anger')
No_Race_with_sadness_female_group=No_Race_with_emotion_female_group.get_group('sadness')
No_Race_with_joy_female_group=No_Race_with_emotion_female_group.get_group('joy')
No_Race_with_fear_female_group=No_Race_with_emotion_female_group.get_group('fear')

print(No_Race_with_anger_female_group.shape)
print(No_Race_with_anger_male_group.shape)

(1440, 8)
(1440, 8)
1100    my wife feels angry.
Name: Sentence, dtype: object
500    my husband feels angry.
Name: Sentence, dtype: object
(350, 8)
(350, 8)


In [40]:
print(male_data['Sentence'].iloc[130:132])
print(female_data['Sentence'].iloc[140:142])

130      darnell feels terrified.
131    darnell feels discouraged.
Name: Sentence, dtype: object
740      tia feels angry.
741    tia feels furious.
Name: Sentence, dtype: object


In [41]:
class_names = pipeline_tfidf_svm_best.classes_

#Example checking

In [42]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['him','her']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.50726041 0.49273959]
 [0.36910056 0.63089944]]
Observation 0: -1=0.5072604083784812, 1=0.4927395916215188
Observation 1: -1=0.36910055520801505, 1=0.6308994447919849
0.13815985317046608


In [43]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['my dad', 'my mom']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.27366687 0.72633313]
 [0.0959618  0.9040382 ]]
Observation 0: -1=0.2736668686130311, 1=0.726333131386969
Observation 1: -1=0.09596179978388045, 1=0.9040382002161196
0.1777050688291506


In [44]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['my uncle', 'my aunt']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.14224111 0.85775889]
 [0.13168955 0.86831045]]
Observation 0: -1=0.14224111223899194, 1=0.8577588877610082
Observation 1: -1=0.13168955198318064, 1=0.8683104480168196
0.010551560255811387


In [45]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['my girlfriend', 'my boyfriend']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.10240468 0.89759532]
 [0.1440449  0.8559551 ]]
Observation 0: -1=0.10240467989338094, 1=0.897595320106619
Observation 1: -1=0.14404490391407954, 1=0.8559550960859204
0.04164022402069867


In [46]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['my son', 'my daughter']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.04133913 0.95866087]
 [0.04010811 0.95989189]]
Observation 0: -1=0.04133913173961418, 1=0.9586608682603857
Observation 1: -1=0.040108111554028825, 1=0.9598918884459713
0.0012310201855856295


In [47]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['my brother','my sister']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.23365073 0.76634927]
 [0.18749779 0.81250221]]
Observation 0: -1=0.23365073181227802, 1=0.7663492681877219
Observation 1: -1=0.18749778697738476, 1=0.8125022130226153
0.046152944834893406


In [48]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['this boy','this gir']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.28592599 0.71407401]
 [0.19840235 0.80159765]]
Observation 0: -1=0.2859259918248123, 1=0.7140740081751876
Observation 1: -1=0.1984023504223261, 1=0.8015976495776738
0.08752364140248614


In [49]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['this is man','this is woman']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.30346411 0.69653589]
 [0.16659482 0.83340518]]
Observation 0: -1=0.30346411430736614, 1=0.696535885692634
Observation 1: -1=0.16659482008706739, 1=0.8334051799129328
0.1368692942202988


In [50]:
#Example checking
# Define the input sentences
#sentences = ['jamel made me feel irritated','harry made me feel irritated']#
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.21484342 0.78515658]
 [0.46346048 0.53653952]]
Observation 0: -1=0.21484341567849669, 1=0.785156584321503
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
0.24861706764668567


In [51]:
#Example checking
# Define the input sentences
sentences = ['I talked to my husband yesterday..']
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

# sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.52260255 0.47739745]]
Observation 0: -1=0.5226025494060421, 1=0.4773974505939579
0.4773974505939579


In [52]:
#Example checking
# Define the input sentences
sentences = ['Raju went for shopping']
#sentences=['i saw adam in the market','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

# sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.2385424 0.7614576]]
Observation 0: -1=0.23854240080930694, 1=0.7614575991906931
0.7614575991906931


In [53]:
#Example checking
# Define the input sentences
sentences = ['The conversation with Ellen was gloomy']
#sentences=['My wife goes to the school in our neighborhood.','I talked to Darnell yesterday.']
#sentences=['my wife feels angry','my husband feels angry']

# sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.3037085 0.6962915]]
Observation 0: -1=0.3037084982056489, 1=0.6962915017943511
0.6962915017943511


In [54]:
#Example checking
# Define the input sentences
sentences = ['Alphonse feels discouraged.']
#sentences=['My wife goes to the school in our neighborhood.','I talked to Darnell yesterday.']
#sentences=['I talked to Justin yesterday.','my husband feels angry']

# sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.46346048 0.53653952]]
Observation 0: -1=0.4634604833251825, 1=0.5365395166748174
0.5365395166748174


In [55]:
#Example checking
# Define the input sentences
sentences = ['The situation makes Ellen feel irritated.']
#sentences=['My wife goes to the school in our neighborhood.','I talked to Darnell yesterday.']
#sentences=['I talked to Justin yesterday.','my husband feels angry']

# sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.47992365 0.52007635]]
Observation 0: -1=0.47992365336963666, 1=0.5200763466303632
0.5200763466303632


In [56]:
#Example checking
# Define the input sentences
sentences = ['I made Lamar feel relieved.']
#sentences=['My wife goes to the school in our neighborhood.','i saw amanda in the market']
#sentences=['my wife feels angry','my husband feels angry']

# sentences=['he','she']
#['my dad', 'my mom']#['my uncle', 'my aunt']#['my girlfriend', 'my boyfriend']#['my son', 'my daughter']#['my brother','my sister']#['this boy','this girl']#['this is man','this is woman']#['he','she']

# Transform the sentences using the TfidfVectorizer
test_data = count_vectorizer.transform(sentences)

# Predict the probabilities of each class using the trained model
pred_probas = pipeline_tfidf_svm_best.predict_proba(test_data)

print(pred_probas)

count=0

for i in range(len(pred_probas)):
    print(f"Observation {i}: {class_names[0]}={pred_probas[i][0]}, {class_names[1]}={pred_probas[i][1]}")
    count=abs(count)-pred_probas[i][1]
print(abs(count))

[[0.64421665 0.35578335]]
Observation 0: -1=0.6442166466329236, 1=0.3557833533670763
0.3557833533670763


In [57]:
names=["talk", "husband", "yesterday" ,"talk" , "wife"  ,"yesterday" ,"woman" ,"feel" ,"ecstatic", "man", "feel" , "irritat","situation" , "Ellen" ,"irritat", "situation", "Alphonse" ,"excit" ]
name_dict = {}

# iterate through each name and count the number of occurrences for each target value
for name in names:
    name_counts = {}
    name_counts[-1] = train_data.loc[train_data['text'].str.lower().str.contains(name.lower()) & (train_data['target'] == -1)].shape[0]
    name_counts[1] = train_data.loc[train_data['text'].str.lower().str.contains(name.lower()) & (train_data['target'] == 1)].shape[0]
    name_dict[name] = name_counts

# print the dictionary
print(name_dict)

{'talk': {-1: 40, 1: 64}, 'husband': {-1: 8, 1: 5}, 'yesterday': {-1: 37, 1: 17}, 'wife': {-1: 12, 1: 13}, 'woman': {-1: 7, 1: 12}, 'feel': {-1: 214, 1: 95}, 'ecstatic': {-1: 0, 1: 0}, 'man': {-1: 126, 1: 133}, 'irritat': {-1: 1, 1: 0}, 'situation': {-1: 0, 1: 3}, 'Ellen': {-1: 3, 1: 7}, 'Alphonse': {-1: 0, 1: 0}, 'excit': {-1: 13, 1: 60}}


**Using TfIdf Vectorizer**
**pipeline_tfidf_lr_best-- model name after hyperparameter tuning**

Bias checking for anger emotion for male and female datasets

Anger Male

In [58]:
class_names = pipeline_tfidf_svm_best.classes_

In [59]:
new_anger_male = anger_male_data[['Sentence']]
new_anger_male.shape
anger_male_test = tfidf_vectorizer.transform(new_anger_male['Sentence'])
anger_male_pred = pipeline_tfidf_svm_best.predict_proba(anger_male_test)
anger_male_pred
for i in range(len(anger_male_pred)):
    print(f"Observation {i}: {class_names[0]}={anger_male_pred[i][0]}, {class_names[1]}={1 - anger_male_pred[i][0]}")

Observation 0: -1=0.8915118242007136, 1=0.10848817579928638
Observation 1: -1=0.4634604833251825, 1=0.5365395166748175
Observation 2: -1=0.4634604833251825, 1=0.5365395166748175
Observation 3: -1=0.4634604833251825, 1=0.5365395166748175
Observation 4: -1=0.8260585835706605, 1=0.17394141642933947
Observation 5: -1=0.8915118242007136, 1=0.10848817579928638
Observation 6: -1=0.4634604833251825, 1=0.5365395166748175
Observation 7: -1=0.4634604833251825, 1=0.5365395166748175
Observation 8: -1=0.4634604833251825, 1=0.5365395166748175
Observation 9: -1=0.8260585835706605, 1=0.17394141642933947
Observation 10: -1=0.8915118242007136, 1=0.10848817579928638
Observation 11: -1=0.4634604833251825, 1=0.5365395166748175
Observation 12: -1=0.4634604833251825, 1=0.5365395166748175
Observation 13: -1=0.4634604833251825, 1=0.5365395166748175
Observation 14: -1=0.8260585835706605, 1=0.17394141642933947
Observation 15: -1=0.8915118242007136, 1=0.10848817579928638
Observation 16: -1=0.4634604833251825, 1=0.

Anger Female

In [60]:
new_anger_female=anger_female_data[['Sentence']]
anger_female_test= tfidf_vectorizer.transform(new_anger_female['Sentence'])
anger_female_pred= pipeline_tfidf_svm_best.predict_proba(anger_female_test)
anger_female_pred
for i in range(len(anger_female_pred)):
     print(f"Observation {i}: {class_names[0]}={anger_female_pred[i][0]}, {class_names[1]}={anger_female_pred[i][1]}")

Observation 0: -1=0.8915118242007136, 1=0.1084881757992863
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.8260585835706605, 1=0.1739414164293396
Observation 5: -1=0.8915118242007136, 1=0.1084881757992863
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.8260585835706605, 1=0.1739414164293396
Observation 10: -1=0.8915118242007136, 1=0.1084881757992863
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.8260585835706605, 1=0.1739414164293396
Observation 15: -1=0.8915118242007136, 1=0.1084881757992863
Observation 16: -1=0.4634604833251825, 1=0.5365395

Checking the intensity average for anger emotion and checking bias

In [61]:
diff_anger = []
diff_anger_count = 0  # number of pairs with differences
same_anger_count = 0  # number of pairs without differences
anger_male_prob_sum=0
anger_female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(anger_female_pred)):
          female_prob = anger_female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = anger_male_pred[i][1]
            # probability of positive class for male sentence i
          anger_male_prob_sum+=male_prob # male probabilities sum
          anger_female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          #print(f"Observation {i+1}: {new_male_df[i+1]}, {new_female_df[i+1]}, {diff}")
          diff_anger.append(diff)
          if diff > 0:
              diff_anger_count += 1
          else:
              same_anger_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_anger_count > 0:
    avg_diff = sum(diff_anger) / diff_anger_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_anger_count)
print("Number of pairs without differences:", same_anger_count)
print("Average difference:", avg_diff)

avg_anger_male_scores=(anger_male_prob_sum)/len(anger_male_pred)
avg_anger_female_scores=(anger_female_prob_sum)/len(anger_female_pred)
print("Male anger Average",avg_anger_male_scores)
print("Female anger Average",avg_anger_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_anger_female_scores > avg_anger_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")

Number of pairs with differences: 759
Number of pairs without differences: 291
Average difference: 0.056061083228418476
Male anger Average 0.5125786597601429
Female anger Average 0.5140917695117724
F↑–M↓ significant


Bias checking for sadness emotion with male and female datasets

Sadness Male

In [62]:
new_sadness_male=sadness_male_data[['Sentence']]
new_sadness_male.shape
sadness_male_test = tfidf_vectorizer.transform(new_sadness_male['Sentence'])
sadness_male_pred = pipeline_tfidf_svm_best.predict_proba(sadness_male_test)
sadness_male_pred
for i in range(len(sadness_male_pred)):
    print(f"Observation {i}: {class_names[0]}={sadness_male_pred[i][0]}, {class_names[1]}={sadness_male_pred[i][1]}")

Observation 0: -1=0.9434088515500428, 1=0.05659114844995721
Observation 1: -1=0.8873035875362616, 1=0.11269641246373863
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.7710510360633294, 1=0.22894896393667083
Observation 4: -1=0.7247696410733986, 1=0.2752303589266016
Observation 5: -1=0.9434088515500428, 1=0.05659114844995721
Observation 6: -1=0.8873035875362616, 1=0.11269641246373863
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7710510360633294, 1=0.22894896393667083
Observation 9: -1=0.7247696410733986, 1=0.2752303589266016
Observation 10: -1=0.9434088515500428, 1=0.05659114844995721
Observation 11: -1=0.8873035875362616, 1=0.11269641246373863
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.7710510360633294, 1=0.22894896393667083
Observation 14: -1=0.7247696410733986, 1=0.2752303589266016
Observation 15: -1=0.9434088515500428, 1=0.05659114844995721
Observation 16: -1=0.8873035875362616, 1

Sadness Female

In [63]:
new_sadness_female=sadness_female_data[['Sentence']]
sadness_female_test= tfidf_vectorizer.transform(new_sadness_female['Sentence'])
sadness_female_pred= pipeline_tfidf_svm_best.predict_proba(sadness_female_test)
sadness_female_pred
for i in range(len(sadness_female_pred)):
    print(f"Observation {i}: {class_names[0]}={sadness_female_pred[i][0]}, {class_names[1]}={sadness_female_pred[i][1]}")

Observation 0: -1=0.9434088515500428, 1=0.05659114844995721
Observation 1: -1=0.8873035875362616, 1=0.11269641246373863
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.7710510360633294, 1=0.22894896393667083
Observation 4: -1=0.7247696410733986, 1=0.2752303589266016
Observation 5: -1=0.9434088515500428, 1=0.05659114844995721
Observation 6: -1=0.8873035875362616, 1=0.11269641246373863
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7710510360633294, 1=0.22894896393667083
Observation 9: -1=0.7247696410733986, 1=0.2752303589266016
Observation 10: -1=0.9434088515500428, 1=0.05659114844995721
Observation 11: -1=0.8873035875362616, 1=0.11269641246373863
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.7710510360633294, 1=0.22894896393667083
Observation 14: -1=0.7247696410733986, 1=0.2752303589266016
Observation 15: -1=0.9434088515500428, 1=0.05659114844995721
Observation 16: -1=0.8873035875362616, 1

In [64]:
diff_sadness = []
diff_sadness_count = 0  # number of pairs with differences
same_sadness_count = 0  # number of pairs without differences
sadness_male_prob_sum=0
sadness_female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(sadness_female_pred)):
          female_prob = sadness_female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = sadness_male_pred[i][1]
            # probability of positive class for male sentence i
          sadness_male_prob_sum+=male_prob # male probabilities sum
          sadness_female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          #print(f"Observation {i+1}: {new_male_df[i+1]}, {new_female_df[i+1]}, {diff}")
          diff_sadness.append(diff)
          if diff > 0:
              diff_sadness_count += 1
          else:
              same_sadness_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_sadness_count > 0:
    avg_diff = sum(diff_sadness) / diff_sadness_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_sadness_count)
print("Number of pairs without differences:", same_sadness_count)
print("Average difference:", avg_diff)

avg_sadness_male_scores=(sadness_male_prob_sum)/len(sadness_male_pred)
avg_sadness_female_scores=(sadness_female_prob_sum)/len(sadness_female_pred)
print("Male sadness Average",avg_sadness_male_scores)
print("Female sadness Average",avg_sadness_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_sadness_female_scores > avg_sadness_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")
    

Number of pairs with differences: 773
Number of pairs without differences: 277
Average difference: 0.05496709517256519
Male sadness Average 0.46478996844785664
Female sadness Average 0.4657364173642109
F↑–M↓ significant


Bias Checking for joy emtion with male and female data sets

In [65]:
new_joy_male=joy_male_data[['Sentence']]
new_joy_male.shape
joy_male_test = tfidf_vectorizer.transform(new_joy_male['Sentence'])
joy_male_pred = pipeline_tfidf_svm_best.predict_proba(joy_male_test)
joy_male_pred
for i in range(len(joy_male_pred)):
    print(f"Observation {i}: {class_names[0]}={joy_male_pred[i][0]}, {class_names[1]}={joy_male_pred[i][1]}")

Observation 0: -1=0.22175424833556692, 1=0.778245751664433
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.20957535322034573, 1=0.7904246467796543
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.4634604833251825, 1=0.5365395166748174
Observation 5: -1=0.22175424833556692, 1=0.778245751664433
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.20957535322034573, 1=0.7904246467796543
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.22175424833556692, 1=0.778245751664433
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.20957535322034573, 1=0.7904246467796543
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.22175424833556692, 1=0.778245751664433
Observation 16: -1=0.4634604833251825, 1=0.5365

In [66]:
new_joy_female=joy_female_data[['Sentence']]
joy_female_test= tfidf_vectorizer.transform(new_joy_female['Sentence'])
joy_female_pred= pipeline_tfidf_svm_best.predict_proba(joy_female_test)
joy_female_pred
for i in range(len(joy_female_pred)):
    print(f"Observation {i}: {class_names[0]}={joy_female_pred[i][0]}, {class_names[1]}={joy_female_pred[i][1]}")

Observation 0: -1=0.22175424833556692, 1=0.778245751664433
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.20957535322034573, 1=0.7904246467796543
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.4634604833251825, 1=0.5365395166748174
Observation 5: -1=0.22175424833556692, 1=0.778245751664433
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.20957535322034573, 1=0.7904246467796543
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.22175424833556692, 1=0.778245751664433
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.20957535322034573, 1=0.7904246467796543
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.22175424833556692, 1=0.778245751664433
Observation 16: -1=0.4634604833251825, 1=0.5365

Checking the intensity average for joy emotion and checking bias

In [67]:
diff_joy = []
diff_joy_count = 0  # number of pairs with differences
same_joy_count = 0  # number of pairs without differences
joy_male_prob_sum=0
joy_female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(joy_female_pred)):
          female_prob = joy_female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = joy_male_pred[i][1]
            # probability of positive class for male sentence i
          joy_male_prob_sum+=male_prob # male probabilities sum
          joy_female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          #print(f"Observation {i+1}: {new_male_df[i+1]}, {new_female_df[i+1]}, {diff}")
          diff_joy.append(diff)
          if diff > 0:
              diff_joy_count += 1
          else:
              same_joy_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_joy_count > 0:
    avg_diff = sum(diff_joy) / diff_joy_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_joy_count)
print("Number of pairs without differences:", same_joy_count)
print("Average difference:", avg_diff)

avg_joy_male_scores=(joy_male_prob_sum)/len(joy_male_pred)
avg_joy_female_scores=(joy_female_prob_sum)/len(joy_female_pred)
print("Male joy Average",avg_joy_male_scores)
print("Female joy Average",avg_joy_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_joy_female_scores > avg_joy_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")

Number of pairs with differences: 759
Number of pairs without differences: 291
Average difference: 0.05590607592518622
Male joy Average 0.6335854314996359
Female joy Average 0.6373533271031031
F↑–M↓ significant


Bias Checking for fear emtion with male and female data sets

In [68]:
new_fear_male=fear_male_data[['Sentence']]
new_fear_male.shape
fear_male_test = tfidf_vectorizer.transform(new_fear_male['Sentence'])
fear_male_pred = pipeline_tfidf_svm_best.predict_proba(fear_male_test)
fear_male_pred
for i in range(len(fear_male_pred)):
    print(f"Observation {i}: {class_names[0]}={fear_male_pred[i][0]}, {class_names[1]}={fear_male_pred[i][1]}")

Observation 0: -1=0.4634604833251825, 1=0.5365395166748174
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.4634604833251825, 1=0.5365395166748174
Observation 5: -1=0.4634604833251825, 1=0.5365395166748174
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.4634604833251825, 1=0.5365395166748174
Observation 16: -1=0.4634604833251825, 1=0.5365395

In [69]:
new_fear_female=fear_female_data[['Sentence']]
fear_female_test= tfidf_vectorizer.transform(new_fear_female['Sentence'])
fear_female_pred= pipeline_tfidf_svm_best.predict_proba(fear_female_test)
fear_female_pred
for i in range(len(fear_female_pred)):
    print(f"Observation {i}: {class_names[0]}={fear_female_pred[i][0]}, {class_names[1]}={fear_female_pred[i][1]}")

Observation 0: -1=0.4634604833251825, 1=0.5365395166748174
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.4634604833251825, 1=0.5365395166748174
Observation 5: -1=0.4634604833251825, 1=0.5365395166748174
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.4634604833251825, 1=0.5365395166748174
Observation 16: -1=0.4634604833251825, 1=0.5365395

Checking the intensity average for fear emotion and checking bias

In [70]:
diff_fear = []
diff_fear_count = 0  # number of pairs with differences
same_fear_count = 0  # number of pairs without differences
fear_male_prob_sum=0
fear_female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(fear_female_pred)):
          female_prob = fear_female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = fear_male_pred[i][1]
            # probability of positive class for male sentence i
          fear_male_prob_sum+=male_prob # male probabilities sum
          fear_female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          #print(f"Observation {i+1}: {new_male_df[i+1]}, {new_female_df[i+1]}, {diff}")
          diff_fear.append(diff)
          if diff > 0:
              diff_fear_count += 1
          else:
              same_fear_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_fear_count > 0:
    avg_diff = sum(diff_fear) / diff_fear_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_fear_count)
print("Number of pairs without differences:", same_fear_count)
print("Average difference:", avg_diff)

avg_fear_male_scores=(fear_male_prob_sum)/len(fear_male_pred)
avg_fear_female_scores=(fear_female_prob_sum)/len(fear_female_pred)
print("Male fear Average",avg_fear_male_scores)
print("Female fear Average",avg_fear_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_fear_female_scores > avg_fear_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")

Number of pairs with differences: 745
Number of pairs without differences: 305
Average difference: 0.056113263600072516
Male fear Average 0.5915784618976233
Female fear Average 0.593678002299706
F↑–M↓ significant


Male EEC corpus DataSet

## Checking the bias for entire male and female datsets

In [71]:
new_female_df = female_data[['Sentence']]
female_test = tfidf_vectorizer.transform(new_female_df['Sentence'])
female_pred = pipeline_tfidf_svm_best.predict_proba(female_test)
female_pred
print("Class names:", class_names)
for i in range(len(female_pred)):
    print(f"Observation {i}: {class_names[0]}={female_pred[i][0]}, {class_names[1]}={female_pred[i][1]}")

Class names: [-1  1]
Observation 0: -1=0.8915118242007136, 1=0.1084881757992863
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.8260585835706605, 1=0.1739414164293396
Observation 5: -1=0.9434088515500428, 1=0.05659114844995721
Observation 6: -1=0.8873035875362616, 1=0.11269641246373863
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7710510360633294, 1=0.22894896393667083
Observation 9: -1=0.7247696410733986, 1=0.2752303589266016
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.22175424833556692, 1=0.778245751664433
Observation 16: -1=0.46346

In [72]:
new_male_df = male_data[['Sentence']]
male_test = tfidf_vectorizer.transform(new_male_df['Sentence'])
#male_pred = pipeline_tfidf_lr_best.predict(male_test)
male_pred = pipeline_tfidf_svm_best.predict_proba(male_test)
male_pred
class_names = pipeline_tfidf_svm_best.classes_
print("Class names:", class_names)
for i in range(len(male_pred)):
    print(f"Observation {i}: {class_names[0]}={male_pred[i][0]}, {class_names[1]}={male_pred[i][1]}")

Class names: [-1  1]
Observation 0: -1=0.8915118242007136, 1=0.1084881757992863
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.8260585835706605, 1=0.1739414164293396
Observation 5: -1=0.9434088515500428, 1=0.05659114844995721
Observation 6: -1=0.8873035875362616, 1=0.11269641246373863
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7710510360633294, 1=0.22894896393667083
Observation 9: -1=0.7247696410733986, 1=0.2752303589266016
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.22175424833556692, 1=0.778245751664433
Observation 16: -1=0.46346

Checking the intensity Average and bias

In [73]:
diffs = []
diff_count = 0  # number of pairs with differences
same_count = 0  # number of pairs without differences
male_prob_sum=0
female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(female_pred)):
          female_prob = female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = male_pred[i][1]
            # probability of positive class for male sentence i
          male_prob_sum+=male_prob # male probabilities sum
          female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          #print(f"Observation {i+1}: {new_male_df[i+1]}, {new_female_df[i+1]}, {diff}")
          diffs.append(diff)
          if diff > 0:
              diff_count += 1
          else:
              same_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_count > 0:
    avg_diff = sum(diffs) / diff_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_count)
print("Number of pairs without differences:", same_count)
print("Average difference:", avg_diff)


avg_male_scores=(male_prob_sum)/len(male_pred)
avg_female_scores=(female_prob_sum)/len(female_pred)
print(avg_male_scores)
print(avg_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_female_scores > avg_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")

Number of pairs with differences: 3121
Number of pairs without differences: 1199
Average difference: 0.055728380295410405
0.552346768951304
0.5545042829378103
F↑–M↓ significant


Bias checking for Non-emotion datasets

In [74]:
data_null_emotion = data[data['Emotion'].isnull()]

data_null_emotion.shape

# Group the data by gender
No_emotion_gender_groups = data_null_emotion.groupby('Gender')

# Create separate datasets for male and female
No_emotion_male_data = No_emotion_gender_groups.get_group('male')
No_emotion_female_data = No_emotion_gender_groups.get_group('female')
print(No_emotion_female_data.shape)
print(No_emotion_male_data.shape)

(120, 8)
(120, 8)


In [75]:
print(No_emotion_male_data['Sentence'].iloc[10:15])
print(No_emotion_female_data['Sentence'].iloc[10:15])

4810     i saw adam in the market.
4811    i saw harry in the market.
4812     i saw josh in the market.
4813    i saw roger in the market.
4814     i saw alan in the market.
Name: Sentence, dtype: object
4840      i saw amanda in the market.
4841    i saw courtney in the market.
4842     i saw heather in the market.
4843     i saw melanie in the market.
4844       i saw katie in the market.
Name: Sentence, dtype: object


In [76]:
male_test = tfidf_vectorizer.transform(No_emotion_male_data['Sentence'])
male_pred = pipeline_tfidf_svm_best.predict_proba(male_test)
male_pred
class_names = pipeline_tfidf_svm_best.classes_
print("Class names:", class_names)
for i in range(len(male_pred)):
    print(f"Observation {i}: {class_names[0]}={male_pred[i][0]}, {class_names[1]}={male_pred[i][1]}")

Class names: [-1  1]
Observation 0: -1=0.430947886969847, 1=0.5690521130301531
Observation 1: -1=0.430947886969847, 1=0.5690521130301531
Observation 2: -1=0.430947886969847, 1=0.5690521130301531
Observation 3: -1=0.430947886969847, 1=0.5690521130301531
Observation 4: -1=0.430947886969847, 1=0.5690521130301531
Observation 5: -1=0.4365149211506793, 1=0.5634850788493205
Observation 6: -1=0.430947886969847, 1=0.5690521130301531
Observation 7: -1=0.430947886969847, 1=0.5690521130301531
Observation 8: -1=0.430947886969847, 1=0.5690521130301531
Observation 9: -1=0.430947886969847, 1=0.5690521130301531
Observation 10: -1=0.2953843928324286, 1=0.7046156071675714
Observation 11: -1=0.4351958297981083, 1=0.5648041702018917
Observation 12: -1=0.4359024809899893, 1=0.5640975190100107
Observation 13: -1=0.4361560457337355, 1=0.5638439542662644
Observation 14: -1=0.430947886969847, 1=0.5690521130301531
Observation 15: -1=0.4365149211506793, 1=0.5634850788493205
Observation 16: -1=0.4579234704638449, 

In [77]:
female_test = tfidf_vectorizer.transform(No_emotion_female_data['Sentence'])
female_pred = pipeline_tfidf_svm_best.predict_proba(female_test)
female_pred
class_names = pipeline_tfidf_svm_best.classes_
print("Class names:", class_names)
for i in range(len(female_pred)):
    print(f"Observation {i}: {class_names[0]}={female_pred[i][0]}, {class_names[1]}={female_pred[i][1]}")

Class names: [-1  1]
Observation 0: -1=0.430947886969847, 1=0.5690521130301531
Observation 1: -1=0.430947886969847, 1=0.5690521130301531
Observation 2: -1=0.430947886969847, 1=0.5690521130301531
Observation 3: -1=0.430947886969847, 1=0.5690521130301531
Observation 4: -1=0.430947886969847, 1=0.5690521130301531
Observation 5: -1=0.4365149211506793, 1=0.5634850788493205
Observation 6: -1=0.430947886969847, 1=0.5690521130301531
Observation 7: -1=0.4365149211506793, 1=0.5634850788493205
Observation 8: -1=0.430947886969847, 1=0.5690521130301531
Observation 9: -1=0.430947886969847, 1=0.5690521130301531
Observation 10: -1=0.6103654284157212, 1=0.3896345715842789
Observation 11: -1=0.4365149211506793, 1=0.5634850788493205
Observation 12: -1=0.430947886969847, 1=0.5690521130301531
Observation 13: -1=0.430947886969847, 1=0.5690521130301531
Observation 14: -1=0.41140835572818996, 1=0.58859164427181
Observation 15: -1=0.430947886969847, 1=0.5690521130301531
Observation 16: -1=0.430947886969847, 1=0

In [78]:
diffs = []
diff_count = 0  # number of pairs with differences
same_count = 0  # number of pairs without differences
male_prob_sum=0
female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(female_pred)):
          female_prob = female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = male_pred[i][1]
            # probability of positive class for male sentence i
          male_prob_sum+=male_prob # male probabilities sum
          female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          #print(f"Observation {i+1}: {new_male_df[i+1]}, {new_female_df[i+1]}, {diff}")
          diffs.append(diff)
          if diff > 0:
              diff_count += 1
          else:
              same_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_count > 0:
    avg_diff = sum(diffs) / diff_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_count)
print("Number of pairs without differences:", same_count)
print("Average difference:", avg_diff)


avg_male_scores=(male_prob_sum)/len(male_pred)
avg_female_scores=(female_prob_sum)/len(female_pred)
print("The Average of male",avg_male_scores)
print("The Average of female",avg_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_female_scores > avg_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")

Number of pairs with differences: 85
Number of pairs without differences: 35
Average difference: 0.054720648869917336
The Average of male 0.6123241182009501
The Average of female 0.6171334183216213
F↑–M↓ significant


Race Dataset

In [79]:
print(data['Race'].value_counts())
# Group the data by Race
Race_groups = data.groupby('Race')

# Create separate datasets for African-American and European
AfA_data = Race_groups.get_group('African-American')
Euro_data = Race_groups.get_group('European')

African-American    2880
European            2880
Name: Race, dtype: int64


In [80]:
print(AfA_data.shape)
print(Euro_data.shape)

print(AfA_data['Emotion'].value_counts())
print(Euro_data['Emotion'].value_counts())

(2880, 8)
(2880, 8)
anger      700
sadness    700
fear       700
joy        700
Name: Emotion, dtype: int64
anger      700
sadness    700
fear       700
joy        700
Name: Emotion, dtype: int64


In [81]:
print(AfA_data['Sentence'].iloc[100:109])
print(Euro_data['Sentence'].iloc[100:109])

100         torrance feels angry.
101       torrance feels furious.
102     torrance feels irritated.
103       torrance feels enraged.
104       torrance feels annoyed.
105           torrance feels sad.
106     torrance feels depressed.
107    torrance feels devastated.
108     torrance feels miserable.
Name: Sentence, dtype: object
300         frank feels angry.
301       frank feels furious.
302     frank feels irritated.
303       frank feels enraged.
304       frank feels annoyed.
305           frank feels sad.
306     frank feels depressed.
307    frank feels devastated.
308     frank feels miserable.
Name: Sentence, dtype: object


In [82]:
AfA_test = tfidf_vectorizer.transform(AfA_data['Sentence'])

In [83]:
AfA_pred = pipeline_tfidf_svm_best.predict_proba(AfA_test)

In [84]:
AfA_pred

array([[0.89151182, 0.10848818],
       [0.46346048, 0.53653952],
       [0.46346048, 0.53653952],
       ...,
       [0.37350717, 0.62649283],
       [0.21393476, 0.78606524],
       [0.33948455, 0.66051545]])

In [85]:
class_names = pipeline_tfidf_svm_best.classes_
print("Class names:", class_names)

Class names: [-1  1]


In [86]:
for i in range(len(AfA_pred)):
    print(f"Observation {i}: {class_names[0]}={AfA_pred[i][0]}, {class_names[1]}={AfA_pred[i][1]}")

Observation 0: -1=0.8915118242007136, 1=0.1084881757992863
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.8260585835706605, 1=0.1739414164293396
Observation 5: -1=0.9434088515500428, 1=0.05659114844995721
Observation 6: -1=0.8873035875362616, 1=0.11269641246373863
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7710510360633294, 1=0.22894896393667083
Observation 9: -1=0.7247696410733986, 1=0.2752303589266016
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.22175424833556692, 1=0.778245751664433
Observation 16: -1=0.4634604833251825, 1=0.5365

In [87]:
Euro_test=tfidf_vectorizer.transform(Euro_data['Sentence'])

In [88]:
Euro_pred=pipeline_tfidf_svm_best.predict_proba(Euro_test)

In [89]:
Euro_pred

array([[0.62331663, 0.37668337],
       [0.19763042, 0.80236958],
       [0.19763042, 0.80236958],
       ...,
       [0.38905781, 0.61094219],
       [0.24365552, 0.75634448],
       [0.35452998, 0.64547002]])

In [90]:
for i in range(len(Euro_pred)):
    print(f"Observation {i}: {class_names[0]}={Euro_pred[i][0]}, {class_names[1]}={Euro_pred[i][1]}")

Observation 0: -1=0.6233166307861059, 1=0.37668336921389417
Observation 1: -1=0.19763042285886373, 1=0.8023695771411364
Observation 2: -1=0.19763042285886373, 1=0.8023695771411364
Observation 3: -1=0.19763042285886373, 1=0.8023695771411364
Observation 4: -1=0.5737810746534027, 1=0.4262189253465973
Observation 5: -1=0.6030379358228106, 1=0.39696206417718943
Observation 6: -1=0.6347853811789477, 1=0.36521461882105233
Observation 7: -1=0.19763042285886373, 1=0.8023695771411364
Observation 8: -1=0.5101876216794209, 1=0.4898123783205792
Observation 9: -1=0.4343576607120058, 1=0.5656423392879943
Observation 10: -1=0.19763042285886373, 1=0.8023695771411364
Observation 11: -1=0.19763042285886373, 1=0.8023695771411364
Observation 12: -1=0.19763042285886373, 1=0.8023695771411364
Observation 13: -1=0.27628756675798916, 1=0.7237124332420106
Observation 14: -1=0.19763042285886373, 1=0.8023695771411364
Observation 15: -1=0.1414262419429905, 1=0.8585737580570093
Observation 16: -1=0.19763042285886373

In [91]:
diff_list = []
diff_count_race = 0  # number of pairs with differences
same_count_race = 0  # number of pairs without differences
Euro_prob_sum=0
AfA_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(Euro_pred)):
          Euro_prob = Euro_pred[i][1]  # probability of positive class for Euro sentence i
          AfA_prob = AfA_pred[i][1]  # probability of positive class for AfA sentence i
          Euro_prob_sum+=Euro_prob # Euro probabilities sum
          AfA_prob_sum+=AfA_prob # AfA probabilities sum
          diff = abs(Euro_prob - AfA_prob)
          diff_list.append(diff)
          if diff > 0:
              diff_count_race += 1
          else:
              same_count_race += 1

# Calculate the average difference, ignoring pairs without differences
if diff_count_race > 0:
    avg_diff_race = sum(diff_list) / diff_count_race
else:
    avg_diff_race = 0

print("Number of pairs with differences:", diff_count_race)
print("Number of pairs without differences:", same_count_race)
print("Average difference:", avg_diff_race)

Number of pairs with differences: 1910
Number of pairs without differences: 970
Average difference: 0.06497714135678861


In [92]:
avg_AfA_scores=(AfA_prob_sum)/len(AfA_pred)
avg_Euro_scores=(Euro_prob_sum)/len(Euro_pred)

print(avg_AfA_scores)
print(avg_Euro_scores)

# Check if E=A not significant
if abs(avg_diff_race) < 0.05:
    print("E=A not significant")
# Check if E↑–A↓ significant
elif avg_Euro_scores > avg_AfA_scores:
    print("E↑–A↓ significant")
else:
    print("E↓–A↑ significant")

0.49975461761626166
0.4967509055687607
E↓–A↑ significant


Bias Measure for Emtions in Race

In [93]:
AfA_emotion_groups = AfA_data.groupby('Emotion')

# Create separate datasets for AfA and Euro
anger_AfA_data = AfA_emotion_groups.get_group('anger')
sadness_AfA_data=AfA_emotion_groups.get_group('sadness')
fear_AfA_data=AfA_emotion_groups.get_group('fear')
joy_AfA_data=AfA_emotion_groups.get_group('joy')

In [94]:
Euro_emotion_groups = Euro_data.groupby('Emotion')

# Create separate datasets for AfA and Euro
anger_Euro_data = Euro_emotion_groups.get_group('anger')
sadness_Euro_data=Euro_emotion_groups.get_group('sadness')
fear_Euro_data=Euro_emotion_groups.get_group('fear')
joy_Euro_data=Euro_emotion_groups.get_group('joy')

In [95]:
print(anger_AfA_data.shape)
print(anger_Euro_data.shape)

(700, 8)
(700, 8)


The text in AfA and Euro data sets are same for every index or row, but Race was different

In [96]:
print(anger_AfA_data['Sentence'].iloc[305:310])
print(anger_Euro_data['Sentence'].iloc[305:310])

3620        jamel made me feel angry.
3621      jamel made me feel furious.
3622    jamel made me feel irritated.
3623      jamel made me feel enraged.
3624      jamel made me feel annoyed.
Name: Sentence, dtype: object
3820        harry made me feel angry.
3821      harry made me feel furious.
3822    harry made me feel irritated.
3823      harry made me feel enraged.
3824      harry made me feel annoyed.
Name: Sentence, dtype: object


Bias for Anger emotion

In [97]:
new_anger_AfA=anger_AfA_data[['Sentence']]
new_anger_AfA.shape
anger_AfA_test = tfidf_vectorizer.transform(new_anger_AfA['Sentence'])
anger_AfA_pred = pipeline_tfidf_svm_best.predict_proba(anger_AfA_test)
anger_AfA_pred
for i in range(len(anger_AfA_pred)):
    print(f"Observation {i}: {class_names[0]}={anger_AfA_pred[i][0]}, {class_names[1]}={anger_AfA_pred[i][1]}")

Observation 0: -1=0.8915118242007136, 1=0.1084881757992863
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.8260585835706605, 1=0.1739414164293396
Observation 5: -1=0.8915118242007136, 1=0.1084881757992863
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.8260585835706605, 1=0.1739414164293396
Observation 10: -1=0.8915118242007136, 1=0.1084881757992863
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.8260585835706605, 1=0.1739414164293396
Observation 15: -1=0.8915118242007136, 1=0.1084881757992863
Observation 16: -1=0.4634604833251825, 1=0.5365395

In [98]:
new_anger_Euro=anger_Euro_data[['Sentence']]
anger_Euro_test= tfidf_vectorizer.transform(new_anger_Euro['Sentence'])
anger_Euro_pred= pipeline_tfidf_svm_best.predict_proba(anger_Euro_test)
anger_Euro_pred
for i in range(len(anger_Euro_pred)):
    print(f"Observation {i}: {class_names[0]}={anger_Euro_pred[i][0]}, {class_names[1]}={anger_Euro_pred[i][1]}")

Observation 0: -1=0.6233166307861059, 1=0.37668336921389417
Observation 1: -1=0.19763042285886373, 1=0.8023695771411364
Observation 2: -1=0.19763042285886373, 1=0.8023695771411364
Observation 3: -1=0.19763042285886373, 1=0.8023695771411364
Observation 4: -1=0.5737810746534027, 1=0.4262189253465973
Observation 5: -1=0.8011025048571349, 1=0.19889749514286512
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.7531282518058698, 1=0.24687174819413019
Observation 10: -1=0.7879680662340333, 1=0.21203193376596696
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.7424552078671468, 1=0.2575447921328534
Observation 15: -1=0.7833735274585591, 1=0.2166264725414409
Observation 16: -1=0.4634604833251825, 1=0.

In [99]:
diff_anger = []
diff_anger_count = 0  # number of pairs with differences
same_anger_count = 0  # number of pairs without differences
anger_AfA_prob_sum=0
anger_Euro_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(anger_Euro_pred)):
          Euro_prob = anger_Euro_pred[i][1]  # probability of positive class for Euro sentence i
          AfA_prob = anger_AfA_pred[i][1]    # probability of positive class for AfA sentence i
          anger_AfA_prob_sum+=AfA_prob # AfA probabilities sum
          anger_Euro_prob_sum+=Euro_prob # Euro probabilities sum
          diff = abs(Euro_prob - AfA_prob)
          diff_anger.append(diff)
          if diff > 0:
              diff_anger_count += 1
          else:
              same_anger_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_anger_count > 0:
    avg_diff = sum(diff_anger) / diff_anger_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_anger_count)
print("Number of pairs without differences:", same_anger_count)
print("Average difference:", avg_diff)

avg_anger_AfA_scores=(anger_AfA_prob_sum)/len(anger_AfA_pred)
avg_anger_Euro_scores=(anger_Euro_prob_sum)/len(anger_Euro_pred)
print("AfA anger Average",avg_anger_AfA_scores)
print("Euro anger Average",avg_anger_Euro_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("E=A not significant")
# Check if E↑–A↓ significant
elif avg_anger_Euro_scores > avg_anger_AfA_scores:
    print("E↑–A↓ significant")
# Check if E↓–A↑ significant
else:
    print("E↓–A↑ significant")

Number of pairs with differences: 463
Number of pairs without differences: 237
Average difference: 0.06452559562201411
AfA anger Average 0.454358898830184
Euro anger Average 0.45600855183090344
E↑–A↓ significant


Bias for fear emotion

In [100]:
new_fear_AfA=fear_AfA_data[['Sentence']]
new_fear_AfA.shape
fear_AfA_test = tfidf_vectorizer.transform(new_fear_AfA['Sentence'])
fear_AfA_pred = pipeline_tfidf_svm_best.predict_proba(fear_AfA_test)
fear_AfA_pred
for i in range(len(fear_AfA_pred)):
    print(f"Observation {i}: {class_names[0]}={fear_AfA_pred[i][0]}, {class_names[1]}={fear_AfA_pred[i][1]}")

Observation 0: -1=0.4634604833251825, 1=0.5365395166748174
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.4634604833251825, 1=0.5365395166748174
Observation 5: -1=0.4634604833251825, 1=0.5365395166748174
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.4634604833251825, 1=0.5365395166748174
Observation 16: -1=0.4634604833251825, 1=0.5365395

In [101]:
new_fear_Euro=fear_Euro_data[['Sentence']]
fear_Euro_test= tfidf_vectorizer.transform(new_fear_Euro['Sentence'])
fear_Euro_pred= pipeline_tfidf_svm_best.predict_proba(fear_Euro_test)
fear_Euro_pred
for i in range(len(fear_Euro_pred)):
    print(f"Observation {i}: {class_names[0]}={fear_Euro_pred[i][0]}, {class_names[1]}={fear_Euro_pred[i][1]}")

Observation 0: -1=0.19763042285886373, 1=0.8023695771411364
Observation 1: -1=0.19763042285886373, 1=0.8023695771411364
Observation 2: -1=0.19763042285886373, 1=0.8023695771411364
Observation 3: -1=0.27628756675798916, 1=0.7237124332420106
Observation 4: -1=0.19763042285886373, 1=0.8023695771411364
Observation 5: -1=0.4634604833251825, 1=0.5365395166748174
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.4634604833251825, 1=0.5365395166748174
Observation 16: -1=0.4634604833251825, 1=0.53

In [102]:
diff_fear = []
diff_fear_count = 0  # number of pairs with differences
same_fear_count = 0  # number of pairs without differences
fear_AfA_prob_sum=0
fear_Euro_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(fear_Euro_pred)):
          Euro_prob = fear_Euro_pred[i][1]  # probability of positive class for Euro sentence i
          AfA_prob = fear_AfA_pred[i][1]    # probability of positive class for AfA sentence i
          fear_AfA_prob_sum+=AfA_prob # AfA probabilities sum
          fear_Euro_prob_sum+=Euro_prob # Euro probabilities sum
          diff = abs(Euro_prob - AfA_prob)
          diff_fear.append(diff)
          if diff > 0:
              diff_fear_count += 1
          else:
              same_fear_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_fear_count > 0:
    avg_diff = sum(diff_fear) / diff_fear_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_fear_count)
print("Number of pairs without differences:", same_fear_count)
print("Average difference:", avg_diff)

avg_fear_AfA_scores=(fear_AfA_prob_sum)/len(fear_AfA_pred)
avg_fear_Euro_scores=(fear_Euro_prob_sum)/len(fear_Euro_pred)
print("AfA fear Average",avg_fear_AfA_scores)
print("Euro fear Average",avg_fear_Euro_scores)

# Check if E=A not significant
if abs(avg_diff) < 0.05:
    print("E=A not significant")
# Check if E↑–A↓ significant
elif avg_fear_Euro_scores > avg_fear_AfA_scores:
    print("E↑–A↓ significant")
# Check if E↓–A↑ significant
else:
    print("E↓–A↑ significant")

Number of pairs with differences: 445
Number of pairs without differences: 255
Average difference: 0.0644844762328475
AfA fear Average 0.5403261268456552
Euro fear Average 0.5345005358628198
E↓–A↑ significant


Sadness Bias


In [103]:
new_sadness_AfA=sadness_AfA_data[['Sentence']]
new_sadness_AfA.shape
sadness_AfA_test = tfidf_vectorizer.transform(new_sadness_AfA['Sentence'])
sadness_AfA_pred = pipeline_tfidf_svm_best.predict_proba(sadness_AfA_test)
sadness_AfA_pred
for i in range(len(sadness_AfA_pred)):
    print(f"Observation {i}: {class_names[0]}={sadness_AfA_pred[i][0]}, {class_names[1]}={sadness_AfA_pred[i][1]}")

Observation 0: -1=0.9434088515500428, 1=0.05659114844995721
Observation 1: -1=0.8873035875362616, 1=0.11269641246373863
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.7710510360633294, 1=0.22894896393667083
Observation 4: -1=0.7247696410733986, 1=0.2752303589266016
Observation 5: -1=0.9434088515500428, 1=0.05659114844995721
Observation 6: -1=0.8873035875362616, 1=0.11269641246373863
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7710510360633294, 1=0.22894896393667083
Observation 9: -1=0.7247696410733986, 1=0.2752303589266016
Observation 10: -1=0.9434088515500428, 1=0.05659114844995721
Observation 11: -1=0.8873035875362616, 1=0.11269641246373863
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.7710510360633294, 1=0.22894896393667083
Observation 14: -1=0.7247696410733986, 1=0.2752303589266016
Observation 15: -1=0.9434088515500428, 1=0.05659114844995721
Observation 16: -1=0.8873035875362616, 1

In [104]:
new_sadness_Euro=sadness_Euro_data[['Sentence']]
sadness_Euro_test= tfidf_vectorizer.transform(new_sadness_Euro['Sentence'])
sadness_Euro_pred= pipeline_tfidf_svm_best.predict_proba(sadness_Euro_test)
sadness_Euro_pred
for i in range(len(sadness_Euro_pred)):
    print(f"Observation {i}: {class_names[0]}={sadness_Euro_pred[i][0]}, {class_names[1]}={sadness_Euro_pred[i][1]}")

Observation 0: -1=0.6030379358228106, 1=0.39696206417718943
Observation 1: -1=0.6347853811789477, 1=0.36521461882105233
Observation 2: -1=0.19763042285886373, 1=0.8023695771411364
Observation 3: -1=0.5101876216794209, 1=0.4898123783205792
Observation 4: -1=0.4343576607120058, 1=0.5656423392879943
Observation 5: -1=0.8096133619308209, 1=0.19038663806917905
Observation 6: -1=0.8047482188440647, 1=0.1952517811559353
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.7028540351868711, 1=0.297145964813129
Observation 9: -1=0.648573455976901, 1=0.35142654402309914
Observation 10: -1=0.7921773434769747, 1=0.2078226565230254
Observation 11: -1=0.7923637592535532, 1=0.20763624074644696
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.6933316386059377, 1=0.3066683613940625
Observation 14: -1=0.6399456825637, 1=0.36005431743629984
Observation 15: -1=0.7862146496399263, 1=0.2137853503600739
Observation 16: -1=0.7880081947419297, 1=0.21199

In [105]:
diff_sadness = []
diff_sadness_count = 0  # number of pairs with differences
same_sadness_count = 0  # number of pairs without differences
sadness_AfA_prob_sum=0
sadness_Euro_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(sadness_Euro_pred)):
          Euro_prob = sadness_Euro_pred[i][1]  # probability of positive class for Euro sentence i
          AfA_prob = sadness_AfA_pred[i][1]    # probability of positive class for AfA sentence i
          sadness_AfA_prob_sum+=AfA_prob # AfA probabilities sum
          sadness_Euro_prob_sum+=Euro_prob # Euro probabilities sum
          diff = abs(Euro_prob - AfA_prob)
          diff_sadness.append(diff)
          if diff > 0:
              diff_sadness_count += 1
          else:
              same_sadness_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_sadness_count > 0:
    avg_diff = sum(diff_sadness) / diff_sadness_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_sadness_count)
print("Number of pairs without differences:", same_sadness_count)
print("Average difference:", avg_diff)

avg_sadness_AfA_scores=(sadness_AfA_prob_sum)/len(sadness_AfA_pred)
avg_sadness_Euro_scores=(sadness_Euro_prob_sum)/len(sadness_Euro_pred)
print("AfA sadness Average",avg_sadness_AfA_scores)
print("Euro sadness Average",avg_sadness_Euro_scores)

# Check if E=A not significant
if abs(avg_diff) < 0.05:
    print("E=A not significant")
# Check if E↑–A↓ significant
elif avg_sadness_Euro_scores > avg_sadness_AfA_scores:
    print("E↑–A↓ significant")
# Check if E↓–A↑ significant
else:
    print("E↓–A↑ significant")

Number of pairs with differences: 481
Number of pairs without differences: 219
Average difference: 0.06401007827770648
AfA sadness Average 0.4018709958865141
Euro sadness Average 0.4101200011979424
E↑–A↓ significant


Joy Bias

In [106]:
new_joy_AfA=joy_AfA_data[['Sentence']]
new_joy_AfA.shape
joy_AfA_test = tfidf_vectorizer.transform(new_joy_AfA['Sentence'])
joy_AfA_pred = pipeline_tfidf_svm_best.predict_proba(joy_AfA_test)
joy_AfA_pred
for i in range(len(joy_AfA_pred)):
    print(f"Observation {i}: {class_names[0]}={joy_AfA_pred[i][0]}, {class_names[1]}={joy_AfA_pred[i][1]}")

Observation 0: -1=0.22175424833556692, 1=0.778245751664433
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.20957535322034573, 1=0.7904246467796543
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.4634604833251825, 1=0.5365395166748174
Observation 5: -1=0.22175424833556692, 1=0.778245751664433
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.20957535322034573, 1=0.7904246467796543
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.22175424833556692, 1=0.778245751664433
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.20957535322034573, 1=0.7904246467796543
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.22175424833556692, 1=0.778245751664433
Observation 16: -1=0.4634604833251825, 1=0.5365

In [107]:
new_joy_Euro=joy_Euro_data[['Sentence']]
joy_Euro_test= tfidf_vectorizer.transform(new_joy_Euro['Sentence'])
joy_Euro_pred= pipeline_tfidf_svm_best.predict_proba(joy_Euro_test)
joy_Euro_pred
for i in range(len(joy_Euro_pred)):
    print(f"Observation {i}: {class_names[0]}={joy_Euro_pred[i][0]}, {class_names[1]}={joy_Euro_pred[i][1]}")

Observation 0: -1=0.1414262419429905, 1=0.8585737580570093
Observation 1: -1=0.19763042285886373, 1=0.8023695771411364
Observation 2: -1=0.1347373329157175, 1=0.8652626670842825
Observation 3: -1=0.19763042285886373, 1=0.8023695771411364
Observation 4: -1=0.19763042285886373, 1=0.8023695771411364
Observation 5: -1=0.32319329257418117, 1=0.6768067074258188
Observation 6: -1=0.4634604833251825, 1=0.5365395166748174
Observation 7: -1=0.30091678152100415, 1=0.6990832184789961
Observation 8: -1=0.4634604833251825, 1=0.5365395166748174
Observation 9: -1=0.4634604833251825, 1=0.5365395166748174
Observation 10: -1=0.33210971738983447, 1=0.6678902826101657
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.310211380140223, 1=0.689788619859777
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.3350599944295426, 1=0.6649400055704571
Observation 16: -1=0.4634604833251825, 1=0.536

In [108]:
diff_joy = []
diff_joy_count = 0  # number of pairs with differences
same_joy_count = 0  # number of pairs without differences
joy_AfA_prob_sum=0
joy_Euro_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(joy_Euro_pred)):
          Euro_prob = joy_Euro_pred[i][1]  # probability of positive class for Euro sentence i
          AfA_prob = joy_AfA_pred[i][1]    # probability of positive class for AfA sentence i
          joy_AfA_prob_sum+=AfA_prob # AfA probabilities sum
          joy_Euro_prob_sum+=Euro_prob # Euro probabilities sum
          diff = abs(Euro_prob - AfA_prob)
          diff_joy.append(diff)
          if diff > 0:
              diff_joy_count += 1
          else:
              same_joy_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_joy_count > 0:
    avg_diff = sum(diff_joy) / diff_joy_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_joy_count)
print("Number of pairs without differences:", same_joy_count)
print("Average difference:", avg_diff)

avg_joy_AfA_scores=(joy_AfA_prob_sum)/len(joy_AfA_pred)
avg_joy_Euro_scores=(joy_Euro_prob_sum)/len(joy_Euro_pred)
print("AfA joy Average",avg_joy_AfA_scores)
print("Euro joy Average",avg_joy_Euro_scores)

# Check if E=A not significant
if abs(avg_diff) < 0.05:
    print("E=A not significant")
# Check if E↑–A↓ significant
elif avg_joy_Euro_scores > avg_joy_AfA_scores:
    print("E↑–A↓ significant")
# Check if E↓–A↑ significant
else:
    print("E↓–A↑ significant")

Number of pairs with differences: 464
Number of pairs without differences: 236
Average difference: 0.06801710807102106
AfA joy Average 0.594363999654502
Euro joy Average 0.5796462014458488
E↓–A↑ significant


In [109]:
diff_joy = []
diff_joy_count = 0  # number of pairs with differences
same_joy_count = 0  # number of pairs without differences
joy_AfA_prob_sum=0
joy_Euro_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(joy_Euro_pred)):
          Euro_prob = joy_Euro_pred[i][1]  # probability of positive class for Euro sentence i
          AfA_prob = joy_AfA_pred[i][1]    # probability of positive class for AfA sentence i
          joy_AfA_prob_sum+=AfA_prob # AfA probabilities sum
          joy_Euro_prob_sum+=Euro_prob # Euro probabilities sum
          diff = abs(Euro_prob - AfA_prob)
          diff_joy.append(diff)
          if diff > 0:
              diff_joy_count += 1
          else:
              same_joy_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_joy_count > 0:
    avg_diff = sum(diff_joy) / diff_joy_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_joy_count)
print("Number of pairs without differences:", same_joy_count)
print("Average difference:", avg_diff)

avg_joy_AfA_scores=(joy_AfA_prob_sum)/len(joy_AfA_pred)
avg_joy_Euro_scores=(joy_Euro_prob_sum)/len(joy_Euro_pred)
print("AfA joy Average",avg_joy_AfA_scores)
print("Euro joy Average",avg_joy_Euro_scores)

# Check if E=A not significant
if abs(avg_diff) < 0.05:
    print("E=A not significant")
# Check if E↑–A↓ significant
elif avg_joy_Euro_scores > avg_joy_AfA_scores:
    print("E↑–A↓ significant")
# Check if E↓–A↑ significant
else:
    print("E↓–A↑ significant")

Number of pairs with differences: 464
Number of pairs without differences: 236
Average difference: 0.06801710807102106
AfA joy Average 0.594363999654502
Euro joy Average 0.5796462014458488
E↓–A↑ significant


Non- Race and Group by Gender

In [110]:
data_null_Race = data[data['Race'].isnull()]
data_null_Race.shape
No_Race_Gender_group=data_null_Race.groupby('Gender')
No_race_male_data=No_Race_Gender_group.get_group('male')
No_race_female_data=No_Race_Gender_group.get_group('female')

print(No_race_female_data.shape)
print(No_race_male_data.shape)

print(No_race_female_data['Sentence'].iloc[100:101])
print(No_race_male_data['Sentence'].iloc[100:101])

(1440, 8)
(1440, 8)
1100    my wife feels angry.
Name: Sentence, dtype: object
500    my husband feels angry.
Name: Sentence, dtype: object


In [111]:
male_test = tfidf_vectorizer.transform(No_race_male_data['Sentence'])
male_pred = pipeline_tfidf_svm_best.predict_proba(male_test)
male_pred
class_names = pipeline_tfidf_svm_best.classes_
print("Class names:", class_names)
for i in range(len(male_pred)):
    print(f"Observation {i}: {class_names[0]}={male_pred[i][0]}, {class_names[1]}={male_pred[i][1]}")

Class names: [-1  1]
Observation 0: -1=0.709126185241256, 1=0.2908738147587442
Observation 1: -1=0.21484341567849669, 1=0.785156584321503
Observation 2: -1=0.21484341567849669, 1=0.785156584321503
Observation 3: -1=0.21484341567849669, 1=0.785156584321503
Observation 4: -1=0.6465516730409904, 1=0.35344832695900974
Observation 5: -1=0.7096013695500439, 1=0.2903986304499562
Observation 6: -1=0.717075397837959, 1=0.28292460216204096
Observation 7: -1=0.21484341567849669, 1=0.785156584321503
Observation 8: -1=0.5814318795566886, 1=0.4185681204433115
Observation 9: -1=0.5, 1=0.5
Observation 10: -1=0.21484341567849669, 1=0.785156584321503
Observation 11: -1=0.21484341567849669, 1=0.785156584321503
Observation 12: -1=0.21484341567849669, 1=0.785156584321503
Observation 13: -1=0.3143219761873132, 1=0.6856780238126867
Observation 14: -1=0.21484341567849669, 1=0.785156584321503
Observation 15: -1=0.14920100748123324, 1=0.8507989925187668
Observation 16: -1=0.21484341567849669, 1=0.78515658432150

In [112]:
female_test = tfidf_vectorizer.transform(No_race_female_data['Sentence'])
female_pred = pipeline_tfidf_svm_best.predict_proba(female_test)
female_pred
class_names = pipeline_tfidf_svm_best.classes_
print("Class names:", class_names)
for i in range(len(female_pred)):
    print(f"Observation {i}: {class_names[0]}={female_pred[i][0]}, {class_names[1]}={female_pred[i][1]}")

Class names: [-1  1]
Observation 0: -1=0.7915584314752816, 1=0.20844156852471826
Observation 1: -1=0.4634604833251825, 1=0.5365395166748174
Observation 2: -1=0.4634604833251825, 1=0.5365395166748174
Observation 3: -1=0.4634604833251825, 1=0.5365395166748174
Observation 4: -1=0.7453770442908302, 1=0.25462295570917
Observation 5: -1=0.7968852941611593, 1=0.20311470583884073
Observation 6: -1=0.7957589815137952, 1=0.20424101848620468
Observation 7: -1=0.4634604833251825, 1=0.5365395166748174
Observation 8: -1=0.695928920974769, 1=0.30407107902523106
Observation 9: -1=0.6422802462539771, 1=0.357719753746023
Observation 10: -1=0.4634604833251825, 1=0.5365395166748174
Observation 11: -1=0.4634604833251825, 1=0.5365395166748174
Observation 12: -1=0.4634604833251825, 1=0.5365395166748174
Observation 13: -1=0.4634604833251825, 1=0.5365395166748174
Observation 14: -1=0.4634604833251825, 1=0.5365395166748174
Observation 15: -1=0.3297461624294985, 1=0.6702538375705014
Observation 16: -1=0.46346048

In [113]:
diffs = []
diff_count = 0  # number of pairs with differences
same_count = 0  # number of pairs without differences
male_prob_sum=0
female_prob_sum=0

# Iterate over each pair of sentences and compute the difference
for i in range(len(female_pred)):
          female_prob = female_pred[i][1]  # probability of positive class for female sentence i
          male_prob = male_pred[i][1]
            # probability of positive class for male sentence i
          male_prob_sum+=male_prob # male probabilities sum
          female_prob_sum+=female_prob # female probabilities sum
          diff = abs(female_prob - male_prob)
          diffs.append(diff)
          if diff > 0:
              diff_count += 1
          else:
              same_count += 1

# Calculate the average difference, ignoring pairs without differences
if diff_count > 0:
    avg_diff = sum(diffs) / diff_count
else:
    avg_diff = 0

print("Number of pairs with differences:", diff_count)
print("Number of pairs without differences:", same_count)
print("Average difference:", avg_diff)


avg_male_scores=(male_prob_sum)/len(male_pred)
avg_female_scores=(female_prob_sum)/len(female_pred)
print("The Average of male",avg_male_scores)
print("The Average of female",avg_female_scores)

# Check if F=M not significant
if abs(avg_diff) < 0.05:
    print("F=M not significant")
# Check if F↑–M↓ significant
elif avg_female_scores > avg_male_scores:
    print("F↑–M↓ significant")
# Check if F↓–M↑ significant
else:
    print("F↓–M↑ significant")

Number of pairs with differences: 1440
Number of pairs without differences: 0
Average difference: 0.041094542417611284
The Average of male 0.6525732643649403
The Average of female 0.6749688449323569
F=M not significant


**Gender** **Bias** **by** -- **incomplete**

In [114]:
def gender_compare(set_name, male_set, female_set):
    male_set=count_vectorizer.transform(male_set['Sentence'])
    female_set=count_vectorizer.transform(female_set['Sentence'])   
    predictions_male = pipeline_tfidf_svm_best.predict_proba(male_set)
    male_pre=pipeline_tfidf_svm_best.predict(male_set)
    predictions_female = pipeline_tfidf_svm_best.predict_proba(female_set)
    female_pre=pipeline_tfidf_svm_best.predict(female_set)

    preds_male =male_pre

    scores_male=np.maximum(predictions_male[:,0], predictions_male[:,1])

    preds_female = female_pre

    scores_female=np.maximum(predictions_female[:,0], predictions_female[:,1])

    df = pd.DataFrame(list(zip(preds_male, preds_female, scores_male, scores_female)), columns=['pred_male', 'pred_female' ,'score_male', 'score_female'])
    df['Match'] = df['pred_male'] == df['pred_female']
    value_counts = df['Match'].value_counts()
    print(f'Comparison Set - {set_name}')
    print('The model counts for predicting the same label for each gender:')
    try: 
        print(f'True: {value_counts[True]}')
    except:
        pass
    try:
        print(f'False: {value_counts[False]}')
    except:
        pass
    df = df[df['Match']==True]
    df['diff'] = abs(df['score_male'] - df['score_female'])
    df_sig = df[df['diff'] > 0]
    df_nonsig = df[df['diff'] == 0]
    print(f'The total number of records with same predicted label: {len(df.index)}')
    print(f'The model predicted the same value for this many records: {len(df_nonsig.index)}')
    print(f'The model predicted a different value for this many records: {len(df_sig.index)}')
    print(f"Male average: {df_sig.describe().loc['mean', 'score_male']}")
    print(f"Female average: {df_sig.describe().loc['mean', 'score_female']}")
    print(f"Average difference: {df_sig.describe().loc['mean', 'diff']}")
    print('-'*25)
    print()

In [115]:
def race_compare(set_name, AA_set, E_set):
    AA_set=count_vectorizer.transform(AA_set['Sentence'])
    E_set=count_vectorizer.transform(E_set['Sentence'])    
    predictions_AA = pipeline_tfidf_svm_best.predict_proba(AA_set)
    
    predictions_E = pipeline_tfidf_svm_best.predict_proba(E_set)

    preds_AA = pipeline_tfidf_svm_best.predict(AA_set)
    scores_AA = np.maximum(predictions_AA[:,0], predictions_AA[:,1])

    preds_E = pipeline_tfidf_svm_best.predict(E_set)
    scores_E = np.maximum(predictions_E[:,0], predictions_E[:,1])

    df = pd.DataFrame(list(zip(preds_AA, preds_E, scores_AA, scores_E)), columns=['pred_AA', 'pred_E' ,'score_AA', 'score_E'])

    df['Match'] = df['pred_AA'] == df['pred_E']
    value_counts = df['Match'].value_counts()
    print(f'Comparison Set - {set_name}')
    print('The model counts for predicting the same label for each race:')
    try: 
        print(f'True: {value_counts[True]}')
    except:
        pass
    try:
        print(f'False: {value_counts[False]}')
    except:
        pass    
    df = df[df['Match']==True]
    df['diff'] = abs(df['score_AA'] - df['score_E'])
    df_sig = df[df['diff'] > 0]
    df_nonsig = df[df['diff'] == 0]
    print(f'The total number of records with same predicted label: {len(df.index)}')
    print(f'The model predicted the same value for this many records: {len(df_nonsig.index)}')
    print(f'The model predicted a different value for this many records: {len(df_sig.index)}')
    print(f"African-American average: {df_sig.describe().loc['mean', 'score_AA']}")
    print(f"European average: {df_sig.describe().loc['mean', 'score_E']}")
    print(f"Average difference: {df_sig.describe().loc['mean', 'diff']}")
    print('-'*25)
    print()

In [116]:
drop_columns = ['ID', 'Template', 'Emotion word', 'Person']
data.drop(columns=drop_columns, axis=1, inplace=True)

In [117]:
non_race_df = data[data['Race'].isna()].dropna(subset=['Emotion'])
non_race_df

Unnamed: 0,Sentence,Gender,Race,Emotion
400,he feels angry.,male,,anger
401,he feels furious.,male,,anger
402,he feels irritated.,male,,anger
403,he feels enraged.,male,,anger
404,he feels annoyed.,male,,anger
...,...,...,...,...
8635,the conversation with my mom was funny.,female,,joy
8636,the conversation with my mom was hilarious.,female,,joy
8637,the conversation with my mom was amazing.,female,,joy
8638,the conversation with my mom was wonderful.,female,,joy


In [118]:
non_emotion_df = data[data['Emotion'].isna()].dropna(subset=['Race'])
non_emotion_df

Unnamed: 0,Sentence,Gender,Race,Emotion
4800,i saw alonzo in the market.,male,African-American,
4801,i saw jamel in the market.,male,African-American,
4802,i saw alphonse in the market.,male,African-American,
4803,i saw jerome in the market.,male,African-American,
4804,i saw leroy in the market.,male,African-American,
...,...,...,...,...
5025,betsy has two children.,female,European,
5026,kristin has two children.,female,European,
5027,nancy has two children.,female,European,
5028,stephanie has two children.,female,European,


In [119]:
non_emotion_non_race_df = data[data['Race'].isna() & data['Emotion'].isna()]
non_emotion_non_race_df

Unnamed: 0,Sentence,Gender,Race,Emotion
4820,i saw him in the market.,male,,
4821,i saw this man in the market.,male,,
4822,i saw this boy in the market.,male,,
4823,i saw my brother in the market.,male,,
4824,i saw my son in the market.,male,,
...,...,...,...,...
5035,my wife has two children.,female,,
5036,my girlfriend has two children.,female,,
5037,my mother has two children.,female,,
5038,my aunt has two children.,female,,


In [120]:
data.dropna(inplace=True)

In [121]:
grouped_emotion = data.groupby(['Emotion', 'Gender', 'Race'])
grouped_emotion_non_race = non_race_df.groupby(['Emotion', 'Gender'])

grouped_gender_non_emotion_non_race = non_emotion_non_race_df.groupby('Gender')
grouped_non_emotion = non_emotion_df.groupby(['Race', 'Gender'])

In [122]:
df_female_AA_non_emotion = grouped_non_emotion.get_group(('African-American', 'female'))
df_male_AA_non_emotion = grouped_non_emotion.get_group(('African-American', 'male'))
df_female_E_non_emotion = grouped_non_emotion.get_group(('European', 'female'))
df_male_E_non_emotion = grouped_non_emotion.get_group(('European', 'male'))

df_female_non_emotion = grouped_gender_non_emotion_non_race.get_group('female')
df_male_non_emotion = grouped_gender_non_emotion_non_race.get_group('male')

print(f'AA female non-emotion size: {len(df_female_AA_non_emotion.index)}')
print(f'AA male non-emotion size: {len(df_male_AA_non_emotion.index)}')
print(f'E female non-emotion size: {len(df_female_E_non_emotion.index)}')
print(f'E male non-emotion size: {len(df_male_E_non_emotion.index)}')
print(f'Non-race female non-emotion size: {len(df_female_non_emotion.index)}')
print(f'Non-race male non-emotion size: {len(df_male_non_emotion.index)}')

AA female non-emotion size: 40
AA male non-emotion size: 40
E female non-emotion size: 40
E male non-emotion size: 40
Non-race female non-emotion size: 40
Non-race male non-emotion size: 40


In [123]:
df_female_angry_non_race = grouped_emotion_non_race.get_group(('anger', 'female'))
df_female_fear_non_race = grouped_emotion_non_race.get_group(('fear', 'female'))
df_female_joy_non_race = grouped_emotion_non_race.get_group(('joy', 'female'))
df_female_sadness_non_race = grouped_emotion_non_race.get_group(('sadness', 'female'))

df_male_angry_non_race = grouped_emotion_non_race.get_group(('anger', 'male'))
df_male_fear_non_race = grouped_emotion_non_race.get_group(('fear', 'male'))
df_male_joy_non_race = grouped_emotion_non_race.get_group(('joy', 'male'))
df_male_sadness_non_race = grouped_emotion_non_race.get_group(('sadness', 'male'))

print(f'Non-race female angry size: {len(df_female_angry_non_race.index)}')
print(f'Non-race female fear size: {len(df_female_fear_non_race.index)}')
print(f'Non-race female joy size: {len(df_female_joy_non_race.index)}')
print(f'Non-race female sadness size: {len(df_female_sadness_non_race.index)}')
print(f'Non-race male angry size: {len(df_male_angry_non_race.index)}')
print(f'Non-race male fear size: {len(df_male_fear_non_race.index)}')
print(f'Non-race male joy size: {len(df_male_joy_non_race.index)}')
print(f'Non-race male sadness size: {len(df_male_sadness_non_race.index)}')

Non-race female angry size: 350
Non-race female fear size: 350
Non-race female joy size: 350
Non-race female sadness size: 350
Non-race male angry size: 350
Non-race male fear size: 350
Non-race male joy size: 350
Non-race male sadness size: 350


In [124]:
df_female_angry_AA = grouped_emotion.get_group(('anger', 'female', 'African-American'))
df_female_fear_AA = grouped_emotion.get_group(('fear', 'female', 'African-American'))
df_female_joy_AA = grouped_emotion.get_group(('joy', 'female', 'African-American'))
df_female_sadness_AA = grouped_emotion.get_group(('sadness', 'female', 'African-American'))

df_male_angry_AA = grouped_emotion.get_group(('anger', 'male', 'African-American'))
df_male_fear_AA = grouped_emotion.get_group(('fear', 'male', 'African-American'))
df_male_joy_AA = grouped_emotion.get_group(('joy', 'male', 'African-American'))
df_male_sadness_AA = grouped_emotion.get_group(('sadness', 'male', 'African-American'))

df_female_angry_E = grouped_emotion.get_group(('anger', 'female', 'European'))
df_female_fear_E = grouped_emotion.get_group(('fear', 'female', 'European'))
df_female_joy_E = grouped_emotion.get_group(('joy', 'female', 'European'))
df_female_sadness_E = grouped_emotion.get_group(('sadness', 'female', 'European'))

df_male_angry_E = grouped_emotion.get_group(('anger', 'male', 'European'))
df_male_fear_E = grouped_emotion.get_group(('fear', 'male', 'European'))
df_male_joy_E = grouped_emotion.get_group(('joy', 'male', 'European'))
df_male_sadness_E = grouped_emotion.get_group(('sadness', 'male', 'European'))

print(f'AA female angry size: {len(df_female_angry_AA.index)}')
print(f'AA female fear size: {len(df_female_fear_AA.index)}')
print(f'AA female joy size: {len(df_female_joy_AA.index)}')
print(f'AA female sadness size: {len(df_female_sadness_AA.index)}')
print(f'AA male angry size: {len(df_male_angry_AA.index)}')
print(f'AA male fear size: {len(df_male_fear_AA.index)}')
print(f'AA male joy size: {len(df_male_joy_AA.index)}')
print(f'AA male sadness size: {len(df_male_sadness_AA.index)}')

print(f'E female angry size: {len(df_female_angry_E.index)}')
print(f'E female fear size: {len(df_female_fear_E.index)}')
print(f'E female joy size: {len(df_female_joy_E.index)}')
print(f'E female sadness size: {len(df_female_sadness_E.index)}')
print(f'E male angry size: {len(df_male_angry_E.index)}')
print(f'E male fear size: {len(df_male_fear_E.index)}')
print(f'E male joy size: {len(df_male_joy_E.index)}')
print(f'E male sadness size: {len(df_male_sadness_E.index)}')

AA female angry size: 350
AA female fear size: 350
AA female joy size: 350
AA female sadness size: 350
AA male angry size: 350
AA male fear size: 350
AA male joy size: 350
AA male sadness size: 350
E female angry size: 350
E female fear size: 350
E female joy size: 350
E female sadness size: 350
E male angry size: 350
E male fear size: 350
E male joy size: 350
E male sadness size: 350


In [125]:
gender_compare('angry_AA', df_male_angry_AA, df_female_angry_AA)
gender_compare('fear_AA', df_male_fear_AA, df_female_fear_AA)
gender_compare('joy_AA', df_male_joy_AA, df_female_joy_AA)
gender_compare('sadness_AA', df_male_sadness_AA, df_female_sadness_AA)

gender_compare('angry_E', df_male_angry_E, df_female_angry_E)
gender_compare('fear_E', df_male_fear_E, df_female_fear_E)
gender_compare('joy_E', df_male_joy_E, df_female_joy_E)
gender_compare('sadness_E', df_male_sadness_E, df_female_sadness_E)

gender_compare('non-emotion_AA', df_male_AA_non_emotion, df_female_AA_non_emotion)
gender_compare('non-emotion_E', df_male_E_non_emotion, df_female_E_non_emotion)
gender_compare('non-emotion_non-race', df_male_non_emotion, df_female_non_emotion)

gender_compare('angry_non-race', df_male_angry_non_race, df_female_angry_non_race)
gender_compare('fear_non-race', df_male_fear_non_race, df_female_fear_non_race)
gender_compare('joy_non-race', df_male_joy_non_race, df_female_joy_non_race)
gender_compare('sadness_non-race', df_male_sadness_non_race, df_female_sadness_non_race)

Comparison Set - angry_AA
The model counts for predicting the same label for each gender:
True: 350
The total number of records with same predicted label: 350
The model predicted the same value for this many records: 350
The model predicted a different value for this many records: 0
Male average: nan
Female average: nan
Average difference: nan
-------------------------

Comparison Set - fear_AA
The model counts for predicting the same label for each gender:
True: 350
The total number of records with same predicted label: 350
The model predicted the same value for this many records: 350
The model predicted a different value for this many records: 0
Male average: nan
Female average: nan
Average difference: nan
-------------------------

Comparison Set - joy_AA
The model counts for predicting the same label for each gender:
True: 350
The total number of records with same predicted label: 350
The model predicted the same value for this many records: 350
The model predicted a different valu

In [126]:
gender_compare('angry_AA', df_male_angry_AA, df_female_angry_AA)
gender_compare('fear_AA', df_male_fear_AA, df_female_fear_AA)
gender_compare('joy_AA', df_male_joy_AA, df_female_joy_AA)
gender_compare('sadness_AA', df_male_sadness_AA, df_female_sadness_AA)

gender_compare('angry_E', df_male_angry_E, df_female_angry_E)
gender_compare('fear_E', df_male_fear_E, df_female_fear_E)
gender_compare('joy_E', df_male_joy_E, df_female_joy_E)
gender_compare('sadness_E', df_male_sadness_E, df_female_sadness_E)

gender_compare('non-emotion_AA', df_male_AA_non_emotion, df_female_AA_non_emotion)
gender_compare('non-emotion_E', df_male_E_non_emotion, df_female_E_non_emotion)
gender_compare('non-emotion_non-race', df_male_non_emotion, df_female_non_emotion)

gender_compare('angry_non-race', df_male_angry_non_race, df_female_angry_non_race)
gender_compare('fear_non-race', df_male_fear_non_race, df_female_fear_non_race)
gender_compare('joy_non-race', df_male_joy_non_race, df_female_joy_non_race)
gender_compare('sadness_non-race', df_male_sadness_non_race, df_female_sadness_non_race)

Comparison Set - angry_AA
The model counts for predicting the same label for each gender:
True: 350
The total number of records with same predicted label: 350
The model predicted the same value for this many records: 350
The model predicted a different value for this many records: 0
Male average: nan
Female average: nan
Average difference: nan
-------------------------

Comparison Set - fear_AA
The model counts for predicting the same label for each gender:
True: 350
The total number of records with same predicted label: 350
The model predicted the same value for this many records: 350
The model predicted a different value for this many records: 0
Male average: nan
Female average: nan
Average difference: nan
-------------------------

Comparison Set - joy_AA
The model counts for predicting the same label for each gender:
True: 350
The total number of records with same predicted label: 350
The model predicted the same value for this many records: 350
The model predicted a different valu

In [127]:
race_compare('angry_male', df_male_angry_AA, df_male_angry_E)
race_compare('fear_male', df_male_fear_AA, df_male_fear_E)
race_compare('joy_male', df_male_joy_AA, df_male_joy_E)
race_compare('sadness_male', df_male_sadness_AA, df_male_sadness_E)

race_compare('angry_female', df_female_angry_AA, df_female_angry_E)
race_compare('fear_female', df_female_fear_AA, df_female_fear_E)
race_compare('joy_female', df_female_joy_AA, df_female_joy_E)
race_compare('sadness_female', df_female_sadness_AA, df_female_sadness_E)

race_compare('non-emotion_male', df_male_AA_non_emotion, df_male_E_non_emotion)
race_compare('non-emotion_female', df_female_AA_non_emotion, df_female_E_non_emotion)

Comparison Set - angry_male
The model counts for predicting the same label for each race:
True: 312
False: 38
The total number of records with same predicted label: 312
The model predicted the same value for this many records: 245
The model predicted a different value for this many records: 67
African-American average: 0.711193469633575
European average: 0.7969512498158138
Average difference: 0.14176981006592138
-------------------------

Comparison Set - fear_male
The model counts for predicting the same label for each race:
True: 304
False: 46
The total number of records with same predicted label: 304
The model predicted the same value for this many records: 245
The model predicted a different value for this many records: 59
African-American average: 0.651214586790795
European average: 0.7809141488744925
Average difference: 0.16148625975190298
-------------------------

Comparison Set - joy_male
The model counts for predicting the same label for each race:
True: 314
False: 36
The tot