First, we need to merge all files into one, which will later be referred to as 'merged_data'.

In [1]:
from google.colab import drive
drive.mount('/content/drive/', force_remount=True)
%cd drive/My Drive/MS thesis/Data/keyBERTproject/

Mounted at /content/drive/
/content/drive/My Drive/MS thesis/Data/keyBERTproject


In [3]:
import pandas as pd
import glob

# List all CSV files in a directory
csv_files = glob.glob("/content/drive/My Drive/MS thesis/Data/keyBERTproject/subreddits/*.csv")

# Initialize an empty list to store DataFrames
data_frames = []

# Loop through each CSV file and read its data into a list of DataFrames
for csv_file in csv_files:
    df = pd.read_csv(csv_file)
    data_frames.append(df)

# Concatenate the list of DataFrames into a single DataFrame
merged_data = pd.concat(data_frames, ignore_index=True)

# Specify the path and filename for the merged CSV file
output_file = "/content/drive/My Drive/MS thesis/Data/keyBERTproject/subreddits/merged_data.csv"

# Save the merged data to a new CSV file
merged_data.to_csv(output_file, index=False)

print(f"Merged data saved to {output_file}")


Merged data saved to /content/drive/My Drive/MS thesis/Data/keyBERTproject/subreddits/merged_data.csv


In [4]:
print("The merged data file has number of rows: ", merged_data.shape[0])

The merged data file has number of rows:  97345


Next, we apply the initial filtering using the 45 key words. The new file "initially_filtered_dataset.csv" will have a new column "Keyword Present" with binary values True/False.
We save submissions that contain keywords into a separate file "initially_filtered_gender_dataset.csv".

In [5]:
%cd drive/My Drive/MS thesis/Data/keyBERTproject/subreddits/

# read the csv file
df = pd.read_csv("/content/drive/My Drive/MS thesis/Data/keyBERTproject/subreddits/merged_data.csv")

# List of keywords to filter by
keywords = ['sexism' , 'gender bias', 'gender discrimination' , 'sexual discrimination' , 'male chauvinism' , 'antifeminism',
            'favouritism' , 'discrimination' , 'gender disparity' , 'gender difference' , 'gender inequality' , 'gender inequity' ,
            'gender imbalance' , 'gender' , 'feminism' , 'patriarchy' , 'misogyny', 'misandry' , 'lgbtq' , 'egalitarianism' , 'masculine' ,
            'manly' , 'manful' , 'mannish' , 'manlike' , 'Womanly', 'womanlike', 'womanish', 'femalelike', 'unfeminine', 'paternal' ,
            'maternal' , 'lgb' , 'lgbt', 'transgender' , 'gay' , 'lesbian' , 'bisexual' , 'homosexual' , 'genderfluid', 'no-binary' ,
            'nonbinary' , 'non-binary', 'intersex' , 'agender']

# Custom function to check if any word contains any keyword
def contains_keyword(text, keywords):
    words = text.lower().split()  # Split text into words and make them lowercase
    for word in words:
        if word in keywords:
            return True
    return False

# Adding a new column 'Keyword Present' with True if any word contains any keyword, else False
df['Keyword Present'] = df.apply(lambda row: contains_keyword(str(row['Title']) + ' ' + str(row['Post_body']), keywords), axis=1)

# Specify the path for the output CSV file
output_csv_path = "output_data.csv"  # Replace with the desired output file path

# Save the updated DataFrame to a new CSV file
df.to_csv("initially_filtered_dataset.csv", index=False)


# Printing the updated DataFrame
print(df)

# print rows where 'Keyword Present' is True
print(df[df['Keyword Present'] == True])

[Errno 2] No such file or directory: 'drive/My Drive/MS thesis/Data/keyBERTproject/subreddits/'
/content/drive/My Drive/MS thesis/Data/keyBERTproject
      Subreddit_name   Post_date   Created_UTC  Post_ID  \
0           temu_ads  2023-10-30  1698696837.0  17k2hh3   
1           temu_ads  2023-10-27  1698399240.0  17hjqjv   
2           temu_ads  2023-10-26  1698336239.0  17gzuse   
3           temu_ads  2023-10-24  1698160015.0  17fezf3   
4           temu_ads  2023-10-24  1698153833.0  17fcqh1   
...              ...         ...           ...      ...   
97340      AndroidTV  2023-07-18  1689638359.0  152icoi   
97341      AndroidTV  2023-07-18  1689632644.0  152g1c9   
97342      AndroidTV  2023-07-18  1689629230.0  152el47   
97343      AndroidTV  2023-07-18  1689628343.0  152e78n   
97344      AndroidTV  2023-07-17  1689624412.0  152cg17   

                                                   Title  \
0      💵 Get cash up to $200.00 today!💰Click and acce...   
1                    

In [6]:
#print column names of the updated DataFrame
print(df.columns)

Index(['Subreddit_name', 'Post_date', 'Created_UTC', 'Post_ID', 'Title',
       'Post_body', 'Author', 'Score', 'Upvote_ratio', 'Num_of_comments',
       'NSFW', 'URL', 'Keyword Present'],
      dtype='object')


In [7]:
# print number of rows where 'Keyword Present' is True
print(df[df['Keyword Present'] == True].shape[0])

142


In [8]:
# save the rows where 'Keyword Present' is True to a new CSV file
df[df['Keyword Present'] == True].to_csv("initially_filtered_gender_dataset.csv", index=False)

Since the first round of filtering gave a very low number of matching words, we will apply some pre-processing steps to improve the filtering results.

In [9]:
# first, we will apply pyspellchecker to the text in the merged_data.csv file
!pip install pyspellchecker --quiet

[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m6.8/6.8 MB[0m [31m19.1 MB/s[0m eta [36m0:00:00[0m
[?25h

In [None]:
from spellchecker import SpellChecker

# Initialize the SpellChecker
spell = SpellChecker()

# Function to correct the spelling in a given column of the data frame
def correct_spelling(text):
    if isinstance(text, str):  # Check if the value is a string
        corrected_text = []
        misspelled_words = spell.unknown(text.split())  # Find misspelled words in text
        for word in text.split():
            if word in misspelled_words:
                correction = spell.correction(word)
                if correction:
                    corrected_text.append(correction)  # Correct misspelled words
                else:
                    corrected_text.append(word)  # Append the original word if no correction is found
            else:
                corrected_text.append(word)  # Append correctly spelled words
        return " ".join(corrected_text)
    else:
        return text  # Return non-string values as-is
# read the csv file
df_spellcheck = pd.read_csv("/content/drive/My Drive/MS thesis/Data/keyBERTproject/merged_data.csv")

# Apply the spell checker to the 'Post body' column of the DataFrame df_spellcheck
df_spellcheck['Post_body'] = df_spellcheck['Post_body'].apply(correct_spelling)

# Save the updated DataFrame to a new CSV file
df_spellcheck.to_csv("/content/drive/My Drive/MS thesis/Data/keyBERTproject/merged_data_spellcheck.csv", index=False)

In [None]:
df_spellcheck.head()

Unnamed: 0,Post date,Created UTC,Post ID,Title,Post body,Author,Score,Upvote ratio,Num of comments,NSFW,URL
0,2023-08-29,1693336000.0,164r0hx,Looks like we’re getting the edit button next 👀,,Eddygraphic,4,1.0,1,False,https://i.redd.it/qf7qs0ayj3lb1.jpg
1,2023-08-29,1693268000.0,1642t4e,How does thread choose what to show on your FYP?,My husband and I just started using threads mo...,No-Tumbleweed1873,1,1.0,1,False,https://www.reddit.com/r/ThreadsApp/comments/1...
2,2023-08-29,1693331000.0,164p7gl,Viewing links in reader mode,Threads app displays some site in reader view ...,fermentationfan,1,1.0,1,False,https://www.reddit.com/r/ThreadsApp/comments/1...
3,2023-08-29,1693319000.0,164jw8s,Is this new? Arrow button in lower right of vi...,,Jimbuub,8,1.0,3,False,https://i.redd.it/tp8brg0x62lb1.jpg
4,2023-08-29,1693282000.0,16483kj,Is there an easy way to schedule posts?,Trying to remain active but honestly life is g...,dnaboe,1,0.6,3,False,https://www.reddit.com/r/ThreadsApp/comments/1...


Now, we will apply the same filtering on the corrected data frame.

In [None]:
#since I had to reset the runtime, I had to load dataset to a dataframe again
import pandas as pd
df_spellcheck = pd.read_csv("/content/drive/My Drive/MS thesis/Data/keyBERTproject/merged_data_spellcheck.csv")
#drop rows with NaN values
df_spellcheck = df_spellcheck.dropna()

# List of keywords to filter by
keywords = ['sexism' , 'gender bias', 'gender discrimination' , 'sexual discrimination' , 'male chauvinism' , 'antifeminism',
            'favouritism' , 'discrimination' , 'gender disparity' , 'gender difference' , 'gender inequality' , 'gender inequity' ,
            'gender imbalance' , 'gender' , 'feminism' , 'patriarchy' , 'misogyny', 'misandry' , 'lgbtq' , 'egalitarianism' , 'masculine' ,
            'manly' , 'manful' , 'mannish' , 'manlike' , 'Womanly', 'womanlike', 'womanish', 'femalelike', 'unfeminine', 'paternal' ,
            'maternal' , 'lgb' , 'lgbt', 'transgender' , 'gay' , 'lesbian' , 'bisexual' , 'homosexual' , 'genderfluid', 'no-binary' ,
            'nonbinary' , 'non-binary', 'intersex' , 'agender']

# Custom function to check if any word contains any keyword
def contains_keyword(text, keywords):
    words = text.lower().split()  # Split text into words and make them lowercase
    for word in words:
        if word in keywords:
            return True
    return False

# Adding a new column 'Keyword Present' with True if any word contains any keyword, else False
df_spellcheck['Keyword Present'] = df_spellcheck.apply(lambda row: contains_keyword(row['Title'] + ' ' + row['Post body'], keywords), axis=1)

# Save the updated DataFrame to a new CSV file
df_spellcheck.to_csv("initially_filtered_dataset.csv", index=False)


# Printing the updated DataFrame
print(df_spellcheck)

# print rows where 'Keyword Present' is True
print(df_spellcheck[df_spellcheck['Keyword Present'] == True])

        Post date   Created UTC  Post ID  \
1      2023-08-29  1.693268e+09  1642t4e   
2      2023-08-29  1.693331e+09  164p7gl   
4      2023-08-29  1.693282e+09  16483kj   
5      2023-08-29  1.693274e+09  164537g   
6      2023-08-29  1.693268e+09  1642wto   
...           ...           ...      ...   
15423  2022-12-12  1.670851e+09   zjyy2d   
15424  2022-12-11  1.670792e+09   zj8c0o   
15425  2022-12-11  1.670791e+09   zj7yx4   
15426  2022-12-11  1.670782e+09   zj20ds   
15427  2022-12-11  1.670779e+09   zj0jje   

                                                   Title  \
1       How does thread choose what to show on your FYP?   
2                           Viewing links in reader mode   
4                Is there an easy way to schedule posts?   
5                                   It has been too long   
6      logged into threads.net on desktop. Tells me I...   
...                                                  ...   
15423  Multiple Documents\Documents\Documents folde

In [None]:
# print number of rows where 'Keyword Present' is True
print(df_spellcheck[df_spellcheck['Keyword Present'] == True].shape[0])

31


In [None]:
%cd drive/My Drive/MS thesis/Data/keyBERTproject/

[Errno 2] No such file or directory: 'drive/My Drive/MS thesis/Data/keyBERTproject/'
/content


In [None]:
# save the rows where 'Keyword Present' is True to a new CSV file
df_spellcheck[df_spellcheck['Keyword Present'] == True].to_csv("initially_filtered_spellchecked_gender_dataset.csv", index=False)

Applying pyspellcheck insignificantly improved matching: from 26 to 31.

Next, we will try to apply text correction from nltk library.

In [None]:
import pandas as pd
import nltk
from nltk.corpus import words
from nltk.metrics import edit_distance

# Download the NLTK words dataset, punkt
nltk.download('punkt')
nltk.download('words')

# Read the csv file
df_nltk = pd.read_csv("/content/drive/My Drive/MS thesis/Data/keyBERTproject/merged_data.csv")

# Function to perform spell checking using NLTK
def nltk_spell_check(text):
    english_words = set(words.words())
    tokens = nltk.word_tokenize(text)
    corrected_tokens = [min(english_words, key=lambda word: edit_distance(word, token)) if token.lower() not in english_words else token for token in tokens]
    corrected_text = ' '.join(corrected_tokens)
    return corrected_text

# Apply spell checking to both columns
df_nltk[['Title', 'Post body']].applymap(nltk_spell_check)

# Display the DataFrame with the corrected columns
print(df_nltk)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Package words is already up-to-date!


Next, we apply KeyBERT pre-trained model on the initially filtered dataset to extract even more key words.
We start by applying it on the EXIST dataset.

In [None]:
#installing sentence-transformers
!pip install -U sentence-transformers --quiet

!pip install keybert --q

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/86.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━[0m [32m41.0/86.0 kB[0m [31m1.3 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m86.0/86.0 kB[0m [31m1.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.3/1.3 MB[0m [31m22.4 MB/s[0m eta [36m0:00:00[0m
[?25h  Building wheel for sentence-transformers (setup.py) ... [?25l[?25hdone
  Preparing metadata (setup.py) ... [?25l[?25hdone
  Building wheel for keybert (setup.py) ... [?25l[?25hdone


In [None]:
from keybert import KeyBERT
import pandas as pd
from sentence_transformers import SentenceTransformer
from nltk.tokenize import line_tokenize

sentence_model = SentenceTransformer('all-mpnet-base-v2')
kw_model = KeyBERT(model=sentence_model)

.gitattributes:   0%|          | 0.00/1.18k [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/10.6k [00:00<?, ?B/s]

config.json:   0%|          | 0.00/571 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

data_config.json:   0%|          | 0.00/39.3k [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/438M [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/239 [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/363 [00:00<?, ?B/s]

train_script.py:   0%|          | 0.00/13.1k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

In [None]:
%cd drive/My Drive/MS thesis/Data/keyBERTproject/EXIST_2021_Dataset/
test_path = "/content/drive/My Drive/MS thesis/Data/keyBERTproject/EXIST_2021_Dataset/EXIST2021_test_labeled.tsv"
test_data = pd.read_csv(test_path, sep='\t')

train_path = "/content/drive/My Drive/MS thesis/Data/keyBERTproject/EXIST_2021_Dataset/EXIST2021_training.tsv"
train_data = pd.read_csv(train_path, sep='\t')

# print(test_data)
# print(train_data)

sexist_data2 = []
for index, row in test_data.iterrows():
    # print(row['task1'])
    if row['task1'] == 'sexist' and row['language'] == 'en':
        sexist_data2.append(row['text'])
# print(len(sexist_data))

for index, row in train_data.iterrows():
    if row['task1'] == 'sexist' and row['language'] == 'en':
        sexist_data2.append(row['text'])

print(len(sexist_data2))

nslice = 5
cslice2 = len(sexist_data2)//nslice
print(f"Number of each slice is: {cslice2}")
split_lists2 = [sexist_data2[x:x+cslice2] for x in range(0, len(sexist_data2), cslice2)]


slice_candidreview2 = []
for slice in split_lists2:
    slice_candidreview2.append(" ".join(slice))

print(len(slice_candidreview2))

[Errno 2] No such file or directory: 'drive/My Drive/MS thesis/Data/keyBERTproject/EXIST_2021_Dataset/'
/content/drive/My Drive/MS thesis/Data/keyBERTproject
2794
Number of each slice is: 558
6


In [None]:
seed_keywords = ["sexism" , "gender bias", "gender discrimination" , "sexual discrimination" , "male chauvinism" , "antifeminism",
 "favouritism" , "discrimination" , "gender disparity" , "gender difference" , "gender inequality" , "gender inequity" , "gender imbalance" ,
  "gender" , "feminism" , "patriarchy" , "misogyny", "misandry" , "lgbtq" , "egalitarianism" , "masculine" , "manly" ,
   "manful" , "mannish" , "manlike" , "Womanly", "womanlike", "womanish", "femalelike", "unfeminine", "paternal" ,
    "maternal" , "lgb" , "lgbt", "transgender" , "gay" , "lesbian" , "bisexual" , "homosexual" , "genderfluid",
     "no-binary" , "nonbinary" , "non-binary", "intersex" , "agender", "girl", "boy", "men", "man", "woman", "women", "male", "female"]


candidate_keywords2 = []
for slice_doc in slice_candidreview2:
     rel_keywords2 = kw_model.extract_keywords(docs=slice_doc, keyphrase_ngram_range=(3,3), top_n=50, use_mmr=True, diversity=0.1, seed_keywords = seed_keywords, stop_words=None)
     candidate_keywords2.append(rel_keywords2)
     print(rel_keywords2)

[('why feminism keeps', 0.6194), ('man free feminist', 0.615), ('to feminism njabulodhlamin3', 0.6132), ('instincts of patriarchy', 0.5973), ('called patriarchy we', 0.5951), ('fight against misogyny', 0.5926), ('comments about feminism', 0.5895), ('weaponized misogyny in', 0.5892), ('feminism in any', 0.5891), ('bad mouthing feminism', 0.5844), ('so called patriarchy', 0.5788), ('women fault for', 0.5772), ('fight for feminism', 0.577), ('compel speech patriarchy', 0.5768), ('patriarchy feminists publichealth', 0.576), ('feminism the fight', 0.5736), ('same patriarchy you', 0.5721), ('all feminism demand', 0.5718), ('status 938174640370962434 misogyny', 0.5711), ('patriarchy have you', 0.5697), ('should behave woman', 0.5689), ('feminists marginalizing women', 0.5672), ('slurring feminists with', 0.5669), ('women against feminism', 0.5664), ('be feminist they', 0.5663), ('humiliation respect woman', 0.5657), ('is feminist the', 0.5652), ('mouthing feminism and', 0.5649), ('patriarchy 

In [None]:
import numpy as np
keywords_exist_2 = np.array(candidate_keywords2)
keywords_exist_2_list = []
for i in range(keywords_exist_2.shape[0]):
  for j in range(keywords_exist_2.shape[1]):
    #print(keywords_exist_2[i, j, 0])
    keywords_exist_2_list.append(str(keywords_exist_2[i, j, 0]))

In [None]:
len(keywords_exist_2_list)

300

Apply KeyBERT to sexism dataset.

In [None]:
%cd drive/My Drive/MS thesis/Data/keyBERTproject/Sexism_data
input_path = "/content/drive/My Drive/MS thesis/Data/keyBERTproject/Sexism_data/sexism_data.csv"
input_data = pd.read_csv(input_path)

# print(input_data)
sexist_data3 = []
for index, row in input_data.iterrows():
    # temp = row['sexist']
    # print(type(temp))
    if row['sexist'] == True:

        sexist_data3.append(row['text'])

print(len(sexist_data3))

nslice = 3
cslice3 = len(sexist_data3)//nslice
print(f"Number of each slice is: {cslice3}")
split_lists3 = [sexist_data3[x:x+cslice3] for x in range(0, len(sexist_data3), cslice3)]


slice_candidreview3 = []
for slice in split_lists3:
    slice_candidreview3.append(" ".join(slice))

print(len(slice_candidreview3))

[Errno 2] No such file or directory: 'drive/My Drive/MS thesis/Data/keyBERTproject/Sexism_data'
/content/drive/My Drive/MS thesis/Data/keyBERTproject
1809
Number of each slice is: 603
3


In [None]:
seed_keywords = ["sexism" , "gender bias", "gender discrimination" , "sexual discrimination" , "male chauvinism" , "antifeminism",
 "favouritism" , "discrimination" , "gender disparity" , "gender difference" , "gender inequality" , "gender inequity" , "gender imbalance" ,
  "gender" , "feminism" , "patriarchy" , "misogyny", "misandry" , "lgbtq" , "egalitarianism" , "masculine" , "manly" ,
   "manful" , "mannish" , "manlike" , "Womanly", "womanlike", "womanish", "femalelike", "unfeminine", "paternal" ,
    "maternal" , "lgb" , "lgbt", "transgender" , "gay" , "lesbian" , "bisexual" , "homosexual" , "genderfluid",
     "no-binary" , "nonbinary" , "non-binary", "intersex" , "agender", "girl", "boy", "men", "man", "woman", "women", "male", "female"]


candidate_keywords3 = []
for slice_doc in slice_candidreview3:
     rel_keywords3 = kw_model.extract_keywords(docs=slice_doc, keyphrase_ngram_range=(3,3), top_n=50, use_mmr=True, diversity=0.1, seed_keywords = seed_keywords, stop_words=None)
     candidate_keywords3.append(rel_keywords3)
     print(rel_keywords3)

[('games not sexist', 0.7322), ('mention1598 not sexist', 0.6716), ('sexist but females', 0.6667), ('mention1679 not sexist', 0.6643), ('men not sexist', 0.6599), ('mention3161 not sexist', 0.6554), ('mention2547 not sexist', 0.6549), ('sexist but women', 0.654), ('sexist but female', 0.6531), ('mention487 not sexist', 0.6529), ('mention997 not sexist', 0.6529), ('sexist but nothing', 0.6501), ('mention3247 not sexist', 0.6497), ('am not sexist', 0.6493), ('mention2554 not sexist', 0.6483), ('sports lesbians sexistiknow', 0.6481), ('mention2632 not sexist', 0.6463), ('mention1473 not sexist', 0.646), ('mention4731 not sexist', 0.6459), ('sexist am not', 0.6446), ('sexist but woman', 0.644), ('mention4472 not sexist', 0.644), ('mention420 not sexist', 0.6412), ('sexist rt mention1860', 0.6397), ('mention1886 not sexist', 0.6372), ('sexist but some', 0.636), ('mention3316 not sexist', 0.6341), ('mention1955 not sexist', 0.6339), ('sexist but men', 0.6328), ('mention3450 not sexist', 0.63

Apply KeyBERT on our unlabeled data

In [None]:
%cd drive/My Drive/MS thesis/Data/keyBERTproject
input_path = "/content/drive/My Drive/MS thesis/Data/keyBERTproject/merged_data.csv"
input_data = pd.read_csv(input_path)

# print(input_data)
merged_data = []
for index, row in input_data.iterrows():
        merged_data.append(str(row['Title']) + ' ' + str(row['Post_body']))

print(len(merged_data))

nslice = 3
cslice3 = len(merged_data)//nslice
print(f"Number of each slice is: {cslice3}")
split_lists3 = [merged_data[x:x+cslice3] for x in range(0, len(merged_data), cslice3)]


slice_candidreview4 = []
for slice in split_lists3:
    slice_candidreview4.append(" ".join(slice))

print(len(slice_candidreview4))

[Errno 2] No such file or directory: 'drive/My Drive/MS thesis/Data/keyBERTproject'
/content/drive/My Drive/MS thesis/Data/keyBERTproject
97345
Number of each slice is: 32448
4


In [None]:
seed_keywords = ["sexism" , "gender bias", "gender discrimination" , "sexual discrimination" , "male chauvinism" , "antifeminism",
 "favouritism" , "discrimination" , "gender disparity" , "gender difference" , "gender inequality" , "gender inequity" , "gender imbalance" ,
  "gender" , "feminism" , "patriarchy" , "misogyny", "misandry" , "lgbtq" , "egalitarianism" , "masculine" , "manly" ,
   "manful" , "mannish" , "manlike" , "Womanly", "womanlike", "womanish", "femalelike", "unfeminine", "paternal" ,
    "maternal" , "lgb" , "lgbt", "transgender" , "gay" , "lesbian" , "bisexual" , "homosexual" , "genderfluid",
     "no-binary" , "nonbinary" , "non-binary", "intersex" , "agender", "girl", "boy", "men", "man", "woman", "women", "male", "female"]


candidate_keywords3 = []
for slice_doc in slice_candidreview4:
     rel_keywords3 = kw_model.extract_keywords(docs=slice_doc, keyphrase_ngram_range=(1,1), top_n=50, use_mmr=True, diversity=0.1, seed_keywords = seed_keywords, stop_words=None)
     candidate_keywords3.append(rel_keywords3)
     print(rel_keywords3)

In [None]:
pip install tensorrt

Collecting tensorrt
  Downloading tensorrt-8.6.1.post1.tar.gz (18 kB)
  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: tensorrt
  Building wheel for tensorrt (setup.py) ... [?25l[?25hdone
  Created wheel for tensorrt: filename=tensorrt-8.6.1.post1-py2.py3-none-any.whl size=17283 sha256=acf2f9b449f06d52f0b44a53a75cfa8a9b5746abd3ed14bc9a1f76df8da5b71b
  Stored in directory: /root/.cache/pip/wheels/f4/c8/0e/b79b08e45752491b9acfdbd69e8a609e8b2ed7640dda5a3e59
Successfully built tensorrt
Installing collected packages: tensorrt
Successfully installed tensorrt-8.6.1.post1
