## Checking top 10 words of each dataset by TF-IDF

In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import os

# Relative CSV paths in the cells below are resolved against the working
# directory; '.' is the current directory, so this call leaves it unchanged.
os.chdir('.')

### CMSB TF-IDF

In [57]:
# Load the CMSB input data.
df = pd.read_csv('cmsb_input.csv')

# Extra tokens to drop in addition to scikit-learn's built-in English list
# (non-English fragments and other stopwords).
custom_stop_words = [
    'sports', 'en', 'way', 'men', 'women', 'girls', 'need', 'good', 'oh',
    'mkr', 'don', 'ha', 've', 'kat', 'want',
    'know', 'just', 'like', 'andre', 'lo', 'wa', 'ner', 'really', 'make',
    'going', 'think', 'right', 'time', 'better', 'look', 'football',
]
english_vectorizer = TfidfVectorizer(stop_words='english')
default_stop_words = set(english_vectorizer.get_stop_words())
combined_stop_words = list(default_stop_words.union(custom_stop_words))

# Fit TF-IDF on the 'prepro' column with the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words=combined_stop_words,
    max_features=1000,
)
tfidf_matrix = vectorizer.fit_transform(df['prepro'])
feature_array = vectorizer.get_feature_names_out()

# Add up every term's TF-IDF weight across all rows: one total per feature.
column_totals = tfidf_matrix.sum(axis=0)
tfidf_sorting = column_totals.A.flatten()

# Order the (total, term) pairs from largest to smallest and keep the top_n.
top_n = 10
ranked_pairs = list(zip(tfidf_sorting, feature_array))
ranked_pairs.sort(reverse=True)
sorted_items = ranked_pairs[:top_n]
sorted_items

[(247.70676603168087, 'man'),
 (246.46270631399614, 'woman'),
 (225.22567043159842, 'people'),
 (133.46188619124007, 'sexist'),
 (105.57341954236507, 'girl'),
 (104.63247660496128, 'hate'),
 (102.347307276787, 'shit'),
 (97.15804242589361, 'work'),
 (89.82687393441205, 'female'),
 (85.78277481822326, 'fuck')]

### SM TF-IDF

In [85]:
# Load the SM (supremacist) input data.
df2 = pd.read_csv('supremacist_input.csv')

# Extra tokens to drop in addition to scikit-learn's built-in English list
# (non-English fragments and other stopwords).
custom_stop_words = [
    'saw', 'area', 'thank', 'll', 'li', 'youtube', 'make', 'need', 'look',
    'got', 'old', 'time', 'school', 'en', 'did', 'way',
    'say', 'nt', 'like', 'just', 'good', 'know', 'want', 'ha', 'wa', 'ner',
    'im', 'ry', 'think', 'new', 'day', 'going', 'years', 'great',
    'io', 'year', 'ery', 'video', 'coury', 'whites', 'blacks', 'home',
    'world', 'thing', 'does', 'maybe', 'er', 'right', 'looking',
    'let', 'ca', 'watch', 'said', 'news', 'post', 'ago', 'long', 'come',
    'times', 'seen', 'yes', 'really', 'little',
]
english_vectorizer = TfidfVectorizer(stop_words='english')
default_stop_words = set(english_vectorizer.get_stop_words())
combined_stop_words = list(default_stop_words.union(custom_stop_words))

# Fit TF-IDF on the 'prepro' column with the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words=combined_stop_words,
    max_features=1000,
)
tfidf_matrix = vectorizer.fit_transform(df2['prepro'])
feature_array = vectorizer.get_feature_names_out()

# Add up every term's TF-IDF weight across all rows: one total per feature.
column_totals = tfidf_matrix.sum(axis=0)
tfidf_sorting = column_totals.A.flatten()

# Order the (total, term) pairs from largest to smallest and keep the top_n.
top_n = 10
ranked_pairs = list(zip(tfidf_sorting, feature_array))
ranked_pairs.sort(reverse=True)
sorted_items = ranked_pairs[:top_n]
sorted_items

[(276.911330931663, 'white'),
 (182.04055215381644, 'people'),
 (123.94169949058865, 'black'),
 (77.87656881316387, 'race'),
 (75.80686009817212, 'kids'),
 (72.24151345874068, 'hope'),
 (67.11286966551256, 'thread'),
 (51.334493669320345, 'jews'),
 (50.929893418493656, 'children'),
 (47.58735624645415, 'man')]

### TRAC TF-IDF

In [95]:
# Load the TRAC input data.
df3 = pd.read_csv('TRAC_input.csv')

# Extra tokens to drop in addition to scikit-learn's built-in English list
# (non-English fragments and other stopwords).
custom_stop_words = [
    'like', 'just', 'kabir', 'singh', 'good', 'know',
    'want', 'bhai', 'lo', 'ha', 'ry', 'right', 'bro', 'watch', 'agree',
    'movies',
    'nice', 'really', 'great', 'best', 'youtube', 'roy', 'think', 'com',
    'www',
]
english_vectorizer = TfidfVectorizer(stop_words='english')
default_stop_words = set(english_vectorizer.get_stop_words())
combined_stop_words = list(default_stop_words.union(custom_stop_words))

# Fit TF-IDF on the 'prepro' column with the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words=combined_stop_words,
    max_features=1000,
)
tfidf_matrix = vectorizer.fit_transform(df3['prepro'])
feature_array = vectorizer.get_feature_names_out()

# Add up every term's TF-IDF weight across all rows: one total per feature.
column_totals = tfidf_matrix.sum(axis=0)
tfidf_sorting = column_totals.A.flatten()

# Order the (total, term) pairs from largest to smallest and keep the top_n.
top_n = 10
ranked_pairs = list(zip(tfidf_sorting, feature_array))
ranked_pairs.sort(reverse=True)
sorted_items = ranked_pairs[:top_n]
sorted_items

[(147.55652516885982, 'movie'),
 (103.31849472958109, 'review'),
 (98.42659136286072, 'video'),
 (74.06823535550538, 'people'),
 (73.36721477200646, 'sir'),
 (64.13674722235349, 'india'),
 (49.99240087412689, 'bollywood'),
 (49.7807585078537, 'man'),
 (40.086748150438744, 'true'),
 (39.46001219084956, 'film')]

### HSOL TF-IDF

In [60]:
# Load the HSOL input data (Davidson2017.csv).
df4 = pd.read_csv('Davidson2017.csv')

# Extra tokens to drop in addition to scikit-learn's built-in English list
# (non-English fragments and other stopwords).
custom_stop_words = [
    'yall', 'like', 'ai', 'im', 'know', 'got', 'amp', 'ca',
    'bitches', 'hoes', 'just', 'lol', 'niggas',
]
english_vectorizer = TfidfVectorizer(stop_words='english')
default_stop_words = set(english_vectorizer.get_stop_words())
combined_stop_words = list(default_stop_words.union(custom_stop_words))

# Fit TF-IDF on the 'prepro' column with the vocabulary capped at 1000 terms.
vectorizer = TfidfVectorizer(
    stop_words=combined_stop_words,
    max_features=1000,
)
tfidf_matrix = vectorizer.fit_transform(df4['prepro'])
feature_array = vectorizer.get_feature_names_out()

# Add up every term's TF-IDF weight across all rows: one total per feature.
column_totals = tfidf_matrix.sum(axis=0)
tfidf_sorting = column_totals.A.flatten()

# Order the (total, term) pairs from largest to smallest and keep the top_n.
top_n = 10
ranked_pairs = list(zip(tfidf_sorting, feature_array))
ranked_pairs.sort(reverse=True)
sorted_items = ranked_pairs[:top_n]
sorted_items

[(1807.5605399742553, 'bitch'),
 (644.6584876026697, 'pussy'),
 (630.4868675544802, 'hoe'),
 (457.78449449406384, 'ass'),
 (457.123966574544, 'fuck'),
 (411.2587828142525, 'trash'),
 (399.3763762060491, 'shit'),
 (381.98449776784094, 'nigga'),
 (281.6739648947166, 'love'),
 (236.47528157964675, 'bad')]