# Cleaning

1. Check for duplicates
2. Sigmoid hate speech score
3. 

# 1. Import data

In [1]:
from datasets import load_dataset
import pandas as pd
import tqdm as notebook_tqdm

# Register tqdm's pandas integration so `progress_apply` is available on pandas objects
notebook_tqdm.tqdm.pandas()

# Fetch the dataset from the Hugging Face Hub
# Dataset card: https://huggingface.co/datasets/ucberkeley-dlab/measuring-hate-speech
DATASET_ID = 'ucberkeley-dlab/measuring-hate-speech'
dataset = load_dataset(DATASET_ID)

# Materialise the 'train' split as a pandas DataFrame
train_split = dataset['train']
df = train_split.to_pandas()

In [2]:
# Preview the first rows to sanity-check that the data loaded as expected
df.head()

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,annotator_religion_hindu,annotator_religion_jewish,annotator_religion_mormon,annotator_religion_muslim,annotator_religion_nothing,annotator_religion_other,annotator_sexuality_bisexual,annotator_sexuality_gay,annotator_sexuality_straight,annotator_sexuality_other
0,47777,10873,3,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
1,39773,2790,2,0.0,0.0,0.0,0.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
2,47101,3379,3,4.0,4.0,4.0,4.0,4.0,4.0,0.0,...,False,False,False,False,True,False,False,False,True,False
3,43625,7365,3,2.0,3.0,2.0,1.0,2.0,0.0,0.0,...,False,False,False,False,False,False,False,False,True,False
4,12538,488,0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,False,False,False,False,False,False,False,False,True,False


# 2. EDA

In [3]:
# Summary statistics (count, mean, std, min, quartiles, max) of the dataset's columns
df.describe()

Unnamed: 0,comment_id,annotator_id,platform,sentiment,respect,insult,humiliate,status,dehumanize,violence,...,hatespeech,hate_speech_score,infitms,outfitms,annotator_severity,std_err,annotator_infitms,annotator_outfitms,hypothesis,annotator_age
count,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,...,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135556.0,135451.0
mean,23530.416138,5567.097812,1.281352,2.954307,2.828875,2.56331,2.278638,2.698575,1.846211,1.052045,...,0.744733,-0.567428,1.034322,1.001052,-0.018817,0.300588,1.007158,1.011841,0.014589,37.910772
std,12387.194125,3230.508937,1.023542,1.231552,1.309548,1.38983,1.370876,0.8985,1.402372,1.345706,...,0.93226,2.380003,0.496867,0.791943,0.487261,0.23638,0.269876,0.675863,0.613006,11.641276
min,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,-8.34,0.1,0.07,-1.82,0.02,0.39,0.28,-1.578693,18.0
25%,18148.0,2719.0,0.0,2.0,2.0,2.0,1.0,2.0,1.0,0.0,...,0.0,-2.33,0.71,0.56,-0.38,0.03,0.81,0.67,-0.341008,29.0
50%,20052.0,5602.5,1.0,3.0,3.0,3.0,3.0,3.0,2.0,0.0,...,0.0,-0.34,0.96,0.83,-0.02,0.34,0.97,0.85,0.110405,35.0
75%,32038.25,8363.0,2.0,4.0,4.0,4.0,3.0,3.0,3.0,2.0,...,2.0,1.41,1.3,1.22,0.35,0.42,1.17,1.13,0.449555,45.0
max,50070.0,11142.0,3.0,4.0,4.0,4.0,4.0,4.0,4.0,4.0,...,2.0,6.3,5.9,9.0,1.36,1.9,2.01,9.0,0.987511,81.0


In [4]:
# Per-column overview (name, non-null count, dtype) built as a DataFrame,
# used instead of df.info(), which is not well suited for large datasets
column_overview = {
    'Column': df.columns,
    'Non-Null Count': df.count().to_numpy(),
    'Dtype': df.dtypes.to_numpy(),
}
summary = pd.DataFrame(column_overview)

summary


Unnamed: 0,Column,Non-Null Count,Dtype
0,comment_id,135556,int32
1,annotator_id,135556,int32
2,platform,135556,int8
3,sentiment,135556,float64
4,respect,135556,float64
...,...,...,...
126,annotator_religion_other,135556,bool
127,annotator_sexuality_bisexual,135556,bool
128,annotator_sexuality_gay,135556,bool
129,annotator_sexuality_straight,135556,bool


In [5]:
# Flag every row whose 'text' occurs more than once
# (keep=False marks all occurrences, not only the later repeats)
is_repeated_text = df.duplicated(subset='text', keep=False)
duplicate_texts = df.loc[is_repeated_text]
print(f"Number of duplicate texts: {len(duplicate_texts)}")
duplicate_texts[['comment_id', 'text']].head()

Number of duplicate texts: 125479


Unnamed: 0,comment_id,text
0,47777,Yes indeed. She sort of reminds me of the elde...
1,39773,The trans women reading this tweet right now i...
2,47101,Question: These 4 broads who criticize America...
3,43625,It is about time for all illegals to go back t...
4,12538,For starters bend over the one in pink and kic...


In [6]:
# For each distinct text, count how many different 'hate_speech_score' values it carries
score_check = df.groupby("text")["hate_speech_score"].nunique()

# Keep only the texts that carry more than one distinct score
inconsistent = score_check.loc[score_check.gt(1)]

# Report how many such texts exist
print(f"Number of 'text' entries with inconsistent scores: {len(inconsistent)}")

# When any exist, show up to ten of the offending rows, sorted by text
if len(inconsistent) > 0:
    offending_rows = df["text"].isin(inconsistent.index)
    print(df[offending_rows].sort_values("text").head(10))

Number of 'text' entries with inconsistent scores: 0


## 2.1 Modifying the dataframe

In [7]:
# Step 1: Keep the identifiers, the score, and every 'target_*' indicator column
bool_cols = [col for col in df.columns if col.startswith('target_')]
cols_to_keep = ['comment_id', 'text', 'hate_speech_score'] + bool_cols
df_relevant = df.loc[:, cols_to_keep].copy()

# Step 2: Cast the 'target_*' booleans to integers so they can be averaged
df_relevant[bool_cols] = df_relevant[bool_cols].astype(int)

# Step 3: Average all remaining columns per ('comment_id', 'text') pair;
# both keys are grouped on so they survive as columns in the final output
per_comment_means = df_relevant.groupby(['comment_id', 'text']).mean()
agg_df = per_comment_means.reset_index()

In [8]:
# Observed range of the per-comment score, used for min-max scaling
min_score = agg_df['hate_speech_score'].min()  # e.g., -8.34
max_score = agg_df['hate_speech_score'].max()  # e.g., +6.3

# Make the cell re-runnable: DataFrame.insert raises ValueError when the column
# already exists, so drop a copy left over from a previous execution first
if 'hate_speech_score_normalized' in agg_df.columns:
    agg_df = agg_df.drop(columns='hate_speech_score_normalized')

# Add normalized column - [0, 1] - directly after the raw score
agg_df.insert(
    agg_df.columns.get_loc('hate_speech_score') + 1,
    'hate_speech_score_normalized',
    (agg_df['hate_speech_score'] - min_score) / (max_score - min_score),
)



In [9]:
# Normalize the thresholds given by the dataset authors onto the [0, 1] scale.
# Per the dataset card (original scale): > 0.5 is approximately hate speech,
# < -1 is counter or supportive speech, and -1 to 0.5 is neutral or ambiguous.
threshold_hate = (0.5 - min_score) / (max_score - min_score)
threshold_counter = (-1 - min_score) / (max_score - min_score)

# Make the cell re-runnable: DataFrame.insert raises ValueError when the column
# already exists, so drop a copy left over from a previous execution first
if 'is_hate_speech' in agg_df.columns:
    agg_df = agg_df.drop(columns='is_hate_speech')

# Add binary column for hate speech (1 when strictly above the hate threshold)
agg_df.insert(
    agg_df.columns.get_loc('hate_speech_score_normalized') + 1,
    'is_hate_speech',
    (agg_df['hate_speech_score_normalized'] > threshold_hate).astype(int),
)

# Print new normalized thresholds
print(f"Threshold for hate speech: {threshold_hate:.3f} -> Equivalent to {0.5} on the original scale")
print(f"Threshold for counter speech: {threshold_counter:.3f} -> Equivalent to {-1} on the original scale")
print(f"Between the two thresholds: {threshold_counter:.3f} and {threshold_hate:.3f} -> Equivalent to {-1} and {0.5} on the original scale which is the neutral or ambiguous region")


Threshold for hate speech: 0.604 -> Equivalent to 0.5 on the original scale
Threshold for counter speech: 0.501 -> Equivalent to -1 on the original scale
Between the two thresholds: 0.604 and 0.501 -> Equivalent to 0.5 and -1 on the original scale which is the unambiguous region


In [10]:
# Inspect the first 40 rows of the aggregated frame (one row per comment_id/text pair)
agg_df.head(40)

Unnamed: 0,comment_id,text,hate_speech_score,hate_speech_score_normalized,is_hate_speech,target_race_asian,target_race_black,target_race_latinx,target_race_middle_eastern,target_race_native_american,...,target_age_other,target_age,target_disability_physical,target_disability_cognitive,target_disability_neurological,target_disability_visually_impaired,target_disability_hearing_impaired,target_disability_unspecific,target_disability_other,target_disability
0,1,White people are trash.,0.46,0.601093,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,"You've caught the big gay, it's known to be ai...",0.03,0.571721,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,"I'm not saying you said that, I'm just saying ...",-1.29,0.481557,0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,Donald Trump. Yeet myself off a building onto ...,-0.24,0.553279,0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,Fabrice Fabrice is ostensibly black or black/l...,-2.84,0.375683,0,0.0,1.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,6,First off you look cool as fuck! Anyway if we ...,1.72,0.687158,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6,7,\*points to posters asking for palestinian rig...,-0.77,0.517077,0,0.0,0.0,0.0,0.333333,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7,8,Guranteed at least one of these guys has raped...,1.24,0.654372,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,10,"They'll come back in your plan, also. Plus we ...",1.35,0.661885,1,0.0,0.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9,11,"eat my fuck, bitch",1.07,0.64276,1,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


# 3. Bag of Words

In [11]:
# Relevant columns for the Bag of Words (BoW) representation: raw text and binary label
cols_to_keep = ['text', 'is_hate_speech']

# Independent copy of the aggregated frame restricted to those columns
bow_df = agg_df[cols_to_keep].copy()

In [12]:
# Class balance of the binary label: absolute counts and percentage shares
labels = bow_df['is_hate_speech']
label_counts = labels.value_counts()
label_percentage = labels.value_counts(normalize=True) * 100

print("Label Counts:", label_counts, sep="\n")
print("\nLabel Percentage Distribution:", label_percentage, sep="\n")

Label Counts:
is_hate_speech
0    29211
1    10354
Name: count, dtype: int64

Label Percentage Distribution:
is_hate_speech
0    73.830406
1    26.169594
Name: proportion, dtype: float64


## 3.1 Data preprocessing

## TODO: Maybe move preprocessing out of the pipeline so it runs only once (for the "Part 2" approach of Assignment 3)

In [13]:
import re
import nltk
import string
from nltk.tokenize import TweetTokenizer
from nltk.corpus import stopwords, wordnet
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer

# Download necessary NLTK resources.
# 'averaged_perceptron_tagger_eng' is the tagger model id that pos_tag() looks up
# in NLTK >= 3.9; the legacy 'averaged_perceptron_tagger' id is kept for older
# releases. nltk.download() skips resources that are already up to date.
NLTK_RESOURCES = [
    'punkt',
    'stopwords',
    'wordnet',
    'averaged_perceptron_tagger',
    'averaged_perceptron_tagger_eng',
]
for resource in NLTK_RESOURCES:
    nltk.download(resource, quiet=True)

# Shared NLP helpers, created once and reused by preprocess()
tknzr = TweetTokenizer()
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

def get_wordnet_pos(tag):
    """
    Map a part-of-speech tag (as produced by `pos_tag`) onto the POS constant
    expected by the WordNet lemmatizer.

    Only the first letter of the tag is inspected: 'J' -> adjective,
    'V' -> verb, 'R' -> adverb. Every other tag, including nouns ('N'),
    maps to noun.
    Args:
        tag (str): The part of speech tag.
    Returns:
        str: The WordNet part of speech tag.
    """
    prefix_to_pos = {
        'J': wordnet.ADJ,
        'V': wordnet.VERB,
        'R': wordnet.ADV,
    }
    # tag[:1] is '' for an empty tag, which falls through to the noun default
    return prefix_to_pos.get(tag[:1], wordnet.NOUN)
    
def is_ascii(token):
    """Return True when every character of `token` has a code point below 128 (7-bit ASCII)."""
    for char in token:
        if ord(char) >= 128:
            return False
    return True

def remove_repeated_chars(token, threshold=3):
    """
    Collapse long runs of one repeated character down to a single occurrence.

    A run is collapsed when a character is immediately followed by at least
    `threshold` more copies of itself (i.e. the run is longer than `threshold`).
    With the default threshold, "loooove" -> "love" while "looove" is unchanged.
    Args:
        token (str): The token to clean.
        threshold (int): Minimum number of extra repeats that triggers collapsing.
    Returns:
        str: The token with long character runs collapsed.
    """
    run_pattern = rf'(.)\1{{{threshold},}}'
    return re.sub(run_pattern, r'\1', token)

def preprocess(text):
    """
    Preprocess the text by tokenizing, removing punctuation, stop words,
    and lemmatizing the words.

    Steps, in order: tokenize with the module-level `tknzr`; drop tokens made
    up entirely of punctuation characters and lowercase the rest; drop stop
    words; collapse long character runs; keep only ASCII tokens of length >= 2;
    POS-tag the remaining tokens and lemmatize each with its tag.
    Args:
        text (str): The text to preprocess.
    Returns:
        str: The preprocessed text (lemmatized tokens joined by single spaces).
    Example:
        >>> preprocess("This is an EXAMPLE sentence!!!.")
        'example sentence'

    """
    punctuation_chars = set(string.punctuation)

    tokens = tknzr.tokenize(text)  # Tokenize the text
    # Remove punctuation-only tokens. Every character is tested individually:
    # `word in string.punctuation` would be a substring test against one long
    # string and would let multi-character tokens such as "..." or "?!" through.
    tokens = [
        word.lower()
        for word in tokens
        if not all(char in punctuation_chars for char in word)
    ]
    tokens = [word for word in tokens if word not in stop_words]  # Remove stop words
    tokens = [remove_repeated_chars(word) for word in tokens]  # Remove repeated characters, ex: "loooove" -> "love"
    tokens = [word for word in tokens if len(word) >= 2 and is_ascii(word)]  # Filter by length and ASCII
    pos_tags = pos_tag(tokens)  # Get part of speech tags
    # Lemmatize the words using the part of speech tags
    lemmatized = [lemmatizer.lemmatize(word, get_wordnet_pos(pos)) for word, pos in pos_tags]
    return " ".join(lemmatized)

In [14]:
from sklearn.base import BaseEstimator, TransformerMixin

class TextPreprocessor(BaseEstimator, TransformerMixin):
    """
    Stateless scikit-learn transformer that applies `preprocess` to every text.

    Accepts a pandas Series or any other 1-D collection of strings (list,
    tuple, NumPy array), so a fitted pipeline can also be called as
    `pipeline.predict(["some text"])`.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """
        Preprocess each text in X.

        Args:
            X: pandas Series or other 1-D collection of raw text strings.
        Returns:
            pandas.Series: The preprocessed texts; keeps the index of X when X
            is already a Series.
        """
        # Plain lists/arrays have no `.apply`, so wrap them in a Series first
        texts = X if isinstance(X, pd.Series) else pd.Series(X)
        # `progress_apply` only exists after `tqdm.pandas()` has been called;
        # fall back to a plain `apply` (same result, no progress bar) otherwise
        apply_fn = getattr(texts, 'progress_apply', texts.apply)
        return apply_fn(preprocess)

In [15]:
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression

# BoW baseline: text preprocessing -> token counts -> class-weighted logistic regression
bow_steps = [
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', CountVectorizer()),
    ('classifier', LogisticRegression(class_weight='balanced')),
]
pipeline_bow = Pipeline(steps=bow_steps)

## 3.2 Train Test Split

In [16]:
from sklearn.model_selection import train_test_split

# Features are the raw texts; the target is the binary hate-speech label
X, y = bow_df['text'], bow_df['is_hate_speech']

# Hold out 20% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)


## 3.3 Fit model to training set - BoW + LogReg

In [17]:
# Fit the whole pipeline (preprocessor, vectorizer, classifier) to the training data
pipeline_bow.fit(X_train, y_train)


100%|██████████| 31652/31652 [00:24<00:00, 1301.89it/s]


In [18]:
from sklearn.metrics import classification_report

# Predict labels for the held-out test texts
y_pred_bow = pipeline_bow.predict(X_test)

# Build the classification report as a nested dict so single metrics can be looked up
report_bow = classification_report(
    y_true=y_test,
    y_pred=y_pred_bow,
    output_dict=True,
)

100%|██████████| 7913/7913 [00:06<00:00, 1209.07it/s]


In [19]:
# Start the list of per-model summary rows with the BoW baseline;
# the metrics of class '1' (hate speech) come from the report dict
hate_metrics_bow = report_bow['1']
results = [{
    'model': 'BoW + LogisticRegression',
    'precision_hate': hate_metrics_bow['precision'],
    'recall_hate': hate_metrics_bow['recall'],
    'f1_hate': hate_metrics_bow['f1-score'],
    'accuracy': report_bow['accuracy'],
}]

## 3.4 Fit model to training set - TF-IDF + LogReg

In [20]:
from sklearn.feature_extraction.text import TfidfVectorizer

# Same pipeline shape as the BoW model, with TF-IDF weighting instead of raw counts
tfidf_steps = [
    ('preprocessor', TextPreprocessor()),
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression(class_weight='balanced')),
]
pipeline_tfidf = Pipeline(steps=tfidf_steps)

# Train on the training split, then predict the held-out test split
pipeline_tfidf.fit(X_train, y_train)
y_pred_tfidf = pipeline_tfidf.predict(X_test)

# Nested-dict report so single metrics can be looked up
report_tfidf = classification_report(
    y_true=y_test,
    y_pred=y_pred_tfidf,
    output_dict=True,
)

100%|██████████| 31652/31652 [00:22<00:00, 1426.24it/s]
100%|██████████| 7913/7913 [00:06<00:00, 1198.19it/s]


In [21]:
# Summary row for the TF-IDF model; metrics of class '1' (hate speech) plus overall accuracy
hate_metrics_tfidf = report_tfidf['1']
tfidf_row = {
    'model': 'TF-IDF + LogisticRegression',
    'precision_hate': hate_metrics_tfidf['precision'],
    'recall_hate': hate_metrics_tfidf['recall'],
    'f1_hate': hate_metrics_tfidf['f1-score'],
    'accuracy': report_tfidf['accuracy'],
}
results.append(tfidf_row)

## 3.4.1 Hyperparameter tuning with GridSearchCV

In [None]:
from tempfile import mkdtemp

from sklearn.model_selection import GridSearchCV

# Cache the fitted transformers on disk. The preprocessor has no tuned parameters,
# so without a cache the expensive text preprocessing of each training fold is
# recomputed for every candidate; with `memory` set, the pipeline reuses the cached
# result whenever the same transformer parameters meet the same training data.
# Remove the directory (e.g. shutil.rmtree(pipeline_cache_dir)) once tuning is done.
pipeline_cache_dir = mkdtemp(prefix='sklearn_pipeline_cache_')

pipeline_tuned = Pipeline(
    [
        ('preprocessor', TextPreprocessor()),
        ('vectorizer', TfidfVectorizer()),  # or CountVectorizer()
        ('classifier', LogisticRegression(class_weight='balanced', max_iter=1000))
    ],
    memory=pipeline_cache_dir,
)

# Searched values; keys use the '<step name>__<parameter>' convention
param_grid = {
    'vectorizer__ngram_range': [(1,1), (1,2)],
    'vectorizer__max_features': [5000, 10000],
    'vectorizer__max_df': [0.75, 0.9],
    'classifier__C': [0.1, 1],
    'classifier__penalty': ['l1', 'l2'],
    'classifier__solver': ['liblinear']  # Needed for 'l1' penalty
}

grid_search = GridSearchCV(
    pipeline_tuned,
    param_grid,
    cv=3,  # 3-fold cross-validation
    scoring='f1',  # use 'f1' to focus on hate speech detection
    verbose=2, # print progress
    n_jobs=1  # run the fits sequentially; set to -1 to use all CPU cores
)

grid_search.fit(X_train, y_train)

Fitting 3 folds for each of 64 candidates, totalling 192 fits


100%|██████████| 21101/21101 [00:15<00:00, 1362.40it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1443.19it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 1); total time=  23.1s


100%|██████████| 21101/21101 [00:15<00:00, 1360.12it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1351.21it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 1); total time=  23.6s


100%|██████████| 21102/21102 [00:14<00:00, 1413.62it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1423.14it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 1); total time=  22.6s


100%|██████████| 21101/21101 [00:14<00:00, 1429.13it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1376.47it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 2); total time=  23.2s


100%|██████████| 21101/21101 [00:14<00:00, 1412.57it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1441.49it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 2); total time=  23.0s


100%|██████████| 21102/21102 [00:14<00:00, 1437.06it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1393.92it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 2); total time=  23.0s


100%|██████████| 21101/21101 [00:14<00:00, 1420.79it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1374.38it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=  23.0s


100%|██████████| 21101/21101 [00:14<00:00, 1472.37it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1481.11it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=  21.8s


100%|██████████| 21102/21102 [00:14<00:00, 1464.58it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1486.24it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=  21.8s


100%|██████████| 21101/21101 [00:14<00:00, 1447.03it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1485.69it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  22.4s


100%|██████████| 21101/21101 [00:15<00:00, 1365.45it/s]
100%|██████████| 10551/10551 [00:08<00:00, 1237.43it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  25.0s


100%|██████████| 21102/21102 [00:14<00:00, 1414.98it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1498.65it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.75, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  22.7s


100%|██████████| 21101/21101 [00:14<00:00, 1482.35it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1382.05it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 1); total time=  22.2s


100%|██████████| 21101/21101 [00:14<00:00, 1412.98it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1384.55it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 1); total time=  22.9s


100%|██████████| 21102/21102 [00:14<00:00, 1442.02it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1450.15it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 1); total time=  22.2s


100%|██████████| 21101/21101 [00:14<00:00, 1433.69it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1425.26it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 2); total time=  22.9s


100%|██████████| 21101/21101 [00:14<00:00, 1440.73it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1436.52it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 2); total time=  22.8s


100%|██████████| 21102/21102 [00:14<00:00, 1416.05it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1413.39it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=5000, vectorizer__ngram_range=(1, 2); total time=  23.1s


100%|██████████| 21101/21101 [00:14<00:00, 1455.32it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1386.99it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=  22.4s


100%|██████████| 21101/21101 [00:14<00:00, 1425.33it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1452.44it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=  22.4s


100%|██████████| 21102/21102 [00:14<00:00, 1430.96it/s]
100%|██████████| 10550/10550 [00:07<00:00, 1443.93it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 1); total time=  22.3s


100%|██████████| 21101/21101 [00:14<00:00, 1440.78it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1446.81it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  22.7s


100%|██████████| 21101/21101 [00:14<00:00, 1433.08it/s]
100%|██████████| 10551/10551 [00:07<00:00, 1401.16it/s]


[CV] END classifier__C=0.01, classifier__penalty=l1, classifier__solver=liblinear, vectorizer__max_df=0.9, vectorizer__max_features=10000, vectorizer__ngram_range=(1, 2); total time=  23.1s


 54%|█████▎    | 11323/21102 [00:07<00:06, 1436.87it/s]


KeyboardInterrupt: 

In [None]:
# Pipeline refitted by the grid search with the best parameter combination
best_model = grid_search.best_estimator_

print("Best params:", grid_search.best_params_)
print("Best cross-validated F1 score:", grid_search.best_score_)

# Use best model to predict test set
y_pred_tuned = best_model.predict(X_test)


tuned_report = classification_report(y_test, y_pred_tuned)
print(tuned_report)

In [24]:
# Assemble the collected per-model metric rows into one comparison table
results_df = pd.DataFrame(results)

results_df

Unnamed: 0,model,precision_hate,recall_hate,f1_hate,accuracy
0,BoW + LogisticRegression,0.55595,0.683624,0.613212,0.775054
1,TF-IDF + LogisticRegression,0.551529,0.707849,0.619987,0.773664
