In [1]:
import pandas as pd
import numpy as np
import re
import seaborn as sns
import matplotlib.pyplot as plt
from wordcloud import WordCloud

In [2]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import classification_report, accuracy_score
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, Conv1D, GlobalMaxPooling1D, Dense, Dropout

In [3]:
import plotly.graph_objects as go

In [4]:
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Unzipping corpora/stopwords.zip.
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [5]:
from tqdm import tqdm

In [6]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [7]:
folder = '/content/drive/MyDrive/Colab Notebooks/master_course/cs6120-nlp/final_project'
df1 = pd.read_csv(f"{folder}/binary/train.csv")
df2 = pd.read_csv(f"{folder}/binary/test.csv")
df3 = pd.read_csv(f"{folder}/binary/test_labels.csv")
assert (df2['id'] == df3['id']).all(), 'test.csv row id sequence is the same as test_labels.csv'

In [8]:
test = pd.concat([df2, df3], axis=1)
test.drop(['id', 'severe_toxic'],inplace=True, axis=1)
test = test[test['toxic'] != -1]

In [9]:
df1.drop(['id', 'severe_toxic'], inplace=True, axis = 1)
df = pd.concat([df1, test], axis=0)
df.reset_index(drop=True, inplace=True)
df['total'] = df['toxic'] + df['obscene'] + df['threat'] + df['insult'] + df['identity_hate']
df

Unnamed: 0,comment_text,toxic,obscene,threat,insult,identity_hate,total
0,Explanation\nWhy the edits made under my usern...,0,0,0,0,0,0
1,D'aww! He matches this background colour I'm s...,0,0,0,0,0,0
2,"Hey man, I'm really not trying to edit war. It...",0,0,0,0,0,0
3,"""\nMore\nI can't make any real suggestions on ...",0,0,0,0,0,0
4,"You, sir, are my hero. Any chance you remember...",0,0,0,0,0,0
...,...,...,...,...,...,...,...
223544,":Jerome, I see you never got around to this…! ...",0,0,0,0,0,0
223545,==Lucky bastard== \n http://wikimediafoundatio...,0,0,0,0,0,0
223546,==shame on you all!!!== \n\n You want to speak...,0,0,0,0,0,0
223547,MEL GIBSON IS A NAZI BITCH WHO MAKES SHITTY MO...,1,1,0,1,0,3


In [10]:
labels = ["toxic", "obscene", "threat", "insult", "identity_hate"]

## Data Preprocessing

In [11]:
import re
# Preprocess the data
stop_words = set(stopwords.words('english'))
def preprocess_text(text):
    text = text.lower()
    text = re.sub(r'http\S+', '', text)
    text = re.sub(r'[^\w\s\.\,\!\?]', '', text)
    tokens = word_tokenize(text)
    tokens = [word for word in tokens if not word in stop_words]
    tokens = [word for word in tokens if len(word) > 2]
    return tokens


The above function takes a string of text as input and performs several preprocessing steps to prepare it for further natural language processing tasks:


* Converts all text to lowercase to ensure consistency.
* Removes any numbers with a regular expression that looks for one or more digits.
* Strips out URLs, which typically start with "http" and continue until a space is encountered.
* Eliminates special characters, retaining only alphanumeric characters and whitespace.
* Tokenizes the cleaned text into individual words.
* Filters out any stop words from the tokens, which are common words that usually do not contribute to the meaning of the text.
* Discards words that are less than three characters long, as these are often not meaningful.

In [12]:
%%time
tqdm.pandas()
df['tokens'] = df['comment_text'].progress_apply(preprocess_text)
df['tokens'].head()

100%|██████████| 223549/223549 [02:53<00:00, 1291.17it/s]

CPU times: user 2min 40s, sys: 1.7 s, total: 2min 42s
Wall time: 2min 53s





0    [explanation, edits, made, username, hardcore,...
1    [daww, matches, background, colour, seemingly,...
2    [hey, man, really, trying, edit, war, guy, con...
3    [cant, make, real, suggestions, improvement, w...
4           [sir, hero, chance, remember, page, thats]
Name: tokens, dtype: object

### Embedding

In [14]:
!pip install fasttext

Collecting fasttext
  Downloading fasttext-0.9.2.tar.gz (68 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m10.2/68.8 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m68.8/68.8 kB[0m [31m851.7 kB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Collecting pybind11>=2.2 (from fasttext)
  Using cached pybind11-2.12.0-py3-none-any.whl (234 kB)
Building wheels for collected packages: fasttext
  Building wheel for fasttext (setup.py) ... [?25l[?25hdone
  Created wheel for fasttext: filename=fasttext-0.9.2-cp310-cp310-linux_x86_64.whl size=4227142 sha256=3652ddb6d53bee4c004d82cdcb88d1a7922ba1c1d3b596722b0664e87f9a4308
  Stored in directory: /root/.cache/pip/wheels/a5/13/75/f811c84a8ab36eedbaef977a6a58a98990e8e0f1967f98f394
Successfully built fasttex

In [15]:
%%time
import fasttext.util
fasttext.util.download_model('en', if_exists='ignore')
ft = fasttext.load_model('cc.en.300.bin')

Downloading https://dl.fbaipublicfiles.com/fasttext/vectors-crawl/cc.en.300.bin.gz

CPU times: user 13min 51s, sys: 1min 45s, total: 15min 37s
Wall time: 18min 43s




In [None]:
%%time
# Word2Vec embedding
model_w2v = Word2Vec(sentences=df['tokens'], vector_size=100, window=5, min_count=2, workers=4)

# Transform each comment into a vector
def comment_to_avg_vector(comment, model):
    words = comment
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    else:
        return np.mean(word_vectors, axis=0)

# X = np.array([comment_to_avg_vector(comment, model_w2v) for comment in df['tokens']])
X = []
for comment in tqdm(df['tokens']):
    X.append(comment_to_avg_vector(comment, model_w2v))

X = np.array(X)


100%|██████████| 223549/223549 [00:32<00:00, 6848.65it/s]


CPU times: user 3min 22s, sys: 1.67 s, total: 3min 24s
Wall time: 2min 24s


In [None]:
# # Preparing the target labels
# label_encoder = LabelEncoder()
# y = df.iloc[:, 1:6].values

In [None]:
y = df.iloc[:, 1:6].values
y

array([[0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0],
       ...,
       [0, 0, 0, 0, 0],
       [1, 1, 0, 1, 0],
       [0, 0, 0, 0, 0]])

In [None]:
y.shape

(223549, 5)

In [None]:
X.shape

(223549, 100)

In [None]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.15, stratify=df['total'], random_state=42)

## CNN

In [None]:
model = Sequential()
model.add(Dense(256, activation='relu', input_dim=100))
model.add(Dropout(0.5))
model.add(Dense(128, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(5, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

In [None]:
# Train the CNN model
model.fit(X_train, y_train, validation_data=(X_val, y_val), epochs=1, batch_size=32, verbose=1)

# Evaluate the model's performance on the validation set
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=2)
print(f"Validation Accuracy: {val_accuracy}")

1048/1048 - 2s - loss: 0.0736 - accuracy: 0.9950 - 2s/epoch - 2ms/step
Validation Accuracy: 0.9950496554374695


In [None]:
from tensorflow.keras.models import load_model

# Assuming 'model' is your trained model
predictions = model.predict(X_val)
predictions_binary = (predictions > 0.5).astype(int)




In [None]:
labels = ["toxic", "obscene", "threat", "insult", "identity_hate"]
ref = dict(zip(range(5), labels))

In [None]:
# Calculate accuracy for each label
accuracies = []
for i in range(predictions_binary.shape[1]):  # Loop over each label
    acc = accuracy_score(y_val[:, i], predictions_binary[:, i])
    accuracies.append(acc)
    print(f"Accuracy for label {i} - {ref[i]}: {acc}")

print(f'Gross Accuracy: {np.mean(np.all(y_val == predictions_binary, axis=1))}')

Accuracy for label 0 - toxic: 0.9448602868815793
Accuracy for label 1 - obscene: 0.9712223779560433
Accuracy for label 2 - threat: 0.9973160767005637
Accuracy for label 3 - insult: 0.964780962037396
Accuracy for label 4 - identity_hate: 0.9906659111919601
Gross Accuracy: 0.9151880237378105


In [None]:
# labels = ['toxic', 'obscene', 'threat', 'insult', 'identity_hate']
# plt.bar(labels, accuracies, color='blue')
# plt.xlabel('Labels')
# plt.ylabel('Accuracy')
# plt.title('Accuracy for each label')
# plt.xticks(rotation=45)
# plt.show()

In [None]:
from nltk.tokenize import word_tokenize
import re
import numpy as np
from gensim.models import Word2Vec

# Assuming model_w2v is your trained Word2Vec model
def preprocess_and_embed(text, model_w2v):
    stop_words = set(stopwords.words('english'))
    # Clean text
    text = text.lower()
    text = re.sub(r'[^\w\s]', '', text)
    text = re.sub(r'\d+', '', text)
    tokens = word_tokenize(text)
    tokens = [token for token in tokens if token not in stop_words and len(token) > 2]

    # Convert tokens to vector
    vector = np.mean([model_w2v.wv[token] for token in tokens if token in model_w2v.wv], axis=0)
    return np.array([vector])

# Update the function as necessary to match your specific preprocessing pipeline


In [None]:
def predict_toxicity(model, model_w2v):
    # Get input text from the user
    input_text = input("Enter text for toxicity prediction: ")

    # Preprocess the text and embed using Word2Vec
    X = preprocess_and_embed(input_text, model_w2v)

    # Check for empty or invalid input handling
    if X.shape[1] != model_w2v.vector_size or np.isnan(X).any():
        print("Error: Text contains words not in the vocabulary or other issues.")
        return

    # Predict using the model
    predictions = model.predict(X)
    labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']
    prediction_dict = dict(zip(labels, predictions.flatten()))
    print(prediction_dict)

    colors = {'<0.5': 'green', '=0.5': 'blue', '>0.5': 'red'}

    fig = go.Figure()
    for key, value in prediction_dict.items():
        color = colors['<0.5'] if value < 0.5 else (colors['=0.5'] if value == 0.5 else colors['>0.5'])
        fig.add_trace(go.Bar(x=[key], y=[value], marker=dict(color=color), text=f'{value:.2f}', textposition='auto'))

    fig.update_layout(
        title='Plot for Dictionary',
        xaxis=dict(title='Keys'),
        yaxis=dict(title='Values', range=[0, 1]),
        shapes=[dict(type='line', xref='paper', x0=0, y0=0.5, x1=1, y1=0.5, line=dict(color='gray', dash='dash'))],
        showlegend=False,
        autosize=False,
        width=800,
        height=600,
    )

    fig.show()

    return prediction_dict

In [None]:
# Example usage
predicted_labels = predict_toxicity(model, model_w2v)
# print(predicted_labels)


Enter text for toxicity prediction: I hate
{'toxic': 0.6022484, 'severe_toxic': 0.10361155, 'obscene': 0.011947266, 'threat': 0.14108397, 'insult': 0.031298485}


## LSTM

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import LSTM, Dense, Embedding, SpatialDropout1D

# Tokenization and integer encoding
tokenizer = Tokenizer()
tokenizer.fit_on_texts(train['tokens'])  # Assuming 'tokens' are already prepared as lists of words
sequences = tokenizer.texts_to_sequences(train['tokens'])

# Padding sequences
max_length = max(len(x) for x in sequences)  # Or choose a fixed length
X = pad_sequences(sequences, maxlen=max_length, padding='post')

# Preparing the target labels remains the same
y = train.iloc[:, 2:8].values

In [None]:
# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)


In [None]:
# LSTM Model Definition
model = Sequential()
model.add(Embedding(input_dim=len(tokenizer.word_index) + 1, output_dim=100, input_length=max_length))
model.add(SpatialDropout1D(0.2))
model.add(LSTM(100, return_sequences=False))
model.add(Dense(6, activation='sigmoid'))

model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
model.summary()


Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_1 (Embedding)     (None, 1250, 100)         260500    
                                                                 
 spatial_dropout1d_1 (Spati  (None, 1250, 100)         0         
 alDropout1D)                                                    
                                                                 
 lstm_1 (LSTM)               (None, 100)               80400     
                                                                 
 dense_7 (Dense)             (None, 6)                 606       
                                                                 
Total params: 341506 (1.30 MB)
Trainable params: 341506 (1.30 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [None]:
# Training
model.fit(X_train, y_train, epochs=5, batch_size=32, validation_data=(X_val, y_val), verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e8a6bf58be0>

In [None]:
# from tensorflow.keras.models import Sequential
# from tensorflow.keras.layers import LSTM, Dense, Dropout, Embedding
# from tensorflow.keras.preprocessing.sequence import pad_sequences

# # Padding sequences to ensure uniform input size
# max_length = 200  # You can choose a different length based on your data
# X_train_padded = pad_sequences(X_train, maxlen=max_length, padding='post')
# X_val_padded = pad_sequences(X_val, maxlen=max_length, padding='post')

# # Define the LSTM model
# model = Sequential()
# model.add(Embedding(input_dim=len(model_w2v.wv.index_to_key), output_dim=100, input_length=max_length))
# model.add(LSTM(128, return_sequences=True))
# model.add(GlobalMaxPooling1D())  # This layer can help in reducing the sequence to a vector
# model.add(Dropout(0.5))
# model.add(Dense(128, activation='relu'))
# model.add(Dropout(0.5))
# model.add(Dense(6, activation='sigmoid'))  # Output layer

# model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# # Display model architecture
# print(model.summary())


Model: "sequential_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 200, 100)          9173600   
                                                                 
 lstm (LSTM)                 (None, 200, 128)          117248    
                                                                 
 global_max_pooling1d (Glob  (None, 128)               0         
 alMaxPooling1D)                                                 
                                                                 
 dropout_12 (Dropout)        (None, 128)               0         
                                                                 
 dense_18 (Dense)            (None, 128)               16512     
                                                                 
 dropout_13 (Dropout)        (None, 128)               0         
                                                      

In [None]:
# Evaluate the model
val_loss, val_accuracy = model.evaluate(X_val, y_val, verbose=2)
print(f"Validation Accuracy: {val_accuracy}")

# Predict and evaluate using metrics
# predictions = model.predict(X_val_padded)
predictions = model.predict(X_val)
predictions_binary = (predictions > 0.5).astype(int)

print(classification_report(y_val, predictions_binary, target_names=['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']))


1/1 - 0s - loss: 0.2013 - accuracy: 1.0000 - 291ms/epoch - 291ms/step
Validation Accuracy: 1.0




               precision    recall  f1-score   support

        toxic       0.00      0.00      0.00         3
 severe_toxic       0.00      0.00      0.00         1
      obscene       0.00      0.00      0.00         3
       threat       0.00      0.00      0.00         0
       insult       0.00      0.00      0.00         3
identity_hate       0.00      0.00      0.00         0

    micro avg       0.00      0.00      0.00        10
    macro avg       0.00      0.00      0.00        10
 weighted avg       0.00      0.00      0.00        10
  samples avg       0.00      0.00      0.00        10



  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [None]:
# import re
# import string
# from nltk.stem.wordnet import WordNetLemmatizer
# from nltk.corpus import stopwords
# from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# def tokenize(text):
#     '''
#     Tokenize text and return a non-unique list of tokenized words found in the text.
#     Normalize to lowercase, strip punctuation, remove stop words, filter non-ascii characters.
#     Lemmatize the words and lastly drop words of length < 3.
#     '''
#     text = text.lower()
#     regex = re.compile('[' + re.escape(string.punctuation) + '0-9\\r\\t\\n]')
#     nopunct = regex.sub(" ", text)
#     words = nopunct.split(' ')
#     # remove any non ascii
#     words = [word.encode('ascii', 'ignore').decode('ascii') for word in words]
#     lmtzr = WordNetLemmatizer()
#     words = [lmtzr.lemmatize(w) for w in words]
#     words = [w for w in words if len(w) > 2]
#     return words

In [None]:
# vector = TfidfVectorizer(ngram_range=(1, 1), analyzer='word',
#                          tokenizer=tokenize, stop_words='english',
#                          strip_accents='unicode', use_idf=1, min_df=10)
# X_train = vector.fit_transform(train['comment_text'])
# X_test = vector.transform(test['comment_text'])



LookupError: 
**********************************************************************
  Resource [93mwordnet[0m not found.
  Please use the NLTK Downloader to obtain the resource:

  [31m>>> import nltk
  >>> nltk.download('wordnet')
  [0m
  For more information see: https://www.nltk.org/data.html

  Attempted to load [93mcorpora/wordnet[0m

  Searched in:
    - '/root/nltk_data'
    - '/usr/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/share/nltk_data'
    - '/usr/local/share/nltk_data'
    - '/usr/lib/nltk_data'
    - '/usr/local/lib/nltk_data'
**********************************************************************


## Transformers

In [None]:
from transformers import pipeline
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/1.15k [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.63G [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

In [None]:
candidate_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']


In [None]:
# Define the sequence to classify and the candidate labels
sequence = "Fuck you."


In [None]:
# Perform zero-shot classification
results = classifier(sequence, candidate_labels)

print("Labels:", results['labels'])
print("Scores:", results['scores'])

Labels: ['insult', 'threat', 'toxic', 'identity_hate', 'severe_toxic', 'obscene']
Scores: [0.43729498982429504, 0.39923781156539917, 0.04664387181401253, 0.03944968432188034, 0.03930214047431946, 0.03807152807712555]


In [None]:
train = train.sample(frac=0.00025, random_state=1)

In [None]:
train.shape

(40, 9)

In [None]:
def get_predictions(text):
    result = classifier(text, candidate_labels)
    # Convert probabilities to binary predictions based on a threshold
    predictions = [1 if score >= 0.5 else 0 for score in result['scores']]
    return dict(zip(result['labels'], predictions))

# Apply to a portion of the dataset for demonstration
train['predicted_labels'] = train['comment_text'].apply(get_predictions)


In [None]:
from sklearn.metrics import accuracy_score

# Function to calculate accuracy for each label
def calculate_accuracy(data, true_labels):
    accuracies = {}
    for label in candidate_labels:
        # Extract predicted and actual labels for the current category
        y_pred = data['predicted_labels'].apply(lambda x: x[label])
        y_true = data[label]
        accuracies[label] = accuracy_score(y_true, y_pred)
    return accuracies

# Assuming your actual labels are binary [0,1] and stored in the same DataFrame
accuracies = calculate_accuracy(train, candidate_labels)
print("Accuracies by label:", accuracies)


Accuracies by label: {'toxic': 0.925, 'severe_toxic': 0.975, 'obscene': 0.95, 'threat': 0.775, 'insult': 0.975, 'identity_hate': 1.0}


### Rows accuracy

In [None]:
from transformers import pipeline

# Load the classifier
classifier = pipeline("zero-shot-classification", model="facebook/bart-large-mnli")
candidate_labels = ['toxic', 'severe_toxic', 'obscene', 'threat', 'insult', 'identity_hate']

def get_predictions(text):
    result = classifier(text, candidate_labels)
    predictions = [1 if score >= 0.5 else 0 for score in result['scores']]
    return dict(zip(result['labels'], predictions))


In [None]:
# Apply the prediction function to the 'comment_text' column
train['predicted_labels'] = train['comment_text'].apply(get_predictions)

In [None]:
def calculate_row_accuracy(data, candidate_labels):
    # Function to check if all predicted labels match the actual labels for a row
    def is_row_correct(row):
        predicted_labels = row['predicted_labels']
        actual_labels = {label: row[label] for label in candidate_labels}
        return all(predicted_labels[label] == actual_labels[label] for label in candidate_labels)

    # Apply the row correctness check for each row in the DataFrame
    correct_rows = data.apply(is_row_correct, axis=1)
    accuracy = correct_rows.mean()
    return accuracy

# Calculate row-wise accuracy
row_accuracy = calculate_row_accuracy(train, candidate_labels)
print("Row-wise Accuracy:", row_accuracy)
