In [None]:
!pip install transformers
!pip install torch
!pip install scikit-learn




In [None]:
import pandas as pd
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import torch
from torch.utils.data import DataLoader, TensorDataset, random_split
from transformers import BertTokenizer, BertForSequenceClassification, AdamW, get_linear_schedule_with_warmup
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from tqdm.notebook import tqdm


# Loading Data

In [None]:
# Mount Google Drive at /content/drive so the dataset CSVs stored there can be
# read with ordinary file paths. Requires the google.colab package.
from google.colab import drive
drive.mount('/content/drive')


Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
# Folder on the mounted Drive that holds both datasets. Keeping it in one place
# means a different location only has to be changed here.
DATA_DIR = '/content/drive/MyDrive/NLP labs/NLP project'

# Tweets with a binary Toxicity label.
df = pd.read_csv(f'{DATA_DIR}/toxicTweet.csv')
# Tweets with a Toxicity label and an additional `persona` column.
df2 = pd.read_csv(f'{DATA_DIR}/ToxicTweetsWithPersona.csv')
df2.head()

Unnamed: 0.1,Unnamed: 0,Toxicity,tweet,persona
0,0,0,@user when a father is dysfunctional and is s...,i
1,1,0,@user @user thanks for #lyft credit i can't us...,
2,2,0,bihday your majesty,i
3,3,0,#model i love u take with u all the time in ...,
4,4,0,factsguide: society now #motivation,disputes


# Data Cleaning

In [None]:
# Summarize the tweet-only dataset: column names, dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10000 non-null  int64 
 1   Toxicity    10000 non-null  int64 
 2   tweet       10000 non-null  object
dtypes: int64(2), object(1)
memory usage: 234.5+ KB


In [None]:
# Count missing values in each column.
df.isnull().sum()

Unnamed: 0    0
Toxicity      0
tweet         0
dtype: int64

In [None]:
# Count rows that are exact duplicates of an earlier row.
df.duplicated().sum()

0

# Text Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the frequency dictionary
# (maps each cleaned token to the number of times clean_text has produced it)
frequency = {}

# NLTK resources shared by every clean_text call. They are built on the first
# call rather than when this cell runs, so the cell can be executed before the
# NLTK corpora have been downloaded.
_clean_text_resources = {}

def clean_text(text):
    """Tokenize, filter, lowercase and lemmatize a piece of text.

    The text is split with NLTK's ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lowercase form is an English stop word, are dropped;
    the remaining tokens are lowercased and passed through
    ``WordNetLemmatizer.lemmatize``.

    Side effect: every kept token increments its count in the module-level
    ``frequency`` dictionary.

    Args:
        text: The string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Building the stop-word set on every call repeats the same work for each
    # row; build it (and the lemmatizer) once and reuse it afterwards. Both
    # entries are stored together so a failed first attempt leaves the cache
    # empty and is simply retried on the next call.
    if not _clean_text_resources:
        stop_word_set = set(stopwords.words('english'))
        _clean_text_resources.update(
            stop_words=stop_word_set,
            lemmatizer=WordNetLemmatizer(),
        )
    stop_words = _clean_text_resources['stop_words']
    lemmatizer = _clean_text_resources['lemmatizer']

    # Tokenize the text
    tokens = word_tokenize(text)

    # Clean and preprocess the tokens
    cleaned_tokens = []
    for token in tokens:
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            cleaned_tokens.append(lemmatizer.lemmatize(lowered))

    # Update the word frequency
    for token in cleaned_tokens:
        frequency[token] = frequency.get(token, 0) + 1

    # Join the cleaned tokens to form a cleaned text
    return ' '.join(cleaned_tokens)

In [None]:
import nltk
# Fetch the NLTK data used by clean_text: the stop-word list, the tokenizer
# models and WordNet. Newer NLTK releases load the tokenizer from 'punkt_tab'
# rather than 'punkt', so both are requested to keep word_tokenize working
# whichever NLTK version is installed.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Replace every tweet with its cleaned form, then preview the result.
df['tweet'] = df['tweet'].apply(clean_text)
df.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Toxicity,tweet
0,0,0,user father dysfunctional selfish drag kid dys...
1,1,0,user user thanks lyft credit ca use cause offe...
2,2,0,bihday majesty
3,3,0,model love u take u time
4,4,0,factsguide society motivation


# Logistic Regression Baseline

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk


# Hold out 20% of the rows for testing; the split is shuffled with a fixed seed
train_df, test_df = train_test_split(df, shuffle=True, random_state=42, test_size=0.2)

# TF-IDF features feeding a logistic-regression classifier
toxic_tweet_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LogisticRegression()),
])

# Fit on the training tweets and their labels
toxic_tweet_pipeline.fit(train_df['tweet'], train_df['Toxicity'])

# Predict labels for the held-out tweets
y_pred = toxic_tweet_pipeline.predict(test_df['tweet'])

# Report overall accuracy followed by the per-class metrics
test_accuracy = accuracy_score(test_df["Toxicity"], y_pred)
print(f'Accuracy of Logistic Regression: {test_accuracy}')
print('Classification Report:')
print(classification_report(test_df['Toxicity'], y_pred))

Accuracy of Logistic Regression: 0.905
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1012
           1       0.93      0.87      0.90       988

    accuracy                           0.91      2000
   macro avg       0.91      0.90      0.90      2000
weighted avg       0.91      0.91      0.90      2000



# SVC model

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
from sklearn.utils import shuffle
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import nltk
import pandas as pd
from sklearn.svm import LinearSVC
# Hold out 20% of the rows for testing; the split is shuffled with a fixed seed
train_df, test_df = train_test_split(df, shuffle=True, random_state=42, test_size=0.2)

# TF-IDF features feeding a linear support-vector classifier
toxic_tweet_pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('classifier', LinearSVC()),
])

# Fit on the training tweets and their labels
toxic_tweet_pipeline.fit(train_df['tweet'], train_df['Toxicity'])

# Predict labels for the held-out tweets
y_pred = toxic_tweet_pipeline.predict(test_df['tweet'])

# Report overall accuracy followed by the per-class metrics
test_accuracy = accuracy_score(test_df["Toxicity"], y_pred)
print(f'Accuracy of LinearSVC: {test_accuracy}')
print('Classification Report:')
print(classification_report(test_df['Toxicity'], y_pred))

Accuracy of LinearSVC: 0.9085
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1012
           1       0.92      0.89      0.91       988

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000



# BERT model

In [None]:
# Split the dataset
# 80% train / 20% test, shuffled with a fixed random_state so the split is
# the same on every run.
train_df, test_df = train_test_split(df, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# Load BERT tokenizer and model
# Both come from the 'bert-base-uncased' checkpoint; the sequence-classification
# model is configured with two output labels.
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')
model = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Encode the training tweets (truncated to at most 128 tokens, padded, returned
# as PyTorch tensors) and collect their labels in a tensor
train_encodings = tokenizer(
    train_df['tweet'].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)
train_labels = torch.tensor(train_df['Toxicity'].tolist())

# Encode the test tweets with the same settings
test_encodings = tokenizer(
    test_df['tweet'].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)
test_labels = torch.tensor(test_df['Toxicity'].tolist())

In [None]:
# Bundle input ids, attention masks and labels into tensor datasets
train_dataset = TensorDataset(
    train_encodings['input_ids'],
    train_encodings['attention_mask'],
    train_labels,
)
test_dataset = TensorDataset(
    test_encodings['input_ids'],
    test_encodings['attention_mask'],
    test_labels,
)

# Serve both datasets in batches of 16; only the training batches are shuffled
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=16)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=16)

In [None]:
# Set up optimizer and scheduler
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation. eps and weight_decay are passed explicitly with the
# values the transformers class used by default (1e-6 and 0.0), because
# PyTorch's own defaults differ and would otherwise change the training setup.
optimizer = torch.optim.AdamW(model.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# No warmup; the schedule spans three passes over the training batches, which
# matches the three epochs run by the training loop.
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=0, num_training_steps=len(train_loader) * 3)



In [None]:
# Fine-tune on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

for epoch in range(3):
    print(f"Epoch {epoch + 1}/{3}")
    model.train()
    for input_ids, attention_mask, labels in train_loader:
        # Move the batch to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous batch
        optimizer.zero_grad()

        # Passing labels makes the model return the loss alongside its outputs
        outputs = model(input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()

        # Update the weights, then advance the learning-rate schedule
        optimizer.step()
        scheduler.step()

    print("Training complete for epoch", epoch + 1)

# Write the fine-tuned model and its tokenizer to the same directory
model.save_pretrained('toxicity_model')
tokenizer.save_pretrained('toxicity_model')

Epoch 1/3
Training complete for epoch 1
Epoch 2/3
Training complete for epoch 2
Epoch 3/3
Training complete for epoch 3


('toxicity_model/tokenizer_config.json',
 'toxicity_model/special_tokens_map.json',
 'toxicity_model/vocab.txt',
 'toxicity_model/added_tokens.json')

In [None]:
from sklearn.metrics import classification_report

# Switch the model to evaluation mode and collect predictions for the test set
model.eval()
predictions = []
true_labels = []

# Gradients are not needed for inference
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model(input_ids, attention_mask=attention_mask).logits
        # The predicted class is the index of the largest logit
        batch_predictions = torch.argmax(logits, dim=1)
        predictions.extend(batch_predictions.cpu().numpy())
        true_labels.extend(labels.cpu().numpy())

# Overall accuracy plus per-class precision, recall and F1
accuracy = accuracy_score(true_labels, predictions)
report = classification_report(
    true_labels,
    predictions,
    target_names=['Not Toxic', 'Toxic'],
)

print(f"Accuracy: {accuracy}")
print("Classification Report:")
print(report)

Accuracy: 0.926
Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.93      0.92      0.93      1012
       Toxic       0.92      0.93      0.93       988

    accuracy                           0.93      2000
   macro avg       0.93      0.93      0.93      2000
weighted avg       0.93      0.93      0.93      2000



# Data Cleaning (dataset with persona column)

In [None]:
# Summarize the persona dataset: column names, dtypes and non-null counts.
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000 entries, 0 to 9999
Data columns (total 4 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  10000 non-null  int64 
 1   Toxicity    10000 non-null  int64 
 2   tweet       10000 non-null  object
 3   persona     9996 non-null   object
dtypes: int64(2), object(2)
memory usage: 312.6+ KB


In [None]:
# Count missing values in each column.
df2.isnull().sum()

Unnamed: 0    0
Toxicity      0
tweet         0
persona       4
dtype: int64

In [None]:
# Count rows that are exact duplicates of an earlier row.
df2.duplicated().sum()

0

# Text Preprocessing

In [None]:
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

# Initialize the frequency dictionary
# Running this cell starts the token counts again from empty. clean_text is
# also defined in an earlier cell; the definition below rebinds the name.
frequency = {}

# NLTK resources shared by every clean_text call. They are built on the first
# call rather than when this cell runs, so the cell can be executed before the
# NLTK corpora have been downloaded.
_clean_text_resources = {}

def clean_text(text):
    """Tokenize, filter, lowercase and lemmatize a piece of text.

    The text is split with NLTK's ``word_tokenize``. Tokens that are not purely
    alphabetic, or whose lowercase form is an English stop word, are dropped;
    the remaining tokens are lowercased and passed through
    ``WordNetLemmatizer.lemmatize``.

    Side effect: every kept token increments its count in the module-level
    ``frequency`` dictionary.

    Args:
        text: The string to clean.

    Returns:
        The cleaned tokens joined by single spaces.
    """
    # Building the stop-word set on every call repeats the same work for each
    # row; build it (and the lemmatizer) once and reuse it afterwards. Both
    # entries are stored together so a failed first attempt leaves the cache
    # empty and is simply retried on the next call.
    if not _clean_text_resources:
        stop_word_set = set(stopwords.words('english'))
        _clean_text_resources.update(
            stop_words=stop_word_set,
            lemmatizer=WordNetLemmatizer(),
        )
    stop_words = _clean_text_resources['stop_words']
    lemmatizer = _clean_text_resources['lemmatizer']

    # Tokenize the text
    tokens = word_tokenize(text)

    # Clean and preprocess the tokens
    cleaned_tokens = []
    for token in tokens:
        lowered = token.lower()
        if token.isalpha() and lowered not in stop_words:
            cleaned_tokens.append(lemmatizer.lemmatize(lowered))

    # Update the word frequency
    for token in cleaned_tokens:
        frequency[token] = frequency.get(token, 0) + 1

    # Join the cleaned tokens to form a cleaned text
    return ' '.join(cleaned_tokens)

In [None]:
import nltk
# Fetch the NLTK data used by clean_text: the stop-word list, the tokenizer
# models and WordNet. Newer NLTK releases load the tokenizer from 'punkt_tab'
# rather than 'punkt', so both are requested to keep word_tokenize working
# whichever NLTK version is installed.
for resource in ('stopwords', 'punkt', 'punkt_tab', 'wordnet'):
    nltk.download(resource)

# Replace every tweet in the persona dataset with its cleaned form, then preview.
df2['tweet'] = df2['tweet'].apply(clean_text)
df2.head()

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Unnamed: 0.1,Unnamed: 0,Toxicity,tweet,persona
0,0,0,user father dysfunctional selfish drag kid dys...,i
1,1,0,user user thanks lyft credit ca use cause offe...,
2,2,0,bihday majesty,i
3,3,0,model love u take u time,
4,4,0,factsguide society motivation,disputes


# Logistic Regression Baseline with persona column

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, classification_report
# Hold out 20% of the persona dataset for testing (shuffled, fixed seed)
train_df, test_df = train_test_split(df2, shuffle=True, random_state=42, test_size=0.2)

# Feature extraction: TF-IDF over the tweet text plus a one-hot encoding of the
# persona column. Persona values not seen while fitting are ignored at
# prediction time instead of raising an error.
feature_union = ColumnTransformer(
    transformers=[
        ('text', TfidfVectorizer(), 'tweet'),
        ('persona', OneHotEncoder(handle_unknown='ignore'), ['persona']),
    ],
)
toxic_tweet_pipeline = Pipeline([
    ('union', feature_union),
    ('classifier', LogisticRegression()),
])

# Fit on the training rows
toxic_tweet_pipeline.fit(train_df[['tweet', 'persona']], train_df['Toxicity'])

# Predict labels for the held-out rows
y_pred = toxic_tweet_pipeline.predict(test_df[['tweet', 'persona']])

# Report overall accuracy followed by the per-class metrics
test_accuracy = accuracy_score(test_df["Toxicity"], y_pred)
print(f'Accuracy of Logistic Regression: {test_accuracy}')
print('Classification Report:')
print(classification_report(test_df['Toxicity'], y_pred))


Accuracy of Logistic Regression: 0.9025
Classification Report:
              precision    recall  f1-score   support

           0       0.88      0.94      0.91      1012
           1       0.93      0.87      0.90       988

    accuracy                           0.90      2000
   macro avg       0.90      0.90      0.90      2000
weighted avg       0.90      0.90      0.90      2000



# SVC model with persona column

In [None]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report, accuracy_score
# This cell's own imports bring in LogisticRegression but not the classifier it
# actually uses; import LinearSVC here so the cell does not rely on an import
# made by an earlier cell.
from sklearn.svm import LinearSVC

# Split the dataset into training and testing sets
train_df, test_df = train_test_split(df2, test_size=0.2, random_state=42, shuffle=True)


# Define the pipeline using ColumnTransformer:
# TF-IDF over the tweet text plus a one-hot encoding of the persona column.
# Persona values not seen while fitting are ignored at prediction time.
toxic_tweet_pipeline = Pipeline([
    ('union', ColumnTransformer(
        transformers=[
            ('text', TfidfVectorizer(), 'tweet'),
            ('persona', OneHotEncoder(handle_unknown='ignore'), ['persona'])
        ],

    )),
    ('classifier', LinearSVC())
])

# Model Training
toxic_tweet_pipeline.fit(train_df[['tweet', 'persona']], train_df['Toxicity'])

# Model Evaluation
y_pred = toxic_tweet_pipeline.predict(test_df[['tweet', 'persona']])

print(f'Accuracy of LinearSVC: {accuracy_score(test_df["Toxicity"], y_pred)}')
print('Classification Report:')
print(classification_report(test_df['Toxicity'], y_pred))


Accuracy of LinearSVC: 0.9105
Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.92      0.91      1012
           1       0.92      0.90      0.91       988

    accuracy                           0.91      2000
   macro avg       0.91      0.91      0.91      2000
weighted avg       0.91      0.91      0.91      2000



# BERT model with persona column

In [None]:
# Concatenate 'tweet' and 'persona' columns into a single string
SEP = "[SEP]"
# The persona column contains missing values. They are replaced with an empty
# string before the cast, because astype(str) on its own would turn each one
# into the literal word "nan" and feed that to the model as if it were a persona.
df2['input_text'] = df2['tweet'] + ' ' + SEP + ' ' + df2['persona'].fillna('').astype(str)

In [None]:
# Split the dataset
# 80% train / 20% test, shuffled with a fixed random_state so the split is
# the same on every run.
train_df2, test_df2 = train_test_split(df2, test_size=0.2, random_state=42, shuffle=True)

In [None]:
# Load BERT tokenizer and model
# A second tokenizer/model pair is loaded from the 'bert-base-uncased'
# checkpoint for the persona experiment; the model has two output labels.
tokenizer2 = BertTokenizer.from_pretrained('bert-base-uncased')
model2 = BertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2)

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [None]:
# Encode the combined tweet/persona training strings (truncated to at most 128
# tokens, padded, returned as PyTorch tensors) and collect their labels
train_encodings2 = tokenizer2(
    train_df2['input_text'].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)
train_labels2 = torch.tensor(train_df2['Toxicity'].tolist())

In [None]:
# Encode the combined tweet/persona test strings (truncated to at most 128
# tokens, padded, returned as PyTorch tensors) and collect their labels
test_encodings2 = tokenizer2(
    test_df2['input_text'].tolist(),
    truncation=True,
    padding=True,
    max_length=128,
    return_tensors='pt',
)
test_labels2 = torch.tensor(test_df2['Toxicity'].tolist())

In [None]:
# Bundle input ids, attention masks and labels into tensor datasets
train_dataset2 = TensorDataset(
    train_encodings2['input_ids'],
    train_encodings2['attention_mask'],
    train_labels2,
)
test_dataset2 = TensorDataset(
    test_encodings2['input_ids'],
    test_encodings2['attention_mask'],
    test_labels2,
)

In [None]:
# Serve both datasets in batches of 16; only the training batches are shuffled
train_loader2 = DataLoader(train_dataset2, shuffle=True, batch_size=16)
test_loader2 = DataLoader(test_dataset2, shuffle=False, batch_size=16)

In [None]:
# Set up optimizer and scheduler
# The AdamW class shipped with transformers is deprecated in favour of the
# PyTorch implementation. eps and weight_decay are passed explicitly with the
# values the transformers class used by default (1e-6 and 0.0), because
# PyTorch's own defaults differ and would otherwise change the training setup.
optimizer2 = torch.optim.AdamW(model2.parameters(), lr=2e-5, eps=1e-6, weight_decay=0.0)
# No warmup; the schedule spans three passes over the training batches, which
# matches the three epochs run by the training loop.
scheduler2 = get_linear_schedule_with_warmup(optimizer2, num_warmup_steps=0, num_training_steps=len(train_loader2) * 3)



In [None]:
# Fine-tune on the GPU when one is available, otherwise on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model2.to(device)

for epoch in range(3):
    print(f"Epoch {epoch + 1}/{3}")
    model2.train()
    for input_ids, attention_mask, labels in train_loader2:
        # Move the batch to the same device as the model
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        # Clear gradients left over from the previous batch
        optimizer2.zero_grad()

        # Passing labels makes the model return the loss alongside its outputs
        outputs = model2(input_ids, attention_mask=attention_mask, labels=labels)
        outputs.loss.backward()

        # Update the weights, then advance the learning-rate schedule
        optimizer2.step()
        scheduler2.step()

    print("Training complete for epoch", epoch + 1)

# Write the fine-tuned model and its tokenizer to the same directory
model2.save_pretrained('toxicity_model2')
tokenizer2.save_pretrained('toxicity_model2')

Epoch 1/3
Training complete for epoch 1
Epoch 2/3
Training complete for epoch 2
Epoch 3/3
Training complete for epoch 3


('toxicity_model2/tokenizer_config.json',
 'toxicity_model2/special_tokens_map.json',
 'toxicity_model2/vocab.txt',
 'toxicity_model2/added_tokens.json')

In [None]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Switch the model to evaluation mode and collect predictions for the test set
model2.eval()
predictions2 = []
true_labels2 = []

# Gradients are not needed for inference
with torch.no_grad():
    for input_ids, attention_mask, labels in test_loader2:
        input_ids = input_ids.to(device)
        attention_mask = attention_mask.to(device)
        labels = labels.to(device)

        logits = model2(input_ids, attention_mask=attention_mask).logits
        # The predicted class is the index of the largest logit
        batch_predictions = torch.argmax(logits, dim=1)
        predictions2.extend(batch_predictions.cpu().numpy())
        true_labels2.extend(labels.cpu().numpy())

# Overall accuracy plus per-class precision, recall and F1
accuracy2 = accuracy_score(true_labels2, predictions2)
report2 = classification_report(
    true_labels2,
    predictions2,
    target_names=['Not Toxic', 'Toxic'],
)

# Confusion matrix of true labels against predicted labels
conf_matrix = confusion_matrix(true_labels2, predictions2)
print("Confusion Matrix:")
print(conf_matrix)

print(f"Accuracy: {accuracy2}")
print("Classification Report:")
print(report2)

Confusion Matrix:
[[949  63]
 [ 48 940]]
Accuracy: 0.9445
Classification Report:
              precision    recall  f1-score   support

   Not Toxic       0.95      0.94      0.94      1012
       Toxic       0.94      0.95      0.94       988

    accuracy                           0.94      2000
   macro avg       0.94      0.94      0.94      2000
weighted avg       0.94      0.94      0.94      2000



In [None]:
from transformers import BertTokenizer, BertForSequenceClassification
import torch

# Load the saved model and tokenizer
model_path = 'toxicity_model2'
model = BertForSequenceClassification.from_pretrained(model_path)
tokenizer = BertTokenizer.from_pretrained(model_path)

# Example sentences
sentences = [
    "This is an example toxic tweet.",
    "I appreciate the positive vibes from this community.",
    "The language used in this comment is inappropriate.",
    "Feeling good about the progress we're making.",
]

# The saved model was fine-tuned on strings of the form
# "<cleaned tweet> [SEP] <persona>", so the example sentences go through the
# same clean_text step and the same template before being tokenized. No persona
# is known for these examples, so that slot is left empty.
# Note: clean_text also adds these tokens to the global `frequency` counts.
personas = [''] * len(sentences)
model_inputs = [
    clean_text(sentence) + ' ' + SEP + ' ' + persona
    for sentence, persona in zip(sentences, personas)
]

# Tokenize and encode the sentences
input_encodings = tokenizer(model_inputs, truncation=True, padding=True, max_length=128, return_tensors='pt')

# Make predictions
model.eval()
with torch.no_grad():
    outputs = model(**input_encodings)
    logits = outputs.logits

# Convert logits to probabilities and get predicted classes
probs = torch.nn.functional.softmax(logits, dim=-1)
predicted_classes = torch.argmax(probs, dim=-1).tolist()

# Output the results (the original sentence is shown, not the cleaned input)
for sentence, predicted_class in zip(sentences, predicted_classes):
    label = "Toxic" if predicted_class == 1 else "Not Toxic"
    print(f"Sentence: {sentence}\nPredicted: {label}\n")


Sentence: This is an example toxic tweet.
Predicted: Toxic

Sentence: I appreciate the positive vibes from this community.
Predicted: Not Toxic

Sentence: The language used in this comment is inappropriate.
Predicted: Toxic

Sentence: Feeling good about the progress we're making.
Predicted: Not Toxic

