In [4]:
import pandas as pd
import numpy as np
from collections import defaultdict
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from textblob import TextBlob
from sklearn.model_selection import train_test_split
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')
nltk.download('omw-1.4')

# preprocessing the data

def preprocess_sentence(sentence):
    stop_words = set(stopwords.words('english'))
    stemmer = PorterStemmer()
    lemmatizer = WordNetLemmatizer()
    
    # Tokenize the sentence
    tokens = word_tokenize(sentence.lower())
    
    # Remove stop words
    tokens = [token for token in tokens if token not in stop_words]
    
    # Stem each word
    stemmed_tokens = [stemmer.stem(token) for token in tokens]
    
    # Lemmatize each word
    lemmatized_tokens = [lemmatizer.lemmatize(token) for token in stemmed_tokens]
    
    # Join the tokens back into a sentence
    cleaned_sentence = ' '.join(lemmatized_tokens)
    
    return cleaned_sentence


def detect_offensive(statement):
    """
    This function takes in a statement as input and returns True if it is offensive, and False otherwise.
    """
    blob = TextBlob(statement)
    polarity = blob.sentiment.polarity
    if polarity < -0.5:
        return True
    else:
        return False

# Load the data into a pandas dataframe
# df = pd.read_csv("data.csv")
# print(df['text'].head(5))
# df['text'] = df['text'].apply(preprocess_sentence)
# print(df['text'].head(5))

df2 = pd.read_csv("data2.csv")
df2['tweet'] = df2['tweet'].apply(preprocess_sentence)

# Split the data into training and testing sets
# X_train, X_test, y_train, y_test = train_test_split(df['text'], df['task1'], test_size=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(df2['tweet'], df2['class'], test_size=0.2, random_state=42)

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [6]:
# Convert the text data into numerical features using CountVectorizer
vectorizer = CountVectorizer()
X_train = vectorizer.fit_transform(X_train)
X_test = vectorizer.transform(X_test)

# Train a Naive Bayes classifier on the training data
nb = MultinomialNB()
nb.fit(X_train, y_train)

# Evaluate the performance of the classifier on the testing data
accuracy = nb.score(X_test, y_test)
precision = nb.score(X_test, y_test == 1)
print("Accuracy:", accuracy)
print("Precision:", precision)

Accuracy: 0.8323185011709602
Precision: 0.7386416861826698


In [20]:
# the method Used:  score = Summation of (1 + log(tf))

def basemodel(rs):
    X_train, X_test, y_train, y_test = train_test_split(df2['tweet'], df2['class'], test_size=0.2, random_state=rs)
    # evaluting the vocabulary
    vocab = set()
    for sentence in X_train:
        words = sentence.split()
        for word in words:
            vocab.add(word)

    # Create a dictionary to store the count of each word in each class
    # array of index = word for all labels
    # count how many time a word occured in each labels

    class_word_counts = {}
    for c in np.unique(y_train):
        class_word_counts[c] = {}
        for word in vocab:
            class_word_counts[c][word] = 0

    # Count the number of occurrences of each word in each class
    for i in range(len(X_train)):
        words = X_train.iloc[i].split()
        c = y_train.iloc[i]
        for word in words:
            class_word_counts[c][word] += 1

    doc_freq = {}

    for word in vocab:
      doc_freq[word] = 0
      for c in np.unique(y_train):
        if class_word_counts[c][word] != 0:
          doc_freq[word] += 1


    # Compute the total count of words in each class
    class_word_totals = {}
    for c in np.unique(y_train):
        class_word_totals[c] = sum(class_word_counts[c].values())

    # Define a function to predict the class of a new text sample
    def predict(text):
        text = preprocess_sentence(text)
        words = text.split()
        probs = {}
        for c in np.unique(y_train):
            score = 0
            for word in words:
              if word in vocab:
                if class_word_counts[c][word] != 0:
                  score += (1 + np.log(class_word_counts[c][word]))
                else:
                  score += 0

            probs[c] = score
        return max(probs, key=probs.get)

    # Evaluate the performance of the classifier on the testing data
    correct = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for i in range(len(X_test)):
        pred = predict(X_test.iloc[i])
        if pred == 2:
          # off = detect_offensive(X_test.iloc[i])
          # if off == True:
          #   if y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          #     correct += 1
          # elif y_test.iloc[i] == 2:
          #   correct += 1
          if y_test.iloc[i] == 2:
            tn += 1
            correct += 1
          else:
            fn += 1
        elif y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          tp += 1
          correct += 1
        else:
          fp += 1
    accuracy = correct / len(y_test)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print("Accuracy:", accuracy)
    print("Percision:", precision)
    print("Recall:", recall)
    return accuracy, precision, recall


avg_accuracy = 0
avg_precision = 0
avg_recall = 0
for i in range(10):
    accuracy, precision, recall = basemodel(42 + 2 * i)
    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall

avg_accuracy /= 10
avg_precision /= 10
avg_recall /= 10

print("average accuracy:", avg_accuracy)
print("average precision: ", avg_precision)
print("average recall: ", avg_recall)
f1_score = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)
print(f1_score)

Accuracy: 0.8313817330210773
Percision: 0.8302687411598303
Recall: 1.0
Accuracy: 0.8238875878220141
Percision: 0.8227251296558227
Recall: 1.0
Accuracy: 0.839344262295082
Percision: 0.8382075471698113
Recall: 1.0
Accuracy: 0.8374707259953161
Percision: 0.8371873525247758
Recall: 0.9988738738738738
Accuracy: 0.8351288056206089
Percision: 0.8340425531914893
Recall: 0.9994334277620397
Accuracy: 0.8384074941451991
Percision: 0.8382838283828383
Recall: 0.998876404494382
Accuracy: 0.8304449648711943
Percision: 0.829636621047664
Recall: 0.9994314951677089
Accuracy: 0.8449648711943794
Percision: 0.8441190363722249
Recall: 0.9994407158836689
Accuracy: 0.8426229508196721
Percision: 0.8416075650118203
Recall: 0.9994385176866929
Accuracy: 0.8374707259953161
Percision: 0.83719829626124
Recall: 0.9983069977426636
average accuracy: 0.8361124121779862
average precision:  0.8353276670777516
average recall:  0.9993801432611029
0.9100194362174151


In [19]:
# the method Used:  score = Summation of (1 + log(tf)) * log(N/df)

def basemodel2(rs):
    X_train, X_test, y_train, y_test = train_test_split(df2['tweet'], df2['class'], test_size=0.2, random_state=rs)

    # evaluting the vocabulary
    vocab = set()
    for sentence in X_train:
        words = sentence.split()
        for word in words:
            vocab.add(word)

    # Create a dictionary to store the count of each word in each class
    # array of index = word for all labels
    # count how many time a word occured in each labels

    class_word_counts = {}
    for c in np.unique(y_train):
        class_word_counts[c] = {}
        for word in vocab:
            class_word_counts[c][word] = 0

    # Count the number of occurrences of each word in each class
    for i in range(len(X_train)):
        words = X_train.iloc[i].split()
        c = y_train.iloc[i]
        for word in words:
            class_word_counts[c][word] += 1

    doc_freq = {}

    for word in vocab:
      doc_freq[word] = 0
      for c in np.unique(y_train):
        if class_word_counts[c][word] != 0:
          doc_freq[word] += 1


    # Compute the total count of words in each class
    class_word_totals = {}
    for c in np.unique(y_train):
        class_word_totals[c] = sum(class_word_counts[c].values())

    # Define a function to predict the class of a new text sample
    def predict(text):
        text = preprocess_sentence(text)
        words = text.split()
        probs = {}
        for c in np.unique(y_train):
            score = 0
            for word in words:
              if word in vocab:
                if class_word_counts[c][word] != 0:
                  score += (1 + np.log(class_word_counts[c][word])) * (1 + np.log(len(np.unique(y_train))/doc_freq[word]))
                else:
                  score += 0

            probs[c] = score
        return max(probs, key=probs.get)

    # Evaluate the performance of the classifier on the testing data
    correct = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for i in range(len(X_test)):
        pred = predict(X_test.iloc[i])
        if pred == 2:
          # off = detect_offensive(X_test.iloc[i])
          # if off == True:
          #   if y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          #     correct += 1
          # elif y_test.iloc[i] == 2:
          #   correct += 1
          if y_test.iloc[i] == 2:
            tn += 1
            correct += 1
          else:
            fn += 1
        elif y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          tp += 1
          correct += 1
        else:
          fp += 1
    accuracy = correct / len(y_test)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print("Accuracy:", accuracy)
    print("Percision:", precision)
    print("Recall:", recall)
    return accuracy, precision, recall


avg_accuracy = 0
avg_precision = 0
avg_recall = 0
for i in range(10):
    accuracy, precision, recall = basemodel2(42 + 2 * i)
    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall

avg_accuracy /= 10
avg_precision /= 10
avg_recall /= 10

print("average accuracy:", avg_accuracy)
print("average precision: ", avg_precision)
print("average recall: ", avg_recall)
f1_score = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)
print(f1_score)

Accuracy: 0.8374707259953161
Percision: 0.8353889943074004
Recall: 1.0
Accuracy: 0.8281030444964871
Percision: 0.8262310606060606
Recall: 1.0
Accuracy: 0.8444964871194379
Percision: 0.8429046037019459
Recall: 0.9994372537985369
Accuracy: 0.8398126463700234
Percision: 0.8391674550614948
Recall: 0.9988738738738738
Accuracy: 0.839344262295082
Percision: 0.8376068376068376
Recall: 0.9994334277620397
Accuracy: 0.8444964871194379
Percision: 0.8434535104364327
Recall: 0.998876404494382
Accuracy: 0.8374707259953161
Percision: 0.8358705994291151
Recall: 0.9988629903354178
Accuracy: 0.8463700234192038
Percision: 0.8456439393939394
Recall: 0.9988814317673378
Accuracy: 0.8477751756440282
Percision: 0.8460076045627376
Recall: 0.9994385176866929
Accuracy: 0.8426229508196721
Percision: 0.8419047619047619
Recall: 0.9977426636568849
average accuracy: 0.8407962529274003
average precision:  0.8394179367010726
average recall:  0.9991546563375167
0.9123472668348473


In [14]:
  # the method Used: 

def tests(rs):
    X_train, X_test, y_train, y_test = train_test_split(df2['tweet'], df2['class'], test_size=0.2, random_state=rs)

    # evaluting the vocabulary
    vocab = set()
    for sentence in X_train:
        words = sentence.split()
        for word in words:
            vocab.add(word)

    # Create a dictionary to store the count of each word in each class
    # array of index = word for all labels
    # count how many time a word occured in each labels

    class_word_counts = {}
    for c in np.unique(y_train):
        class_word_counts[c] = {}
        for word in vocab:
            class_word_counts[c][word] = 0

    # Count the number of occurrences of each word in each class
    for i in range(len(X_train)):
        words = X_train.iloc[i].split()
        c = y_train.iloc[i]
        for word in words:
            class_word_counts[c][word] += 1

    # Prior probability of a class
    class_priors = {}
    for c in np.unique(y_train):
        class_priors[c] = (len(y_train[y_train == c]) + 1) / (len(y_train) + len(np.unique(y_train)))


    # Compute the total count of words in each class
    class_word_totals = {}
    for c in np.unique(y_train):
        class_word_totals[c] = sum(class_word_counts[c].values())

    # Define a function to predict the class of a new text sample
    def predict(text):
        text = preprocess_sentence(text)
        words = text.split()
        probs = {}
        for c in np.unique(y_train):
            log_prob = np.log(class_priors[c])
            # log_prob = 1;
            for word in words:
              count = 1 # Laplace smoothing
              if word in vocab:
                  count += class_word_counts[c][word] 
              log_prob += np.log(count / (class_word_totals[c] + len(vocab)))
            probs[c] = log_prob
        return max(probs, key=probs.get)

    # Evaluate the performance of the classifier on the testing data
    correct = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for i in range(len(X_test)):
        pred = predict(X_test.iloc[i])
        if pred == 2:
          # off = detect_offensive(X_test.iloc[i])
          # if off == True:
          #   if y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          #     correct += 1
          # elif y_test.iloc[i] == 2:
          #   correct += 1
          if y_test.iloc[i] == 2:
            tn += 1
            correct += 1
          else:
            fn += 1
        elif y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          tp += 1
          correct += 1
        else:
          fp += 1
    accuracy = correct / len(y_test)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print("Accuracy:", accuracy)
    print("Percision:", precision)
    print("Recall:", recall)
    return accuracy, precision, recall


avg_accuracy = 0
avg_precision = 0
avg_recall = 0
for i in range(10):
    accuracy, precision, recall = tests(42 + 2 * i)
    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall

avg_accuracy /= 10
avg_precision /= 10
avg_recall /= 10

print("average accuracy:", avg_accuracy)
print("average precision: ", avg_precision)
print("average recall: ", avg_recall)
f1_score = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)
print(f1_score)

Accuracy: 0.9072599531615925
Percision: 0.926815947569634
Recall: 0.9636570130607609
Accuracy: 0.9063231850117096
Percision: 0.9205225911812738
Recall: 0.969054441260745
Accuracy: 0.9152224824355972
Percision: 0.936542669584245
Recall: 0.9634214969048959
Accuracy: 0.9152224824355972
Percision: 0.9360306178239475
Recall: 0.963963963963964
Accuracy: 0.9110070257611241
Percision: 0.931980252331322
Recall: 0.9626062322946176
Accuracy: 0.9142857142857143
Percision: 0.9304582210242588
Recall: 0.9696629213483146
Accuracy: 0.907728337236534
Percision: 0.9295929592959296
Recall: 0.9607731665719159
Accuracy: 0.9217798594847775
Percision: 0.9421713038734315
Recall: 0.9658836689038032
Accuracy: 0.9208430913348946
Percision: 0.9366197183098591
Recall: 0.9708029197080292
Accuracy: 0.907728337236534
Percision: 0.9282218597063622
Recall: 0.9633182844243793
average accuracy: 0.9127400468384076
average precision:  0.9318956140700264
average recall:  0.9653144108441424
0.9483106813173678


In [13]:
f1_score = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)
print(f1_score)

9.483106813173677


In [17]:
def tests_another(rs):
    X_train, X_test, y_train, y_test = train_test_split(df2['tweet'], df2['class'], test_size=0.2, random_state=rs)

    # Define the vocabulary
    vocab = set()
    for sentence in X_train:
        words = sentence.split()
        for word in words:
            vocab.add(word)

    class_doc_counts = {}
    for c in np.unique(y_train):
        class_doc_counts[c] = {}
        for word in vocab:
            class_doc_counts[c][word] = 0

    # Count the number of occurrences of each word in each class
    for i in range(len(X_train)):
        words = X_train.iloc[i].split()
        c = y_train.iloc[i]
        for word in np.unique(words):
            class_doc_counts[c][word] += 1

    # Prior probability of a class
    class_priors = {}
    for c in np.unique(y_train):
        class_priors[c] = (len(y_train[y_train == c]) + 1) / (len(y_train) + len(np.unique(y_train)))

    # Compute the total count of words in each class
    class_docs_totals = {}
    for c in np.unique(y_train):
        class_docs_totals[c] = sum(class_doc_counts[c].values())

    # Define a function to predict the class of a new text sample
    def predict(text):
        words = preprocess_sentence(text).split()
        probs = {}
        for c in np.unique(y_train):
            log_prob = np.log(class_priors[c])
            # log_prob = 1;
            for word in words:
              count = 1  # Laplace smoothing
              if word in vocab:
                  count += class_doc_counts[c][word]
              log_prob += np.log(count / (class_docs_totals[c] + len(vocab)))
            probs[c] = log_prob
        return max(probs, key=probs.get)

    # Evaluate the performance of the classifier on the testing data
    correct = 0
    tp = 0
    fp = 0
    tn = 0
    fn = 0
    for i in range(len(X_test)):
        pred = predict(X_test.iloc[i])
        if pred == 2:
          # off = detect_offensive(X_test.iloc[i])
          # if off == True:
          #   if y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          #     correct += 1
          # elif y_test.iloc[i] == 2:
          #   correct += 1
          if y_test.iloc[i] == 2:
            tn += 1
            correct += 1
          else:
            fn += 1
        elif y_test.iloc[i] == 0 or y_test.iloc[i] == 1:
          tp += 1
          correct += 1
        else:
          fp += 1
    accuracy = correct / len(y_test)
    precision = tp / (tp + fp)
    recall = tp / (tp + fn)
    print("Accuracy:", accuracy)
    print("Percision:", precision)
    print("Recall:", recall)
    return accuracy, precision, recall


avg_accuracy = 0
avg_precision = 0
avg_recall = 0
for i in range(10):
    accuracy, precision, recall = tests_another(42 + 2 * i)
    avg_accuracy += accuracy
    avg_precision += precision
    avg_recall += recall

avg_accuracy /= 10
avg_precision /= 10
avg_recall /= 10

print("average accuracy:", avg_accuracy)
print("average precision: ", avg_precision)
print("average recall: ", avg_recall)
f1_score = (2 * avg_precision * avg_recall) / (avg_precision + avg_recall)
print(f1_score)

Accuracy: 0.9039812646370023
Percision: 0.9147121535181236
Recall: 0.9744463373083475
Accuracy: 0.8899297423887588
Percision: 0.8986272439281943
Recall: 0.9753581661891118
Accuracy: 0.9072599531615925
Percision: 0.9197235513024987
Recall: 0.9735509285312324
Accuracy: 0.9096018735362997
Percision: 0.924396782841823
Recall: 0.9707207207207207
Accuracy: 0.9053864168618267
Percision: 0.9137109581789307
Recall: 0.9779036827195468
Accuracy: 0.9081967213114754
Percision: 0.916403785488959
Recall: 0.9792134831460674
Accuracy: 0.9021077283372365
Percision: 0.9148822269807281
Recall: 0.9715747583854463
Accuracy: 0.9199063231850118
Percision: 0.9298245614035088
Recall: 0.9781879194630873
Accuracy: 0.9175644028103045
Percision: 0.9239302694136292
Recall: 0.9820325659741718
Accuracy: 0.907728337236534
Percision: 0.9168872419269455
Recall: 0.9774266365688488
average accuracy: 0.9071662763466042
average precision:  0.9173098774983341
average recall:  0.9760415199006582
0.9457647727551592
