In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
# Walk the Kaggle input directory tree and print the full path of every file found
for folder, _subfolders, names in os.walk('/kaggle/input'):
    for name in names:
        print(os.path.join(folder, name))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

In [None]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
import pandas as pd
import numpy as np
import re

In [None]:

# Load the dataset and add a readable "labels" column derived from the numeric "class" column
data = pd.read_csv('/kaggle/input/content-moderation-dataset/testing.csv').assign(
    labels=lambda frame: frame['class'].map({0:"Adult content",1:"Offensive Language",2:"Normal Language"})
)

In [None]:
# Clean the text in the "tweet" column
# str(x) keeps a non-string cell (e.g. a float NaN) from raising AttributeError on .lower()
data['tweet'] = data['tweet'].apply(lambda x: str(x).lower())  # Convert to lowercase
# Remove whole URL / mention / HTML-entity / hashtag tokens first. If the catch-all class
# [^A-Za-z0-9]+ lives in the same pattern, it matches the space AND the following '@', '#'
# or '&', so in "hi @user" the marker is eaten and the word "user" is left behind.
data['tweet'] = data['tweet'].apply(lambda x: re.sub(r'http\S+|www\S+|@\S+|&\S+|#\S+', ' ', x))
data['tweet'] = data['tweet'].apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x))  # Replace remaining special characters
data['tweet'] = data['tweet'].apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # Remove extra whitespace

In [None]:

# Show the cleaned text next to its label
data.loc[:, ['tweet', 'labels']]

### Apply Decision Tree Model

In [None]:
import pandas as pd
import numpy as np
import re

# Read the CSV file
data = pd.read_csv('/kaggle/input/content-moderation-dataset/testing.csv')
data['labels'] = data['class'].map({0: "Adult content", 1: "Offensive Language", 2: "Normal Language"})

# Drop rows with missing values BEFORE cleaning: str(NaN) is the literal text "nan",
# so a dropna() placed after the str() conversion can no longer see a missing tweet.
data = data.dropna()

# Clean the text in the "tweet" column
tweets = data['tweet'].apply(lambda x: str(x).lower())  # Convert to lowercase
# Remove whole URL / mention / HTML-entity / hashtag tokens first. If the catch-all class
# [^A-Za-z0-9]+ lives in the same pattern, it matches the space AND the following '@', '#'
# or '&', so in "hi @user" the marker is eaten and the word "user" is left behind.
tweets = tweets.apply(lambda x: re.sub(r'http\S+|www\S+|@\S+|&\S+|#\S+', ' ', x))
tweets = tweets.apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x))  # Replace remaining special characters
tweets = tweets.apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # Remove extra whitespace

# assign() builds a new frame instead of writing into the frame returned by dropna()
data = data.assign(tweet=tweets)

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split repeatable
train_data, test_data = train_test_split(data, random_state=42, test_size=0.33)

# Pull the text and label arrays out of each partition
train_sentences, train_labels = train_data["tweet"].values, train_data["labels"].values
test_sentences, test_labels = test_data["tweet"].values, test_data["labels"].values

# Bag-of-words encoder: the vocabulary is learned from the training text only,
# then reused unchanged to encode the test text
word_counter = CountVectorizer()
train_word_counts, test_word_counts = (
    word_counter.fit_transform(train_sentences),
    word_counter.transform(test_sentences),
)

# Define a function to calculate Gini impurity
def calculate_gini(labels):
    """Return the Gini impurity of a collection of labels.

    The impurity is 1 minus the sum of squared class frequencies, so it is 0
    when every label is the same and grows as the classes become more mixed.

    Args:
        labels: array-like of class labels.

    Returns:
        The impurity as a NumPy float.
    """
    _, class_counts = np.unique(labels, return_counts=True)
    class_fractions = class_counts / len(labels)
    return 1 - np.square(class_fractions).sum()

# Define a function to find the best split
def find_best_split(data, labels):
    """Search every feature and threshold for the split with the lowest weighted Gini.

    Each distinct value of each column is tried as a threshold. Samples with
    ``value <= threshold`` go left and samples with ``value > threshold`` go
    right; candidates that leave one side empty are ignored.

    Args:
        data: 2-D NumPy array, one row per sample and one column per feature.
        labels: 1-D NumPy array of labels aligned with the rows of ``data``.

    Returns:
        ``(best_split, best_gini)`` where ``best_split`` is a
        ``(feature_idx, threshold)`` tuple, or ``None`` (with ``best_gini``
        left at 1.0) when no candidate separates the samples.
    """
    num_samples, num_features = data.shape
    best_split, best_gini = None, 1.0

    for feature_idx in range(num_features):
        column = data[:, feature_idx]

        for threshold in np.unique(column):
            left_labels = labels[column <= threshold]
            right_labels = labels[column > threshold]

            # Only score candidates that put at least one sample on each side
            if len(left_labels) != 0 and len(right_labels) != 0:
                left_weight = len(left_labels) / num_samples
                right_weight = len(right_labels) / num_samples
                weighted_gini = left_weight * calculate_gini(left_labels) + right_weight * calculate_gini(right_labels)

                # Strict comparison: the first candidate reaching a given score wins
                if weighted_gini < best_gini:
                    best_split, best_gini = (feature_idx, threshold), weighted_gini

    return best_split, best_gini

# Define a class for a Decision Tree node
class DecisionTreeNode:
    """One node of a binary classification tree grown by greedy Gini splitting.

    A node is a leaf while ``feature_idx`` is None; it then predicts ``value``,
    the most frequent label among the samples it was fitted on. An internal
    node sends a sample with ``sample[feature_idx] <= threshold`` to ``left``
    and every other sample to ``right``.
    """

    def __init__(self, depth=0, max_depth=None):
        """Create an unfitted node.

        Args:
            depth: depth of this node in the tree (the root is 0).
            max_depth: depth at which growth stops; None means no depth limit.
        """
        self.depth = depth
        self.max_depth = max_depth
        self.feature_idx = None  # column tested by this node; None marks a leaf
        self.threshold = None    # samples with value <= threshold go left
        self.left = None
        self.right = None
        self.value = None        # majority label of the samples seen by this node

    def fit(self, data, labels):
        """Grow the subtree rooted at this node.

        Args:
            data: 2-D NumPy array, one row per sample and one column per feature.
            labels: 1-D NumPy array of labels aligned with the rows of ``data``.
        """
        unique_labels, counts = np.unique(labels, return_counts=True)
        # Every node stores its majority label, so predict() has an answer wherever it stops
        self.value = unique_labels[np.argmax(counts)]

        # Stop at the depth limit or when the node is already pure
        if self.depth == self.max_depth or len(unique_labels) == 1:
            return

        best_split, _ = find_best_split(data, labels)
        # Only give up when no candidate separates the samples. A weighted Gini of 0 is the
        # best possible outcome (both children pure) and must be applied, not discarded.
        if best_split is None:
            return

        feature_idx, threshold = best_split
        self.feature_idx = feature_idx
        self.threshold = threshold

        goes_left = data[:, feature_idx] <= threshold
        goes_right = data[:, feature_idx] > threshold

        self.left = DecisionTreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.left.fit(data[goes_left], labels[goes_left])

        self.right = DecisionTreeNode(depth=self.depth + 1, max_depth=self.max_depth)
        self.right.fit(data[goes_right], labels[goes_right])

    def predict(self, data):
        """Return the predicted label for a single sample.

        Args:
            data: 1-D sequence of feature values for one sample.

        Returns:
            The ``value`` of the leaf the sample is routed to.
        """
        if self.feature_idx is None:
            return self.value

        if data[self.feature_idx] <= self.threshold:
            return self.left.predict(data)
        else:
            return self.right.predict(data)

# Encode the text labels as integers for the hand-written tree
label_mapping = {"Adult content": 0, "Offensive Language": 1, "Normal Language": 2}
train_labels_numeric = np.array(list(map(label_mapping.__getitem__, train_labels)))
test_labels_numeric = np.array(list(map(label_mapping.__getitem__, test_labels)))

# Grow the tree on dense word-count rows (the node code indexes plain NumPy arrays)
max_depth = 5  # You can adjust the maximum depth as needed
decision_tree = DecisionTreeNode(max_depth=max_depth)
decision_tree.fit(train_word_counts.toarray(), train_labels_numeric)

# Predict one sample at a time, first for the training rows, then for the held-out rows
train_predictions = np.array(list(map(decision_tree.predict, train_word_counts.toarray())))
test_predictions = np.array(list(map(decision_tree.predict, test_word_counts.toarray())))

# Decode the integer labels back to their text names via a reverse lookup table
code_to_name = {code: name for name, code in label_mapping.items()}
train_labels_original = np.array([code_to_name[code] for code in train_labels_numeric])
test_labels_original = np.array([code_to_name[code] for code in test_labels_numeric])



In [None]:
# The tree predicts integer codes, so the metrics below compare against the integer-encoded labels
train_labels_to_use, test_labels_to_use = train_labels_numeric, test_labels_numeric


In [None]:
# No cell above this one imports the metric functions, so bring them in here;
# otherwise this cell fails with NameError on a fresh kernel.
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Calculate and display training accuracy, precision, recall, and F1-score
# (average='weighted' averages the per-class scores weighted by class frequency)
train_accuracy = accuracy_score(train_labels_to_use, train_predictions)
train_precision = precision_score(train_labels_to_use, train_predictions, average='weighted')
train_recall = recall_score(train_labels_to_use, train_predictions, average='weighted')
train_f1 = f1_score(train_labels_to_use, train_predictions, average='weighted')

print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")

# Calculate and display testing accuracy, precision, recall, and F1-score
test_accuracy = accuracy_score(test_labels_to_use, test_predictions)
test_precision = precision_score(test_labels_to_use, test_predictions, average='weighted')
test_recall = recall_score(test_labels_to_use, test_predictions, average='weighted')
test_f1 = f1_score(test_labels_to_use, test_predictions, average='weighted')

print("\nTesting Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")


In [None]:
# Classify one hand-written sentence with the decision tree trained above.
# (`cv` and `clf` are not defined in any cell shown; the fitted objects are
# `word_counter` and `decision_tree`.)
test="broke bitch cant tell me nothing"
df=word_counter.transform([test]).toarray()
# DecisionTreeNode.predict takes a single 1-D sample, so pass the only row of the 1 x vocabulary matrix
res=decision_tree.predict(df[0])
print(res)  # integer class code as defined in label_mapping


### Naive Bayes

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score
from sklearn.naive_bayes import MultinomialNB

# Read the CSV file
data = pd.read_csv('/kaggle/input/content-moderation-dataset/testing.csv')
data['labels'] = data['class'].map({0: "Adult content", 1: "Offensive Language", 2: "Normal Language"})

# Drop rows with missing values BEFORE cleaning: str(NaN) is the literal text "nan",
# so a dropna() placed after the str() conversion can no longer see a missing tweet.
data = data.dropna()

# Clean the text in the "tweet" column
tweets = data['tweet'].apply(lambda x: str(x).lower())  # Convert to lowercase
# Remove whole URL / mention / HTML-entity / hashtag tokens first. If the catch-all class
# [^A-Za-z0-9]+ lives in the same pattern, it matches the space AND the following '@', '#'
# or '&', so in "hi @user" the marker is eaten and the word "user" is left behind.
tweets = tweets.apply(lambda x: re.sub(r'http\S+|www\S+|@\S+|&\S+|#\S+', ' ', x))
tweets = tweets.apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x))  # Replace remaining special characters
tweets = tweets.apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # Remove extra whitespace

# assign() builds a new frame instead of writing into the frame returned by dropna()
data = data.assign(tweet=tweets)

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split repeatable
train_data, test_data = train_test_split(data, random_state=42, test_size=0.33)

# Pull the text and label arrays out of each partition
train_sentences, train_labels = train_data["tweet"].values, train_data["labels"].values
test_sentences, test_labels = test_data["tweet"].values, test_data["labels"].values

# Bag-of-words encoder: the vocabulary is learned from the training text only,
# then reused unchanged to encode the test text
word_counter = CountVectorizer()
train_word_counts, test_word_counts = (
    word_counter.fit_transform(train_sentences),
    word_counter.transform(test_sentences),
)

# Fit a multinomial Naive Bayes model on the word counts (fit() returns the estimator itself)
naive_bayes_classifier = MultinomialNB().fit(train_word_counts, train_labels)

# Predict labels for the training rows first, then for the held-out rows
train_predictions, test_predictions = (
    naive_bayes_classifier.predict(train_word_counts),
    naive_bayes_classifier.predict(test_word_counts),
)

# Score the training predictions; precision, recall and F1 are class-frequency-weighted averages
train_accuracy = accuracy_score(train_labels, train_predictions)
train_precision, train_recall, train_f1 = (
    metric(train_labels, train_predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("Training Metrics:")
print(f"Accuracy: {train_accuracy:.4f}")
print(f"Precision: {train_precision:.4f}")
print(f"Recall: {train_recall:.4f}")
print(f"F1 Score: {train_f1:.4f}")

# Score the held-out predictions the same way
test_accuracy = accuracy_score(test_labels, test_predictions)
test_precision, test_recall, test_f1 = (
    metric(test_labels, test_predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

print("\nTesting Metrics:")
print(f"Accuracy: {test_accuracy:.4f}")
print(f"Precision: {test_precision:.4f}")
print(f"Recall: {test_recall:.4f}")
print(f"F1 Score: {test_f1:.4f}")


In [None]:
# Classify one hand-written sentence with the Naive Bayes model trained above.
# (`cv` and `clf` are not defined in any cell shown; the fitted objects are
# `word_counter` and `naive_bayes_classifier`.)
test="shutup ayesha"
df=word_counter.transform([test]).toarray()
res=naive_bayes_classifier.predict(df)
print(res)


## EXTRA CODE

In [None]:
import pandas as pd
import numpy as np
import re
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the CSV file
data = pd.read_csv('/kaggle/input/content-moderation-dataset/testing.csv')
data['labels'] = data['class'].map({0: "Adult content", 1: "Offensive Language", 2: "Normal Language"})

# Drop rows with missing values BEFORE cleaning: str(NaN) is the literal text "nan",
# so a dropna() placed after the str() conversion can no longer see a missing tweet.
data = data.dropna()

# Clean the text in the "tweet" column
tweets = data['tweet'].apply(lambda x: str(x).lower())  # Convert to lowercase
# Remove whole URL / mention / HTML-entity / hashtag tokens first. If the catch-all class
# [^A-Za-z0-9]+ lives in the same pattern, it matches the space AND the following '@', '#'
# or '&', so in "hi @user" the marker is eaten and the word "user" is left behind.
tweets = tweets.apply(lambda x: re.sub(r'http\S+|www\S+|@\S+|&\S+|#\S+', ' ', x))
tweets = tweets.apply(lambda x: re.sub(r'[^A-Za-z0-9]+', ' ', x))  # Replace remaining special characters
tweets = tweets.apply(lambda x: re.sub(r'\s+', ' ', x).strip())  # Remove extra whitespace

# assign() builds a new frame instead of writing into the frame returned by dropna()
data = data.assign(tweet=tweets)
x = data['tweet']
y = data['labels']

# Hold out 33% of the rows for evaluation; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(x, y, random_state=42, test_size=0.33)

# Bag-of-words encoder: the vocabulary is learned from the training text only,
# then reused unchanged to encode the test text
word_counter = CountVectorizer()
train_word_counts, test_word_counts = (
    word_counter.fit_transform(X_train),
    word_counter.transform(X_test),
)

# Identify unique label categories and count the number of categories
label_categories = np.unique(y_train)
num_categories = len(label_categories)
num_sentences, num_words = train_word_counts.shape

# Calculate the chance of each label appearing (Prior probabilities)
label_chances = {}
for label in label_categories:
    label_chances[label] = np.sum(y_train == label) / len(y_train)

# Calculate the chance of each word appearing given a specific label (Conditional probabilities)
word_given_label_chances = {}
for label in label_categories:
    # Plain boolean array, so the row selection does not depend on the pandas index of y_train
    in_label = np.asarray(y_train == label)
    label_word_counts = train_word_counts[in_label]

    # One column-sum over the label's rows gives the count of every word at once,
    # instead of slicing the sparse matrix once per vocabulary entry
    word_totals = np.asarray(label_word_counts.sum(axis=0)).ravel()
    total_words_in_label = word_totals.sum()

    # Add-one (Laplace) smoothing so a word unseen for this label never gets probability zero
    word_given_label_chances[label] = {
        word: (word_totals[idx] + 1) / (total_words_in_label + num_words)
        for word, idx in word_counter.vocabulary_.items()
    }

# Predict a label for every training sentence: start from the log prior of each label,
# add the log likelihood of every known word, and keep the highest-scoring label
train_predictions = []
for sentence in X_train:
    tokens = sentence.split()
    log_scores = {}

    for label in label_categories:
        running_total = np.log(label_chances[label])
        for token in tokens:
            if token in word_counter.vocabulary_ and token in word_given_label_chances[label]:
                running_total += np.log(word_given_label_chances[label][token])
        log_scores[label] = running_total

    train_predictions.append(max(log_scores, key=log_scores.get))

# Score the training predictions; precision, recall and F1 are class-frequency-weighted averages
train_accuracy = accuracy_score(y_train, train_predictions)
train_precision, train_recall, train_f1_score = (
    metric(y_train, train_predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the training scores
print("Training Accuracy:", train_accuracy)
print("Training Precision:", train_precision)
print("Training Recall:", train_recall)
print("Training F1-Score:", train_f1_score)

# Predict a label for every held-out sentence: start from the log prior of each label,
# add the log likelihood of every known word, and keep the highest-scoring label
test_predictions = []
for sentence in X_test:
    tokens = sentence.split()
    log_scores = {}

    for label in label_categories:
        running_total = np.log(label_chances[label])
        for token in tokens:
            if token in word_counter.vocabulary_ and token in word_given_label_chances[label]:
                running_total += np.log(word_given_label_chances[label][token])
        log_scores[label] = running_total

    test_predictions.append(max(log_scores, key=log_scores.get))

# Score the held-out predictions; precision, recall and F1 are class-frequency-weighted averages
test_accuracy = accuracy_score(y_test, test_predictions)
test_precision, test_recall, test_f1_score = (
    metric(y_test, test_predictions, average='weighted')
    for metric in (precision_score, recall_score, f1_score)
)

# Report the testing scores
print("Testing Accuracy:", test_accuracy)
print("Testing Precision:", test_precision)
print("Testing Recall:", test_recall)
print("Testing F1-Score:", test_f1_score)



In [None]:
# Classify one hand-written sentence with the hand-built Naive Bayes tables from the cell above.
# (`cv` and `clf` are not defined in any cell shown, so the original lines raise NameError.)
test="shutup ayesha"
scores = {label: np.log(label_chances[label]) for label in label_categories}
for label in label_categories:
    for word in test.split():
        # Words absent from the training vocabulary carry no evidence and are skipped
        if word in word_given_label_chances[label]:
            scores[label] += np.log(word_given_label_chances[label][word])
res = max(scores, key=scores.get)
print(res)

In [None]:
# Function to preprocess a single input text
def preprocess_single_text(input_text):
    """Normalise one piece of text before it is tokenised.

    The text is lowercased, whole URL / mention / HTML-entity / hashtag tokens
    are removed, every remaining run of non-alphanumeric characters becomes a
    single space, and surrounding whitespace is stripped.

    Args:
        input_text: the raw text; any non-string value is converted with str().

    Returns:
        The cleaned string, containing only ``a-z``, ``0-9`` and single spaces.
    """
    text = str(input_text).lower()  # Convert to lowercase
    # Remove whole tokens first. If the catch-all class [^A-Za-z0-9]+ lives in the same
    # pattern, it matches the space AND the following '@', '#' or '&', so in "hi @user"
    # the marker is eaten and the word "user" is left behind.
    text = re.sub(r'http\S+|www\S+|@\S+|&\S+|#\S+', ' ', text)
    text = re.sub(r'[^A-Za-z0-9]+', ' ', text)  # Replace remaining special characters
    return re.sub(r'\s+', ' ', text).strip()  # Remove extra whitespace

# Function to predict the label for a single input text
def predict_single_text(input_text, label_categories, label_chances, word_given_label_chances, vectorizer):
    """Predict the most likely label for one raw text with the hand-built Naive Bayes tables.

    Args:
        input_text: raw text; it is cleaned with ``preprocess_single_text`` first.
        label_categories: iterable of the candidate labels.
        label_chances: mapping label -> prior probability.
        word_given_label_chances: mapping label -> {word: probability of the word given the label}.
        vectorizer: object exposing a ``vocabulary_`` mapping of known words.

    Returns:
        The label with the highest log score; on a tie, the label that comes
        first in ``label_categories``.
    """
    tokens = preprocess_single_text(input_text).split()

    log_scores = {}
    for label in label_categories:
        # Start from the log prior, then add the log likelihood of every known word
        running_total = np.log(label_chances[label])
        for token in tokens:
            if token in vectorizer.vocabulary_ and token in word_given_label_chances[label]:
                running_total += np.log(word_given_label_chances[label][token])
        log_scores[label] = running_total

    return max(log_scores, key=log_scores.get)

# Example usage: classify one raw tweet with the probability tables built earlier
input_text = "wth is that playing missy? ........ i mean seriously? RT @mr_republicann: This movie gone be trash http://t.co/8BIppVUvzr"
predicted_label = predict_single_text(
    input_text,
    label_categories=label_categories,
    label_chances=label_chances,
    word_given_label_chances=word_given_label_chances,
    vectorizer=word_counter,
)
print("Predicted Label:", predicted_label)


###### 