In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
train = pd.read_csv('/content/Generated_text.csv')

In [5]:
train1 = train[train.RDizzl3_seven == False].reset_index(drop=True)
train1=train[train["label"]==1].sample(8000)
train = train[train.RDizzl3_seven == True].reset_index(drop=True)

In [6]:
train=pd.concat([train,train1])

In [None]:
from collections import Counter
def build_vocabulary(data, min_occurrences=5):
    words = [word for essay in data['text'] for word in essay.split()]
    word_counts = Counter(words)

    vocabulary = [word for word, count in word_counts.items() if count <= min_occurrences]

    reverse_index = {word: index for index, word in enumerate(vocabulary)}

    return vocabulary, reverse_index

vocabulary, reverse_index = build_vocabulary(train)

In [7]:
from sklearn.model_selection import train_test_split
train_data, dev_data = train_test_split(train, test_size=0.2, random_state=42)

In [9]:
documents = train_data['text']
labels = train_data['label']

In [10]:
import re
from collections import defaultdict
from math import log, exp

class NaiveBayesClassifier:
    def __init__(self):
        self.class_probs = defaultdict(float)
        self.word_probs = defaultdict(lambda: defaultdict(float))
        self.classes = set()

    def train(self, documents, labels, alpha):
        # Count occurrences of each class
        class_counts = defaultdict(int)
        for label in labels:
            class_counts[label] += 1
            self.classes.add(label)

        total_documents = len(labels)

        # Calculate class probabilities
        for label, count in class_counts.items():
            self.class_probs[label] = count / total_documents

        # Count word occurrences in each class with Laplace smoothing
        word_counts = defaultdict(lambda: defaultdict(int))

        for doc, label in zip(documents, labels):
            words = re.findall(r'\b\w+\b', doc.lower())
            for word in words:
                word_counts[label][word] += 1

        # Calculate word probabilities for each class with Laplace smoothing
        for label in self.classes:
            total_words_in_class = sum(word_counts[label].values())
            total_unique_words = len(set(word_counts[label].keys()))

            for word, count in word_counts[label].items():
                self.word_probs[label][word] = (count + alpha) / (total_words_in_class + alpha * total_unique_words)

    def get_top_words_per_class(self, top_n=10):
        top_words_per_class = defaultdict(list)

        for label in self.classes:
            word_probabilities = self.word_probs[label]
            top_words = sorted(word_probabilities, key=word_probabilities.get, reverse=True)[:top_n]
            top_words_per_class[label] = top_words

        return top_words_per_class

    def predict(self, documents):
        # Assuming 'documents' is a list of documents
        probabilities_list = []

        for document in documents:
            words = re.findall(r'\b\w+\b', str(document).lower())
            scores = defaultdict(float)

            for label in self.classes:
                scores[label] = log(self.class_probs[label])

                for word in words:
                    scores[label] += log(self.word_probs[label].get(word, 1e-10))

            exp_scores = {label: exp(score) for label, score in scores.items()}
            sum_exp_scores = sum(exp_scores.values())

            if sum_exp_scores == 0:
                probabilities = {label: 1 / len(self.classes) for label in self.classes}
            else:
                probabilities = {label: exp_score / sum_exp_scores for label, exp_score in exp_scores.items()}

            probabilities_list.append(probabilities)

        return probabilities_list




# Create and train the classifier
alpha = [0.0, 0.1, 0.5, 1.0, 5.0]


dev_data_documents = dev_data['text']
dev_data_labels = dev_data['label']
best_accuracy = 0
best_alpha = 0
for alpha_val in alpha:
    result = []
    classifier = NaiveBayesClassifier()
    classifier.train(documents, labels, alpha_val)
    train_prob_list = classifier.predict(dev_data_documents)
    for i in range(len(train_prob_list)):
        if(train_prob_list[i][0] >= train_prob_list[i][1]):
            result.append(0)
        else:
            result.append(1)
    count = 0
    for i in range(len(result)):
        if(result[i] == dev_data_labels.iloc[i]):
            count = count + 1

    accuracy = count / len(result)

    if(accuracy > best_accuracy):
        best_accuracy = accuracy
        best_alpha = alpha_val

    print("For alpha value:", alpha_val, "Accuracy:", accuracy)

print("For Best Alpha value:", best_alpha, "Accuracy:", best_accuracy)

best_classifier = NaiveBayesClassifier()
best_classifier.train(documents, labels, best_alpha)
top_words = best_classifier.get_top_words_per_class()
for label, words in top_words.items():
    print(f"Top words for class '{label}': {', '.join(words)}")

For alpha value: 0.0 Accuracy: 0.5015817223198594
For alpha value: 0.1 Accuracy: 0.5015817223198594
For alpha value: 0.5 Accuracy: 0.5015817223198594
For alpha value: 1.0 Accuracy: 0.5015817223198594
For alpha value: 5.0 Accuracy: 0.5015817223198594
For Best Alpha value: 0.0 Accuracy: 0.5015817223198594
Top words for class '0': the, to, a, of, and, in, is, that, it, be
Top words for class '1': the, to, and, of, a, in, that, is, can, it


In [11]:
dev_data_documents = dev_data['text']
dev_data_labels = dev_data['label']
train_prob_list = best_classifier.predict(dev_data_documents)


In [12]:
len(dev_data_documents)

5690

In [13]:
len(train_prob_list)

5690

In [15]:
test_data = pd.read_csv('/content/test_essays.csv')
test_data

Unnamed: 0,id,prompt_id,text
0,0000aaaa,2,Aaa bbb ccc.
1,1111bbbb,3,Bbb ccc ddd.
2,2222cccc,4,CCC ddd eee.


In [16]:
test_documents = test_data['text']
probabilities_list = classifier.predict(test_documents)

print(probabilities_list)

[{0: 3.1793208630032804e-05, 1: 0.9999682067913701}, {0: 0.5011862917398942, 1: 0.4988137082601059}, {0: 0.5011862917398942, 1: 0.4988137082601059}]


In [17]:
proba = []
for i in probabilities_list:
    proba.append(i[1])

predicted = np.array(proba)

In [18]:
output= pd.DataFrame({'id':test_data["id"],'generated':predicted})
output.to_csv('submission.csv', index=False)