In [1]:
import numpy as np
import pandas as pd

In [2]:
df= pd.read_csv('AI Generated Essays Dataset.csv')

In [3]:
df.isnull().sum()

text         0
generated    0
dtype: int64

In [4]:
df.info

<bound method DataFrame.info of                                                    text  generated
0     Machine learning, a subset of artificial intel...          1
1     A decision tree, a prominent machine learning ...          1
2     Education, a cornerstone of societal progress,...          1
3     Computers, the backbone of modern technology, ...          1
4     Chess, a timeless game of strategy and intelle...          1
...                                                 ...        ...
1455  There has been a fuss about the Elector Colleg...          0
1456  Limiting car usage has many advantages. Such a...          0
1457  There's a new trend that has been developing f...          0
1458  As we all know cars are a big part of our soci...          0
1459  Cars have been around since the 1800's and hav...          0

[1460 rows x 2 columns]>

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score


In [6]:
x = df['text']
y = df['generated']

In [7]:
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [8]:
tfidf = TfidfVectorizer(stop_words='english', max_features=5000)
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.transform(X_test)

In [9]:
lr_model = LogisticRegression(max_iter=1000)
lr_model.fit(X_train_tfidf, y_train)


In [10]:
y_pred = lr_model.predict(X_test_tfidf)

In [11]:
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred, target_names=['Human', 'AI'])
conf_matrix = confusion_matrix(y_test, y_pred)

In [12]:
# Let's create a simple function to predict if a given essay/sentence is Human or AI
def predict_text_origin(text):
    text_tfidf = tfidf.transform([text])
    pred = lr_model.predict(text_tfidf)[0]
    label = "AI-generated" if pred == 1 else "Human-written"
    return label

# Example prediction
example_texts = [
    "The quick brown fox jumps over the lazy dog.",
    "Artificial intelligence is revolutionizing the way we write essays."
]

example_predictions = {txt: predict_text_origin(txt) for txt in example_texts}
example_predictions


{'The quick brown fox jumps over the lazy dog.': 'Human-written',
 'Artificial intelligence is revolutionizing the way we write essays.': 'Human-written'}

In [16]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

max_words = 600  # vocabulary size
max_len = 50     # sequence length

tokenizer = Tokenizer(num_words=max_words, oov_token="<OOV>")
tokenizer.fit_on_texts(X_train)
x_train = tokenizer.texts_to_sequences(X_train)
x_test = tokenizer.texts_to_sequences(X_test)

x_train = pad_sequences(x_train, maxlen=max_len, padding='post')
x_test = pad_sequences(x_test, maxlen=max_len, padding='post')


In [17]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense

model = Sequential()
model.add(Embedding(input_dim=max_words, output_dim=100, input_length=max_len))
model.add(LSTM(128, dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])




In [18]:
model.fit(x_train, y_train, validation_data=(x_test, y_test),
          epochs=32, batch_size=64)


Epoch 1/32
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 199ms/step - accuracy: 0.8654 - loss: 0.5263 - val_accuracy: 0.9315 - val_loss: 0.2488
Epoch 2/32
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 149ms/step - accuracy: 0.9558 - loss: 0.1795 - val_accuracy: 0.9418 - val_loss: 0.2235
Epoch 3/32
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 161ms/step - accuracy: 0.9668 - loss: 0.1481 - val_accuracy: 0.9452 - val_loss: 0.1975
Epoch 4/32
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 203ms/step - accuracy: 0.9576 - loss: 0.1506 - val_accuracy: 0.9726 - val_loss: 0.0872
Epoch 5/32
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 168ms/step - accuracy: 0.9837 - loss: 0.0474 - val_accuracy: 0.9795 - val_loss: 0.0910
Epoch 6/32
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 164ms/step - accuracy: 0.9890 - loss: 0.0364 - val_accuracy: 0.9795 - val_loss: 0.0843
Epoch 7/32
[1m19/19[0m [3

<keras.src.callbacks.history.History at 0x1ffeebdf910>

In [19]:
# Basic evaluation (loss, accuracy)
test_loss, test_acc = model.evaluate(x_test, y_test, verbose=0)
print(f"Test Loss: {test_loss:.4f}")
print(f"Test Accuracy: {test_acc:.4f}")


Test Loss: 0.0433
Test Accuracy: 0.9932


In [None]:
y_prob = model.predict(x_test, batch_size=128).ravel()
threshold = 0.5
y_pred = (y_prob >= threshold).astype(int)

[1m3/3[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 313ms/step


In [None]:
from sklearn.metrics import (
    classification_report,
    confusion_matrix,
    accuracy_score,
    precision_recall_fscore_support,
    roc_auc_score,
    roc_curve,
    precision_recall_curve
)

acc = accuracy_score(y_test, y_pred)
print(f"Accuracy (threshold={threshold}): {acc:.4f}")

print(classification_report(
    y_test,
    y_pred,
    target_names=["Human", "AI"]
))

cm = confusion_matrix(y_test, y_pred)
print("Confusion Matrix:")
print(cm)


Accuracy (threshold=0.5): 0.9932
              precision    recall  f1-score   support

       Human       0.99      1.00      1.00       272
          AI       1.00      0.90      0.95        20

    accuracy                           0.99       292
   macro avg       1.00      0.95      0.97       292
weighted avg       0.99      0.99      0.99       292

Confusion Matrix:
[[272   0]
 [  2  18]]
