In [1]:
# Copy the Kaggle API token into ~/.kaggle, the directory the kaggle CLI reads it from.
!mkdir -p ~/.kaggle
!cp kaggle.json ~/.kaggle/
# Restrict the token file to owner read/write only.
!chmod 600 ~/.kaggle/kaggle.json

In [2]:
# Download the dataset archive with the Kaggle CLI.
!kaggle datasets download -d shanegerami/ai-vs-human-text

Downloading ai-vs-human-text.zip to /content
100% 349M/350M [00:06<00:00, 91.4MB/s]
100% 350M/350M [00:06<00:00, 60.7MB/s]


In [3]:
from zipfile import ZipFile
dataset = '/content/ai-vs-human-text.zip'

# Unpack the downloaded archive into the current working directory.
# The handle is named `archive` (not `zip`) so the built-in zip() is not
# shadowed for the rest of the kernel session.
with ZipFile(dataset,'r') as archive:
  archive.extractall()
  print('Extracted')

Extracted


Reading the CSV file

In [4]:
import pandas as pd

In [61]:
import numpy as np

In [5]:
# Load the extracted CSV into a DataFrame.
table=pd.read_csv('AI_Human.csv')

In [6]:
# Preview the first rows of the raw data.
table.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0.0
1,Transportation is a large necessity in most co...,0.0
2,"""America's love affair with it's vehicles seem...",0.0
3,How often do you ride in a car? Do you drive a...,0.0
4,Cars are a wonderful thing. They are perhaps o...,0.0


In [7]:
# Cast the 0.0/1.0 float labels to integers with a vectorized dtype conversion
# rather than calling int() once per row.
table['generated']=table['generated'].astype(int)

In [9]:
# Confirm the label column now displays as integers.
table.head()

Unnamed: 0,text,generated
0,Cars. Cars have been around since they became ...,0
1,Transportation is a large necessity in most co...,0
2,"""America's love affair with it's vehicles seem...",0
3,How often do you ride in a car? Do you drive a...,0
4,Cars are a wonderful thing. They are perhaps o...,0


In [10]:
# Number of rows in the full dataset.
len(table)

487235

In [14]:
# Length of the longest document, counted in whitespace-separated words.
max(len(doc.split()) for doc in table['text'])

1656

In [15]:
import string
import nltk
from nltk.stem import PorterStemmer
# Fetch the NLTK 'punkt' tokenizer data.
# NOTE(review): the visible cells split text with str.split() and never call an
# NLTK tokenizer, so this download may be unnecessary — confirm before removing.
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [16]:
# Shared Porter stemmer instance used by stemmer().
ps=PorterStemmer()

In [17]:
def punc_rem(text):
  """Return `text` with every character listed in `string.punctuation` removed.

  Only the ASCII punctuation in `string.punctuation` is stripped; any other
  character (letters, digits, whitespace, non-ASCII symbols) is kept as-is.
  """
  # A deletion-only translation table removes all punctuation in one pass.
  strip_table=str.maketrans('','',string.punctuation)
  return text.translate(strip_table)
def stemmer(text):
  """Stem each whitespace-separated word of `text` with the shared stemmer `ps`.

  Returns the stemmed words joined by single spaces, so runs of whitespace in
  the input collapse to one space.
  """
  return " ".join(ps.stem(token) for token in text.split())

Lowercasing, removing punctuation marks, and stemming all the text in the table

In [18]:
# Normalise every document in a single pass: lowercase -> strip punctuation -> stem.
table['text']=table['text'].apply(lambda doc: stemmer(punc_rem(doc.lower())))

In [19]:
# Inspect the text after lowercasing, punctuation removal and stemming.
table.head()

Unnamed: 0,text,generated
0,car car have been around sinc they becam famou...,0
1,transport is a larg necess in most countri wor...,0
2,america love affair with it vehicl seem to be ...,0
3,how often do you ride in a car do you drive a ...,0
4,car are a wonder thing they are perhap one of ...,0


Keeping only the first 60,000 rows of the table to avoid running out of RAM

In [20]:
# Keep only the first 60000 rows.
# NOTE(review): head() takes rows in file order rather than a random sample —
# confirm the file order does not bias the subset.
table=table.head(60000)

In [21]:
# Preview the retained subset.
table.head()

Unnamed: 0,text,generated
0,car car have been around sinc they becam famou...,0
1,transport is a larg necess in most countri wor...,0
2,america love affair with it vehicl seem to be ...,0
3,how often do you ride in a car do you drive a ...,0
4,car are a wonder thing they are perhap one of ...,0


In [41]:
# Class balance of the retained rows (count per label value).
table['generated'].value_counts()

1    32949
0    27051
Name: generated, dtype: int64

Now tokenizing all the words in the text column and converting each document into a sequence of integer token ids

In [22]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

In [23]:
# Default Tokenizer: no vocabulary cap and no oov_token, so words that were not
# seen during fitting are dropped by texts_to_sequences().
tokenizer=Tokenizer()

In [24]:
# Build the word index from the normalised text.
# NOTE(review): the tokenizer is fitted on all retained rows before the
# train/test split, so words that only occur in test rows are in the vocabulary.
tokenizer.fit_on_texts(table['text'])

In [25]:
# Number of distinct tokens in the fitted word index.
len(tokenizer.word_index)

78251

In [26]:
# Convert each document into its list of integer token ids.
sequences=tokenizer.texts_to_sequences(table['text'])

In [27]:
# Length of the longest token-id sequence.
max(map(len,sequences))

1656

In [28]:
# Right-pad every sequence with zeros up to the length of the longest sequence.
padded=pad_sequences(sequences,padding='post',maxlen=max(map(len,sequences)))

In [29]:
# (number of documents, padded sequence length)
padded.shape

(60000, 1656)

In [30]:
# Inspect the padded id matrix; the trailing zeros are padding.
padded

array([[  20,   20,   15, ...,    0,    0,    0],
       [ 110,    8,    5, ...,    0,    0,    0],
       [ 721,  485, 3293, ...,    0,    0,    0],
       ...,
       [ 760, 1954, 1038, ...,    0,    0,    0],
       [ 760, 1250,   28, ...,    0,    0,    0],
       [ 760, 1038,  662, ...,    0,    0,    0]], dtype=int32)

Separating the data column and the target column

In [31]:
# Features: the padded token-id matrix. Target: the integer `generated` column.
X=padded
y=table['generated']

Splitting both into training and testing data in the ratio of 75:25

In [32]:
from sklearn.model_selection import train_test_split

In [33]:
# Hold out 25% of the rows for testing; random_state makes the split repeatable.
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=21)

In [34]:
# Shapes of the training and test feature matrices.
X_train.shape,X_test.shape

((45000, 1656), (15000, 1656))

Building the Model

In [35]:
from tensorflow.keras import models,layers

In [36]:
# Start with an empty Sequential model.
model=models.Sequential()

In [37]:
# 100-dimensional embeddings. The vocabulary size is +1 because id 0 is the
# padding value; mask_zero=True lets downstream layers ignore padded positions.
model.add(layers.Embedding(len(tokenizer.word_index)+1,100,mask_zero=True))
# Single LSTM layer with 150 units and dropout=0.2 on its inputs.
model.add(layers.LSTM(150,dropout=0.2))
model.add(layers.Dropout(0.4))
# One sigmoid unit: the output is compared against the 0/1 `generated` label.
model.add(layers.Dense(1,activation='sigmoid'))

# Binary cross-entropy matches the single sigmoid output; accuracy is tracked as the metric.
model.compile(optimizer='adam',loss='binary_crossentropy',metrics=['accuracy'])

Training the model on the training data for 2 epochs with a validation split of 20%

In [38]:
# validation_split=0.2 sets aside 20% of the training rows for validation.
model.fit(X_train,y_train,epochs=2,validation_split=0.2)

Epoch 1/2
Epoch 2/2


<keras.src.callbacks.History at 0x7b50807157b0>

After training, we evaluate the model on the test data

In [39]:
# Returns [loss, accuracy] on the held-out test split.
model.evaluate(X_test,y_test)



[0.060472894459962845, 0.9798666834831238]

Train Data Accuracy : 97.35% \
Validation Data Accuracy : 97.96% \
Test Data Accuracy : 97.98%

***

In [60]:
# Predict on every row of X, which contains both the training and the test rows.
pred=model.predict(X)



In [62]:
# Turn the sigmoid outputs into hard 0/1 labels using the same >=0.5 rule as the
# single-sentence checks later in the notebook. np.round rounds halves to even
# (0.5 -> 0.0), which disagrees with that rule exactly at the boundary, and the
# per-row Python loop is unnecessary. Shape and dtype of `pred` are preserved.
pred=(pred>=0.5).astype(pred.dtype)

In [63]:
from sklearn.metrics import accuracy_score

In [64]:
# Accuracy over all rows of the table (training and test rows combined),
# so this is not a held-out estimate.
accuracy_score(table['generated'],pred)

0.9847833333333333

Across all 60,000 rows (training and test rows combined), the model's accuracy is 98.48%

***

Exporting the model

In [40]:
import pickle
# NOTE(review): Keras documents model.save(...) as its serialization API; pickle
# is kept here so the output file name/format is unchanged — confirm what
# downstream loaders expect before switching.
# NOTE(review): the fitted `tokenizer` is required to preprocess new text at
# inference time but is not exported by this cell.
# The context manager closes the file handle even if pickling raises.
with open("AI_vs_Human.pkl",'wb') as model_file:
  pickle.dump(model,model_file)

***

Testing the model with a few sentences, some written by me and some generated by ChatGPT

In [48]:
text="Innovation in artificial intelligence is driving the future of technology"
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.57946676]]
AI Generated Text


In [47]:
text="The ethical implications of AI development must be carefully considered to ensure a fair and just society."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.99829555]]
AI Generated Text


In [50]:
text="Hey! I made this project a week ago i wanted to share this with you. Can you please check it out?"
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.02038683]]
Human Generated Text


In [51]:
text="Deep learning models have achieved human-level performance in tasks such as image recognition and language translation."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.99741626]]
AI Generated Text


In [52]:
text="I am thinking of going to New York for studying masters is this the right descision?"
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.01168367]]
Human Generated Text


In [53]:
text="As technology continues to advance at an exponential pace, the integration of artificial intelligence into various aspects of our lives becomes increasingly pervasive, influencing everything from the way we communicate and work to how we interact with the world around us."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.9897719]]
AI Generated Text


In [54]:
text="As the global community grapples with pressing challenges such as climate change, resource scarcity, and public health crises, AI offers unprecedented opportunities to leverage data-driven insights and innovative technologies to address these complex issues and drive positive societal change."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.9985767]]
AI Generated Text


In [55]:
text="From self-driving cars and virtual assistants to personalized recommendations and medical diagnostics, the potential applications of AI are vast and far-reaching, promising to transform industries, enhance productivity, and improve quality of life for people around the globe."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.9969254]]
AI Generated Text


In [59]:
text="As we navigate the complexities of a rapidly evolving technological landscape characterized by exponential growth in computational power, vast amounts of digital data, and the proliferation of AI-driven applications across diverse domains ranging from finance and healthcare to transportation and entertainment, it becomes increasingly evident that the future of humanity is deeply intertwined with the trajectory of artificial intelligence, necessitating a collaborative and multidisciplinary approach to address the myriad challenges and opportunities that lie ahead in this transformative journey toward an AI-enabled society."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.9987736]]
AI Generated Text


In [58]:
text="This summer, I had the privilege of interning with Southwest Airlines as an audio and visual intern. I worked at Southwest’s headquarters in Dallas, Texas, and was part of the A/V Team at Southwest Airlines University."
# Same preprocessing as training: lowercase -> strip punctuation -> stem.
text=stemmer(punc_rem(text.lower()))
seq=tokenizer.texts_to_sequences([text])
# Pad to the training matrix width instead of rescanning every training sequence for its max length.
pad=pad_sequences(seq,maxlen=padded.shape[1],padding='post')
pred=model.predict(pad)
print(pred)
# pred has shape (1, 1); index the scalar so the threshold test is unambiguous.
if pred[0,0]>=0.5:
  print('AI Generated Text')
else:
  print('Human Generated Text')

[[0.17208365]]
Human Generated Text
