In [1]:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
import pandas as pd
import numpy as np

In [2]:
#seed torch's global RNG so weight initialisation, dropout and DataLoader shuffling
#start from the same random state on every Restart & Run All
SEED = 42
torch.manual_seed(SEED)

#check if cuda is available, otherwise fall back to the cpu
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")


Using device: cuda


In [3]:
#location of the processed dataset, relative to the notebook's working directory
data_path = "DataSets/Processed_Data_small.csv"

#read the whole CSV into a DataFrame
df = pd.read_csv(filepath_or_buffer=data_path)

In [4]:
#discard every row that has at least one missing value
df = df.dropna()

In [5]:
# separate the raw text, the target labels and the numerical feature columns
text_data = df['text']
labels = df['generated']  # 1 for AI-generated, 0 for human-generated
numerical_features = df[[
    'avg_len_sentences',
    'punctuations_count',
    'readability_score',
    'words_count',
    'stop_word_ratio',
]]


In [6]:
#scale the numerical features to be in the range of 0 to 1
scaler = MinMaxScaler()
numerical_features = scaler.fit_transform(numerical_features)

In [7]:
#split the data as 70% training data and 30% test data
X_train_text, X_test_text, X_train_num, X_test_num, y_train, y_test = train_test_split(
    text_data, numerical_features, labels, test_size=0.3, random_state=42, stratify=labels
)

In [8]:

from sklearn.feature_extraction.text import TfidfVectorizer

# Vectorize the text with TF-IDF over unigrams and bigrams, keeping at most 1000 features.
# The vocabulary is learned from the training texts only and then reused for the test texts.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), max_features=1000)

# densify the sparse matrices so they can be turned into torch tensors later
X_train_text_tfidf = tfidf_vectorizer.fit_transform(X_train_text).toarray()
X_test_text_tfidf = tfidf_vectorizer.transform(X_test_text).toarray()


In [9]:
#Dataset that stores the text features, numerical features and labels as torch tensors
class TextDataset(Dataset):
    """Row-aligned text features, numerical features and class labels.

    Args:
        text_features: 2-D array-like of text features, one row per sample;
            stored as a float32 tensor.
        numerical_features: 2-D array-like of numerical features, one row per
            sample; stored as a float32 tensor.
        labels: 1-D array-like of integer class labels (pandas Series, numpy
            array or plain sequence); stored as an int64 tensor.

    Raises:
        ValueError: if the three inputs do not have the same number of rows.
    """

    def __init__(self, text_features, numerical_features, labels):
        self.text_features = torch.tensor(text_features, dtype=torch.float32)
        self.numerical_features = torch.tensor(numerical_features, dtype=torch.float32)
        # np.asarray accepts a Series, ndarray or list alike; for a Series it yields the
        # values in positional order, so a non-default index does not matter
        self.labels = torch.tensor(np.asarray(labels), dtype=torch.long)

        # fail early on misaligned inputs instead of an IndexError in the middle of training
        n_rows = len(self.labels)
        if len(self.text_features) != n_rows or len(self.numerical_features) != n_rows:
            raise ValueError(
                "text_features, numerical_features and labels must have the same number of rows "
                f"(got {len(self.text_features)}, {len(self.numerical_features)} and {n_rows})"
            )

    def __len__(self):
        """Return the number of samples."""
        return len(self.labels)

    def __getitem__(self, idx):
        """Return the (text_features, numerical_features, label) triple at position ``idx``."""
        return self.text_features[idx], self.numerical_features[idx], self.labels[idx]


In [10]:
#wrap the training split in a dataset and a loader that reshuffles it every epoch
train_dataset = TextDataset(X_train_text_tfidf, X_train_num, y_train)
train_loader = DataLoader(train_dataset, shuffle=True, batch_size=64)

#wrap the test split the same way, but keep its order fixed
test_dataset = TextDataset(X_test_text_tfidf, X_test_num, y_test)
test_loader = DataLoader(test_dataset, shuffle=False, batch_size=64)

In [11]:
#Two-branch NN: one Linear+ReLU+Dropout branch per input type, then a two-layer head on the concatenated branches
class TextClassificationModel(nn.Module):
    """Feed-forward classifier that fuses text features with numerical features.

    Each input goes through its own Linear -> ReLU -> Dropout branch; the two
    branch outputs are concatenated along the feature axis and passed through a
    Linear -> ReLU -> Dropout -> Linear head that produces one raw score (logit)
    per class.

    Args:
        text_input_dim: width of the text feature vector.
        num_input_dim: width of the numerical feature vector.
        num_classes: number of output classes.
        text_hidden_dim: output width of the text branch.
        num_hidden_dim: output width of the numerical branch.
        combined_hidden_dim: hidden width of the head.
        dropout: dropout probability used in all three dropout layers.
    """

    def __init__(self, text_input_dim, num_input_dim, num_classes=2,
                 text_hidden_dim=512, num_hidden_dim=128, combined_hidden_dim=256,
                 dropout=0.3):
        super().__init__()
        self.text_fc = nn.Sequential(
            nn.Linear(text_input_dim, text_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        self.num_fc = nn.Sequential(
            nn.Linear(num_input_dim, num_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout)
        )
        # the head consumes both branch outputs side by side
        self.combined_fc = nn.Sequential(
            nn.Linear(text_hidden_dim + num_hidden_dim, combined_hidden_dim),
            nn.ReLU(),
            nn.Dropout(dropout),
            nn.Linear(combined_hidden_dim, num_classes)
        )

    def forward(self, text_input, num_input):
        """Return raw class scores of shape (batch, num_classes).

        Args:
            text_input: tensor of shape (batch, text_input_dim).
            num_input: tensor of shape (batch, num_input_dim).
        """
        text_output = self.text_fc(text_input)
        num_output = self.num_fc(num_input)
        combined = torch.cat((text_output, num_output), dim=1)
        output = self.combined_fc(combined)
        return output

In [12]:
#build the model from the widths of the two training feature matrices
model = TextClassificationModel(
    text_input_dim=X_train_text_tfidf.shape[1],
    num_input_dim=X_train_num.shape[1],
)
#move the model's parameters onto the selected device (GPU when available)
model.to(device)


TextClassificationModel(
  (text_fc): Sequential(
    (0): Linear(in_features=1000, out_features=512, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (num_fc): Sequential(
    (0): Linear(in_features=5, out_features=128, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
  )
  (combined_fc): Sequential(
    (0): Linear(in_features=640, out_features=256, bias=True)
    (1): ReLU()
    (2): Dropout(p=0.3, inplace=False)
    (3): Linear(in_features=256, out_features=2, bias=True)
  )
)

In [13]:
# set up the loss function and the optimizer used to train the neural network
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(params=model.parameters(), lr=1e-3)


In [14]:
#train the model by converting every feature to utilize GPU
from sklearn.metrics import accuracy_score

def train_model(model, train_loader, criterion, optimizer, device, epochs=5):
    """Train ``model`` in place and print the loss and accuracy after every epoch.

    Args:
        model: module called as ``model(text_features, numerical_features)``.
        train_loader: iterable yielding (text_features, numerical_features, labels) batches.
        criterion: loss function applied to (model outputs, labels).
        optimizer: optimizer that updates the model's parameters.
        device: device every batch is moved to before the forward pass.
        epochs: number of full passes over ``train_loader``.
    """
    model.train()
    for epoch in range(epochs):
        total_loss = 0
        hits = 0
        seen = 0
        for batch_text, batch_num, batch_labels in train_loader:
            batch_text = batch_text.to(device)
            batch_num = batch_num.to(device)
            batch_labels = batch_labels.to(device)

            # clear the gradients left over from the previous step, then forward/backward/update
            optimizer.zero_grad()
            logits = model(batch_text, batch_num)
            batch_loss = criterion(logits, batch_labels)
            batch_loss.backward()
            optimizer.step()

            total_loss += batch_loss.item()

            # the predicted class is the index of the highest score in each row
            batch_preds = logits.argmax(dim=1)
            hits += (batch_preds == batch_labels).sum().item()
            seen += batch_labels.size(0)

        # the reported loss is the mean of the per-batch losses; accuracy is a percentage
        accuracy = 100 * hits / seen
        print(f"Epoch [{epoch + 1}/{epochs}], Loss: {total_loss / len(train_loader):.4f}, Accuracy: {accuracy:.4f}")


In [15]:
#train for 20 epochs with the model, data loader, loss and optimizer set up in the cells above
train_model(
    model=model,
    train_loader=train_loader,
    criterion=criterion,
    optimizer=optimizer,
    device=device,
    epochs=20,
)

Epoch [1/20], Loss: 0.2582, Accuracy: 88.8126
Epoch [2/20], Loss: 0.2029, Accuracy: 91.4647
Epoch [3/20], Loss: 0.1766, Accuracy: 92.6312
Epoch [4/20], Loss: 0.1522, Accuracy: 93.7523
Epoch [5/20], Loss: 0.1299, Accuracy: 94.7856
Epoch [6/20], Loss: 0.1097, Accuracy: 95.6241
Epoch [7/20], Loss: 0.0943, Accuracy: 96.3490
Epoch [8/20], Loss: 0.0819, Accuracy: 96.8538
Epoch [9/20], Loss: 0.0719, Accuracy: 97.2715
Epoch [10/20], Loss: 0.0645, Accuracy: 97.5930
Epoch [11/20], Loss: 0.0575, Accuracy: 97.8474
Epoch [12/20], Loss: 0.0530, Accuracy: 98.0464
Epoch [13/20], Loss: 0.0498, Accuracy: 98.1875
Epoch [14/20], Loss: 0.0463, Accuracy: 98.2997
Epoch [15/20], Loss: 0.0424, Accuracy: 98.4541
Epoch [16/20], Loss: 0.0405, Accuracy: 98.5387
Epoch [17/20], Loss: 0.0374, Accuracy: 98.6613
Epoch [18/20], Loss: 0.0367, Accuracy: 98.6909
Epoch [19/20], Loss: 0.0339, Accuracy: 98.7953
Epoch [20/20], Loss: 0.0339, Accuracy: 98.8006


In [16]:
#evaluate the model on the test data
def evaluate_model(model, test_loader, device):
    """Run ``model`` over ``test_loader`` without tracking gradients.

    The model is switched to evaluation mode and is left in that mode.

    Args:
        model: module called as ``model(text_features, numerical_features)``.
        test_loader: iterable yielding (text_features, numerical_features, labels) batches.
        device: device every batch is moved to before the forward pass.

    Returns:
        A ``(predictions, labels)`` pair of lists in loader order; each prediction
        is the index of the highest-scoring class for that sample.
    """
    model.eval()
    predictions, targets = [], []
    with torch.no_grad():
        for batch_text, batch_num, batch_labels in test_loader:
            batch_text = batch_text.to(device)
            batch_num = batch_num.to(device)
            batch_labels = batch_labels.to(device)

            logits = model(batch_text, batch_num)
            batch_preds = logits.argmax(dim=1)
            # bring the results back to the CPU before collecting them
            predictions.extend(batch_preds.cpu().numpy())
            targets.extend(batch_labels.cpu().numpy())

    return predictions, targets


In [17]:
#classification report for the test data
from sklearn.metrics import classification_report, confusion_matrix

# predictions come back first, true labels second
y_pred, y_true = evaluate_model(model, test_loader, device)

# confusion matrix first, then per-class precision / recall / f1
print(confusion_matrix(y_true=y_true, y_pred=y_pred))
print(classification_report(y_true=y_true, y_pred=y_pred))

[[54715  5285]
 [ 5367 54589]]
              precision    recall  f1-score   support

           0       0.91      0.91      0.91     60000
           1       0.91      0.91      0.91     59956

    accuracy                           0.91    119956
   macro avg       0.91      0.91      0.91    119956
weighted avg       0.91      0.91      0.91    119956



In [None]:
import pickle

# Serialize the whole model object to disk with pickle
def save_model_as_pickle(model, file_path="ai_vs_human_text_model.pkl"):
    """Pickle ``model`` to ``file_path`` and print a confirmation message.

    Note: unpickling can execute arbitrary code, so only load files written
    this way from a trusted source.

    Args:
        model: the object to serialize.
        file_path: destination path; an existing file is overwritten.
    """
    with open(file_path, 'wb') as handle:
        pickle.Pickler(handle).dump(model)
    print(f"Model saved as pickle to {file_path}")

# write the trained model object to a pickle file next to the notebook
save_model_as_pickle(model, "ai_vs_human_text_model.pkl")

In [None]:
def save_model(model, file_path="trained_model.pth"):
    """Save the model's ``state_dict`` to ``file_path`` and print a confirmation message.

    Args:
        model: module whose ``state_dict()`` is written.
        file_path: destination path passed to ``torch.save``.
    """
    weights = model.state_dict()
    torch.save(weights, file_path)
    print(f"Model saved to {file_path}")

# write only the trained weights (state_dict) to disk
save_model(model, "ai_vs_human_text_model.pth")