In [3]:
import pandas as pd
from sklearn.model_selection import train_test_split

#Read
data = pd.read_csv('Training_Essay_data.csv')

#Split to train and test
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

print("Training Data:")
print(train_data.head())

print("\nTest Data:")
print(test_data.head())

Training Data:
                                                    text  generated
10607  If I were a scientist at NASA I will not belie...          0
27769  "America's love affair with it's vehicles seem...          0
7663   There is a great challenge when it comes to ex...          0
2916   Cell phones have become very popular over the ...          0
8409   From the research and development of Dr. Huang...          0

Test Data:
                                                    text  generated
17004  I would agree with Emerson's in this world be ...          0
14459  Advice is wonderful and helpful to everyone. S...          0
28492  I think that limiting car usage is great for t...          0
10134  Nobody know how the face got on mars because w...          0
23657  The student has studied lot of subjects in the...          1


In [7]:
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.feature_extraction.text import TfidfVectorizer

# Prepare the data
vectorizer = TfidfVectorizer()
X_train = vectorizer.fit_transform(train_data['text'])
y_train = train_data['generated']
X_test = vectorizer.transform(test_data['text'])
y_test = test_data['generated']

# Initialize the model
model = LinearRegression()

# Train the model
model.fit(X_train, y_train)

# Predict on training and testing data
y_train_pred = model.predict(X_train)
y_test_pred = model.predict(X_test)

# Calculate training and testing accuracy
train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")

# Calculate accuracy
train_accuracy = model.score(X_train, y_train)
test_accuracy = model.score(X_test, y_test)

print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

Training MSE: 5.447241673389782e-07
Testing MSE: 0.10461939701778478
Training Accuracy: 0.999997731978449
Testing Accuracy: 0.5613841936268535


In [9]:
import torch
from torch.utils.data import DataLoader, TensorDataset

import torch.nn as nn
import torch.optim as optim

# Convert the data to PyTorch tensors
X_train_tensor = torch.tensor(X_train.toarray(), dtype=torch.float32)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)
X_test_tensor = torch.tensor(X_test.toarray(), dtype=torch.float32)
y_test_tensor = torch.tensor(y_test.values, dtype=torch.float32).view(-1, 1)

# Create DataLoader for training and testing data
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False)

# Define a simple neural network model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.fc3(x)
        return x

# Initialize the model, loss function, and optimizer
input_dim = X_train.shape[1]
model = SimpleNN(input_dim)
criterion = nn.MSELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Train the model
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    for X_batch, y_batch in train_loader:
        optimizer.zero_grad()
        y_pred = model(X_batch)
        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()
    print(f"Epoch {epoch+1}/{num_epochs}, Loss: {loss.item()}")

# Evaluate the model
model.eval()
with torch.no_grad():
    y_train_pred = model(X_train_tensor).numpy()
    y_test_pred = model(X_test_tensor).numpy()

train_mse = mean_squared_error(y_train, y_train_pred)
test_mse = mean_squared_error(y_test, y_test_pred)

print(f"Training MSE: {train_mse}")
print(f"Testing MSE: {test_mse}")

# Calculate accuracy
train_accuracy = 1 - train_mse
test_accuracy = 1 - test_mse

print(f"Training Accuracy: {train_accuracy}")
print(f"Testing Accuracy: {test_accuracy}")

Epoch 1/10, Loss: 0.003844954539090395
Epoch 2/10, Loss: 0.004386894404888153
Epoch 3/10, Loss: 3.540900070220232e-05
Epoch 4/10, Loss: 8.149359200615436e-05
Epoch 5/10, Loss: 3.334660505061038e-05
Epoch 6/10, Loss: 3.442191518843174e-05
Epoch 7/10, Loss: 2.8987047699047253e-05
Epoch 8/10, Loss: 2.604674045869615e-05
Epoch 9/10, Loss: 3.419326458242722e-05
Epoch 10/10, Loss: 1.0277997716912068e-05
Training MSE: 3.342935441346634e-05
Testing MSE: 0.0023896746468276887
Training Accuracy: 0.9999665706455866
Testing Accuracy: 0.9976103253531723


In [15]:
from transformers import pipeline
import pandas as pd

# Initialize the text generation pipeline
generator = pipeline('text-generation', model='gpt2')

# Generate sentences
generated_texts = []
for _ in range(50):  # Generate 50 longer texts
    generated = generator("Once upon a time", max_length=300, num_return_sequences=1)
    generated_texts.append(generated[0]['generated_text'])

# Generate additional sentences with different topics
topics = [
    "The future of renewable energy",
    "Advancements in artificial intelligence",
    "The impact of climate change on agriculture",
    "The role of technology in education",
    "The benefits of remote work",
    "The importance of mental health awareness",
    "The evolution of electric vehicles",
    "The significance of biodiversity conservation",
    "The challenges of space exploration",
    "The influence of social media on society"
]

for topic in topics:
    for _ in range(5):  # Generate 5 longer texts per topic
        generated = generator(topic, max_length=300, num_return_sequences=1)
        generated_texts.append(generated[0]['generated_text'])

# Create a DataFrame
generated_df = pd.DataFrame({'text': generated_texts, 'generated': [1] * len(generated_texts)})

# Save to CSV
generated_df.to_csv('huggingFaceGenerated.csv', index=False)


Hardware accelerator e.g. GPU is available in the environment, but no `device` argument is passed to the `Pipeline` object. Model will be on CPU.
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-end generation.
Setting `pad_token_id` to `eos_token_id`:None for open-

KeyboardInterrupt: 

In [13]:
# Load the huggingFaceGenerated.csv data
huggingface_data = pd.read_csv('huggingFaceGenerated.csv')

# Preprocess the text data using the same TfidfVectorizer
X_huggingface = vectorizer.transform(huggingface_data['text'])
y_huggingface = huggingface_data['generated']

# Convert the data to PyTorch tensors
X_huggingface_tensor = torch.tensor(X_huggingface.toarray(), dtype=torch.float32)
y_huggingface_tensor = torch.tensor(y_huggingface.values, dtype=torch.float32).view(-1, 1)

# Use the trained model to make predictions
model.eval()
with torch.no_grad():
    y_huggingface_pred = model(X_huggingface_tensor).numpy()

# Calculate MSE and accuracy
huggingface_mse = mean_squared_error(y_huggingface, y_huggingface_pred)
huggingface_accuracy = 1 - huggingface_mse

print(f"HuggingFace Generated Data MSE: {huggingface_mse}")
print(f"HuggingFace Generated Data Accuracy: {huggingface_accuracy}")

HuggingFace Generated Data MSE: 0.3483836183964712
HuggingFace Generated Data Accuracy: 0.6516163816035287
