In [8]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/llm-aigenerated/train_v2_drcat_02.csv
/kaggle/input/llm-aigenerated/llm-detect-ai-generated-text/sample_submission.csv
/kaggle/input/llm-aigenerated/llm-detect-ai-generated-text/train_prompts.csv
/kaggle/input/llm-aigenerated/llm-detect-ai-generated-text/test_essays.csv
/kaggle/input/llm-aigenerated/llm-detect-ai-generated-text/train_essays.csv


In [64]:
# load data
train_data = pd.read_csv('/kaggle/input/llm-aigenerated/llm-detect-ai-generated-text/train_essays.csv')
test_data = pd.read_csv('/kaggle/input/llm-aigenerated/llm-detect-ai-generated-text/test_essays.csv')
ex_data = pd.read_csv('/kaggle/input/llm-aigenerated/train_v2_drcat_02.csv')

In [65]:
# 为解决数据不平衡问题，合并train_data和ex_data
# rename ex_data
ex_data.rename(columns={'label':'generated'}, inplace = True)
# 截取train_data和ex_data的text, generated列
select_columns = ['text','generated']
train_data_2 = train_data[select_columns]
ex_data_2 = ex_data[select_columns]
# 垂直合并train_data & ex_data
train_data_2 = pd.concat([train_data_2,ex_data_2],axis=0)
print(train_data_2)
print(train_data_2['generated'].value_counts())

                                                    text  generated
0      Cars. Cars have been around since they became ...          0
1      Transportation is a large necessity in most co...          0
2      "America's love affair with it's vehicles seem...          0
3      How often do you ride in a car? Do you drive a...          0
4      Cars are a wonderful thing. They are perhaps o...          0
...                                                  ...        ...
44863  Dear Senator,\n\nI am writing to you today to ...          1
44864  Dear Senator,\n\nI am writing to you today to ...          1
44865  Dear Senator,\n\nI am writing to you today to ...          1
44866  Dear Senator,\n\nI am writing to you today to ...          1
44867  Dear Senator,\n\nI am writing to you today to ...          1

[46246 rows x 2 columns]
generated
0    28746
1    17500
Name: count, dtype: int64


In [34]:
train_X = train_data_2['text']
train_Y = train_data_2['generated']
print(train_X)
print(train_Y)

0        Cars. Cars have been around since they became ...
1        Transportation is a large necessity in most co...
2        "America's love affair with it's vehicles seem...
3        How often do you ride in a car? Do you drive a...
4        Cars are a wonderful thing. They are perhaps o...
                               ...                        
44863    Dear Senator,\n\nI am writing to you today to ...
44864    Dear Senator,\n\nI am writing to you today to ...
44865    Dear Senator,\n\nI am writing to you today to ...
44866    Dear Senator,\n\nI am writing to you today to ...
44867    Dear Senator,\n\nI am writing to you today to ...
Name: text, Length: 46246, dtype: object
0        0
1        0
2        0
3        0
4        0
        ..
44863    1
44864    1
44865    1
44866    1
44867    1
Name: generated, Length: 46246, dtype: int64


In [45]:
# 特征表示（向量表示）
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn import model_selection

In [41]:
# 创建TfidfVectorizer对象
# 设置词汇表大小为10000（可修改参数）
vectorizer = TfidfVectorizer(max_features = 10000)
# 将文本转换为TF-IDF向量表示
train_Xtf = vectorizer.fit_transform(train_X)
# 将TF-IDF向量表示和类别值转换为numpy数组
train_Xtf = train_Xtf.toarray()
train_Y = np.array(train_Y)
print(train_Xtf)
print(train_Xtf.shape)
print(train_Y)
print(train_Y.shape)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]
(46246, 10000)
[0 0 0 ... 1 1 1]
(46246,)


In [None]:
print(np.count_nonzero(train_Xtf))
print(vectorizer.vocabulary_)

In [None]:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers

In [40]:
# 定义文本分类模型
def create_model(input_dim, output_dim):
    model = keras.Sequential()
    model.add(layers.Embedding(input_dim=input_dim, output_dim=64))
    model.add(layers.Conv1D(128, 5, activation='relu'))
    model.add(layers.GlobalMaxPooling1D())
    model.add(layers.Dense(64, activation='relu'))
    model.add(layers.Dense(output_dim, activation='softmax'))
    return model

In [55]:
# split train data for validation
x_train, x_valid, y_train, y_valid = model_selection.train_test_split(train_Xtf, train_Y,
train_size=0.80, test_size=0.20, random_state=4487)
input_dim = 10000 # 词汇表大小为10000，即输入维度为10000
output_dim = 2  # 二分类问题

# 对文本序列进行填充，使其长度一致：压缩无效的填充值 0
max_length = 1000  # 假设最大序列长度为200
x_train = keras.preprocessing.sequence.pad_sequences(x_train, maxlen=max_length)
x_valid = keras.preprocessing.sequence.pad_sequences(x_valid, maxlen=max_length)

# 创建模型
model = create_model(input_dim=input_dim, output_dim=output_dim)
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

In [56]:
# 训练模型
model.fit(x_train, y_train, batch_size=64, epochs=5, validation_data=(x_valid, y_valid))

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.src.callbacks.History at 0x7e8449f34130>

In [60]:
# predit 
test_Xtf = vectorizer.fit_transform(test_data['text'])
test_Xtf = test_Xtf.toarray()
print(test_Xtf)
predY = model.predict(test_Xtf)
print(predY)

[[0.72033345 0.54783215 0.42544054 0.         0.        ]
 [0.         0.61980538 0.48133417 0.61980538 0.        ]
 [0.         0.         0.42544054 0.54783215 0.72033345]]
[[0.6317123  0.36828765]
 [0.6317123  0.36828765]
 [0.6317123  0.36828765]]


In [63]:
# write to csv
# 数组转化为数据帧
predY_df = pd.DataFrame(predY)
print(predY_df)
pd.DataFrame({'id':test_data['id'], 'generated':predY_df[1]}).to_csv('submission.csv', index=False)

          0         1
0  0.631712  0.368288
1  0.631712  0.368288
2  0.631712  0.368288
