In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python Docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Input data files are available in the read-only "../input/" directory
# For example, running this (by clicking run or pressing Shift+Enter) will list all files under the input directory

import os
for dirname, _, filenames in os.walk('/kaggle/input'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

# You can write up to 20GB to the current directory (/kaggle/working/) that gets preserved as output when you create a version using "Save & Run All" 
# You can also write temporary files to /kaggle/temp/, but they won't be saved outside of the current session

/kaggle/input/glovetwitter27b/glove.twitter.27B.200d.txt
/kaggle/input/glovetwitter27b/glove.twitter.27B.25d.txt
/kaggle/input/glovetwitter27b/glove.twitter.27B.50d.txt
/kaggle/input/glovetwitter27b/glove.twitter.27B.100d.txt
/kaggle/input/nlp-getting-started/sample_submission.csv
/kaggle/input/nlp-getting-started/train.csv
/kaggle/input/nlp-getting-started/test.csv
/kaggle/input/disaster-tweets-text-embeddings/test_text_embeddings.csv
/kaggle/input/disaster-tweets-text-embeddings/train_text_embeddings.csv
/kaggle/input/disaster-tweets-chatgpt-output/chatgpt_submission.csv


In [3]:
!pip install openai

Collecting openai
  Downloading openai-0.27.8-py3-none-any.whl (73 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m73.6/73.6 kB[0m [31m2.6 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: openai
Successfully installed openai-0.27.8
[0m

In [4]:
import nltk
nltk.data.path.append('/kaggle/working')
nltk.download("wordnet", download_dir='/kaggle/working')

import os
os.environ['NLTK_DATA'] = '/kaggle/working'

!mkdir /kaggle/working/corpora/wordnet
!unzip /kaggle/working/corpora/wordnet.zip -d /kaggle/working/corpora

[nltk_data] Downloading package wordnet to /kaggle/working...
Archive:  /kaggle/working/corpora/wordnet.zip
  inflating: /kaggle/working/corpora/wordnet/lexnames  
  inflating: /kaggle/working/corpora/wordnet/data.verb  
  inflating: /kaggle/working/corpora/wordnet/index.adv  
  inflating: /kaggle/working/corpora/wordnet/adv.exc  
  inflating: /kaggle/working/corpora/wordnet/index.verb  
  inflating: /kaggle/working/corpora/wordnet/cntlist.rev  
  inflating: /kaggle/working/corpora/wordnet/data.adj  
  inflating: /kaggle/working/corpora/wordnet/index.adj  
  inflating: /kaggle/working/corpora/wordnet/LICENSE  
  inflating: /kaggle/working/corpora/wordnet/citation.bib  
  inflating: /kaggle/working/corpora/wordnet/noun.exc  
  inflating: /kaggle/working/corpora/wordnet/verb.exc  
  inflating: /kaggle/working/corpora/wordnet/README  
  inflating: /kaggle/working/corpora/wordnet/index.sense  
  inflating: /kaggle/working/corpora/wordnet/data.noun  
  inflating: /kaggle/working/corpora/wor

In [5]:
train_file_path = "/kaggle/input/nlp-getting-started/train.csv"
train_data = pd.read_csv(train_file_path)

print(train_data.columns)

test_file_path = "/kaggle/input/nlp-getting-started/test.csv"
test_data = pd.read_csv(test_file_path)

print(test_data.columns)

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')
Index(['id', 'keyword', 'location', 'text'], dtype='object')


In [6]:
import re
import os
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

from openai.embeddings_utils import get_embedding

# Class responsible for handling the input data
class Dataset:
    learn_len = None
    
    combined_data = None
    train_data = None
    test_data = None
    learn_data = None
    assess_data = None
    
    train_target_data = None
    learn_target_data = None
    assess_target_data = None
    
    combined_encoded_text = None
    train_encoded_text = None
    test_encoded_text = None
    learn_encoded_text = None
    assess_encoded_text = None
    
    combined_sequences = None
    train_sequences = None
    test_sequences = None
    learn_sequences = None
    assess_sequences = None
    
    vocab_size = None
    tokenizer = None
    max_sequence_length = 28
    
    train_embeddings_read_file_path = "/kaggle/input/disaster-tweets-text-embeddings/train_text_embeddings.csv"
    test_embeddings_read_file_path = "/kaggle/input/disaster-tweets-text-embeddings/test_text_embeddings.csv"
    train_embeddings_file_path = "/kaggle/working/train_text_embeddings.csv"
    test_embeddings_file_path = "/kaggle/working/test_text_embeddings.csv"
    combined_text_embeddings = None
    train_text_embeddings = None
    test_text_embeddings = None
    learn_text_embeddings = None
    assess_text_embeddings = None

    # data is pandas DataFrame
    def __init__(self, train_data, test_data, learn_ratio, assess_ratio):
        assert learn_ratio + assess_ratio == 1, \
        "The sum of learn_ratio and assess_ratio should be equal to 1"
        
        self.train_data = train_data
        self.test_data = test_data
        self.learn_len = int(learn_ratio * len(self.train_data))
        self.learn_data  = self.train_data.iloc[:self.learn_len]
        self.assess_data = self.train_data.iloc[self.learn_len:]
        self.combined_data = pd.concat([self.train_data.iloc[:, :-1], self.test_data])
        
        self.train_target_data = self.train_data.iloc[:, -1]
        self.learn_target_data = self.train_target_data[:self.learn_len]
        self.assess_target_data = self.train_target_data[self.learn_len:]
        
        self.tokenizer = Tokenizer()
        self.tokenizer.fit_on_texts(self.combined_data['text'])
        self.vocab_size = len(self.tokenizer.word_index) + 1

        self.create_sequences()
        self.create_encoded_texts()
        self.create_text_embeddings()
        
    def create_sequences(self):
        self.combined_sequences = pad_sequences(
            self.tokenizer.texts_to_sequences(self.combined_data['text']), 
            maxlen=self.max_sequence_length, 
            padding='post')
        self.train_sequences = pad_sequences(
            self.tokenizer.texts_to_sequences(self.train_data['text']),
            maxlen=self.max_sequence_length,
            padding='post')
        self.test_sequences = pad_sequences(
            self.tokenizer.texts_to_sequences(self.test_data['text']),
            maxlen=self.max_sequence_length,
            padding='post')
        self.learn_sequences = pad_sequences(
            self.tokenizer.texts_to_sequences(self.learn_data['text']),
            maxlen=self.max_sequence_length,
            padding='post')
        self.assess_sequences = pad_sequences(
            self.tokenizer.texts_to_sequences(self.assess_data['text']),
            maxlen=self.max_sequence_length,
            padding='post')
        
    def create_encoded_texts(self):
        encoded_documents = self.tfidf_encode_documents(self.combined_data['text'])
        self.combined_encoded_text = pd.DataFrame.sparse.from_spmatrix(encoded_documents)
        self.train_encoded_text = self.combined_encoded_text.iloc[:len(self.train_data)]
        self.test_encoded_text = self.combined_encoded_text.iloc[len(self.train_data):]
        self.learn_data_encoded_text = self.train_encoded_text[:self.learn_len]
        self.assess_data_encoded_text = self.train_encoded_text[self.learn_len:]
        
    def create_text_embeddings(self):
        if self.try_to_load_embeddings():
            print("Successfully loaded text embeddings from local storage !")
            return
        else:
            print("Failed to load text embeddings from local storage ... ")
            self.generate_embeddings()
            self.save_embeddings()
            
    def lemmatize(self, text):
        words = re.findall(r'\w+', text.lower())
        lemmatizer = WordNetLemmatizer()
        return [lemmatizer.lemmatize(word) for word in words]
    
    def tfidf_encode_documents(self, documents):
        vectorizer = TfidfVectorizer(tokenizer = self.lemmatize)
        return vectorizer.fit_transform(documents)
    
    def try_to_load_embeddings(self):
        if self.try_to_read_embeddings(self.train_embeddings_file_path, self.test_embeddings_file_path) or \
                self.try_to_read_embeddings(self.train_embeddings_read_file_path, self.test_embeddings_read_file_path):
            self.combined_text_embeddings = pd.concat([self.train_text_embeddings, self.test_text_embeddings])
            self.learn_text_embeddings = self.train_text_embeddings[:self.learn_len]
            self.assess_text_embeddings = self.train_text_embeddings[self.learn_len:]
            return True
        return False
    
    def try_to_read_embeddings(self, train_file_path, test_file_path):
        if os.path.exists(train_file_path) and os.path.exists(test_file_path):
            self.train_text_embeddings = pd.read_csv(train_file_path)
            self.test_text_embeddings = pd.read_csv(test_file_path)
            return True
        return False
    
    def generate_embeddings(self, model="text-embedding-ada-002"):
        print("Generating embeddings using OpenAI API ... ")
        self.combined_text_embeddings = pd.DataFrame(self.combined_data.text.apply(
            lambda x: get_embedding(x.replace("\n", " "), engine=model)).tolist())
        self.train_text_embeddings = self.combined_text_embeddings.iloc[:len(self.train_data)]
        self.test_text_embeddings = self.combined_text_embeddings.iloc[len(self.train_data):]
        self.learn_text_embeddings = self.train_text_embeddings.iloc[:self.learn_len]
        self.assess_text_embeddings = self.train_text_embeddings.iloc[self.learn_len:]
        print("Generated embeddings using OpenAI API ... ")

    def save_embeddings(self):
        print("Savings embeddings to local storage ... ")
        self.train_text_embeddings.to_csv(self.train_embeddings_file_path)
        self.test_text_embeddings.to_csv(self.test_embeddings_file_path)
        

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [7]:
dataset = Dataset(train_data, test_data, 0.8, 0.2)

assert len(dataset.learn_data) + len(dataset.assess_data) == len(dataset.train_data), \
"The size of the learn_data and assess_data should total to the size of data"

print("Learn dataset len={}".format(len(dataset.learn_data)))
print("Assess dataset len={}".format(len(dataset.assess_data)))
print("Total dataset len={}".format(len(dataset.train_data)))



Successfully loaded text embeddings from local storage !
Learn dataset len=6090
Assess dataset len=1523
Total dataset len=7613


In [8]:
print(dataset.combined_text_embeddings.shape)
print(dataset.train_text_embeddings.shape)
print(dataset.test_text_embeddings.shape)
print(dataset.learn_text_embeddings.shape)
print(dataset.assess_text_embeddings.shape)

(10876, 1537)
(7613, 1537)
(3263, 1537)
(6090, 1537)
(1523, 1537)


In [9]:
# Neural Network model

import tensorflow as tf
from tensorflow import keras

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

input_size = dataset.train_encoded_text.shape[1]
hidden_layer_size = 10

tfidf_model = keras.models.Sequential()
tfidf_model.add(keras.layers.Dense(hidden_layer_size, activation='swish', input_shape=(input_size,)))
tfidf_model.add(keras.layers.Dense(2, activation='softmax'))
tfidf_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

tfidf_model.fit(dataset.train_encoded_text, dataset.train_target_data, epochs = 10)

Num GPUs Available:  0
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x79ba068e37c0>

In [10]:
import numpy as np
import collections

def load_glove_model(file_path, size, verbose = 1):
    model = collections.defaultdict(lambda: np.array([0.0 for _ in range(size)]))
    with open(file_path) as f:
        for line in f:
            tokens = line.split(' ')
            word = tokens[0]
            embeddings = np.array([float(value) for value in tokens[1:]])
            model[word] = embeddings
    if verbose >= 1:
        print("Words loaded!")
    return model

In [11]:
# Read the GloVe with pandas
import numpy as np

embedding_size = 100
glove_file_path = "/kaggle/input/glovetwitter27b/glove.twitter.27B.100d.txt"
glove_embeddings = load_glove_model(glove_file_path, embedding_size)

Words loaded!


In [12]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, LSTM, Dense
from tensorflow.keras.initializers import Constant
import numpy as np

print(dataset.vocab_size)
embedding_matrix = np.zeros((dataset.vocab_size, embedding_size))
for word, i in dataset.tokenizer.word_index.items():
    embedding_matrix[i] = glove_embeddings[word]
    
embedding_model = Sequential()
embedding_layer = Embedding(
    dataset.vocab_size,
    embedding_size,
    embeddings_initializer=Constant(embedding_matrix),
    input_length=dataset.max_sequence_length,
    trainable=False)
embedding_model.add(embedding_layer)
embedding_model.add(LSTM(100))
embedding_model.add(Dense(2, activation='softmax'))
embedding_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

embedding_model.fit(dataset.train_sequences, dataset.train_target_data, epochs = 20)

29320
Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<keras.callbacks.History at 0x79b99f63ae90>

In [46]:
# Neural Network model

import tensorflow as tf
from tensorflow import keras

print("Num GPUs Available: ", len(tf.config.experimental.list_physical_devices('GPU')))

print(dataset.test_text_embeddings)

input_size = dataset.train_text_embeddings.shape[1]
hidden_layer_size = 10

openai_model = keras.models.Sequential()
openai_model.add(keras.layers.Dense(hidden_layer_size, activation='swish', input_shape=(input_size,)))
openai_model.add(keras.layers.Dense(2, activation='softmax'))
openai_model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])

openai_model.fit(dataset.train_text_embeddings, dataset.train_target_data, epochs = 200)

Num GPUs Available:  0
      Unnamed: 0         0         1         2         3         4         5  \
0           7613 -0.021635  0.008594  0.009741  0.003018 -0.026378  0.011286   
1           7614 -0.002266 -0.001679  0.021356 -0.014359  0.002140  0.009852   
2           7615 -0.025502 -0.008412 -0.014390  0.005832 -0.020620  0.023935   
3           7616 -0.010744 -0.013325 -0.012183  0.004142 -0.024717  0.009615   
4           7617  0.003850 -0.018499  0.005596  0.017528 -0.012945  0.008153   
...          ...       ...       ...       ...       ...       ...       ...   
3258       10871 -0.005213 -0.010868 -0.000101 -0.029862 -0.000932 -0.002580   
3259       10872 -0.023784 -0.007963 -0.018151  0.011523 -0.007818  0.012971   
3260       10873 -0.009089 -0.008814  0.016513 -0.018191 -0.026394  0.014942   
3261       10874 -0.013321  0.005856 -0.016371 -0.018341 -0.008559  0.016996   
3262       10875 -0.007721 -0.030153 -0.022838 -0.016321 -0.016713  0.025137   

             6  

<keras.callbacks.History at 0x7b72490b3ac0>

In [14]:
embedding_output = embedding_model.predict(dataset.test_sequences)
tfidf_output = tfidf_model.predict(dataset.test_encoded_text)
chatgpt_output = pd.read_csv("/kaggle/input/disaster-tweets-chatgpt-output/chatgpt_submission.csv")

# print the number of test cases, on which the combined model is >90% sure
answer = []
for i in range(len(dataset.test_data)):
    tfidf_row = tfidf_output[i]
    embedding_row = embedding_output[i]
    chatgpt_answer = chatgpt_output.iloc[i]['target']

    cnt = [0, 0]
    cnt[0] += 1 if tfidf_row[0] > 0.9 else 0
    cnt[1] += 1 if tfidf_row[1] > 0.9 else 1

    cnt[0] += 1 if embedding_row[0] > 0.9 else 0
    cnt[1] += 1 if embedding_row[1] > 0.9 else 0

    if cnt[0] != cnt[1]:
        answer.append(1 if cnt[0] > cnt[1] else 0)
    else:
        answer.append(chatgpt_answer)

[0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 1, 1, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 1, 0, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 1, 0, 1, 1, 0, 1, 1, 1, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 1, 1, 

In [51]:
predictions = pd.DataFrame({
    'id': test_data['id'],
    'target': answer
})

predictions.to_csv("submission.csv", index = False)

In [8]:
import openai

openai.api_key = ""

def get_chatgpts_opinion(tweet, model_name="gpt-3.5-turbo"):
    chat_completion = openai.ChatCompletion.create(
    model=model_name,
    messages=[
        {
            "role": "user",
            "content": "Output a single letter, 1 if the following tweet is about a real disaster and 0 if not: " + tweet.replace("\n", " ")
        }])
    return chat_completion.choices[0].message.content

In [11]:
# Print sample tweets on which ChatGPT is makings mistakes
import time

index = 0
cnt_mistakes = 0
while index < len(dataset.train_data) and cnt_mistakes < 10:
    try:
        text = dataset.train_data.iloc[index]['text']
        expected_output = dataset.train_target_data.iloc[index]
        output = get_chatgpts_opinion(text)
        index += 1
        
        if int(expected_output) != int(output):
            print(dataset.train_data.iloc[index])
            print("Expected output: {}, Real output: {}\n\n".format(expected_output, output))
            cnt_mistakes += 1
    except:
        print("Failed, will try again soon ...")
        time.sleep(5)
        

id                                                         18
keyword                                                   NaN
location                                                  NaN
text        #raining #flooding #Florida #TampaBay #Tampa 1...
target                                                      1
Name: 12, dtype: object
Expected output: 1, Real output: 0


id                                               19
keyword                                         NaN
location                                        NaN
text        #Flood in Bago Myanmar #We arrived Bago
target                                            1
Name: 13, dtype: object
Expected output: 1, Real output: 0


id                                                         77
keyword                                                ablaze
location                                              Anaheim
text        Police: Arsonist Deliberately Set Black Church...
target                                                      

In [14]:
wrong_indexes = [12, 13, 53, 56, 69, 74, 78, 80, 89, 90]
for index in wrong_indexes:
    print(dataset.train_data.iloc[index]['text'])
    print("REAL" if dataset.train_target_data.iloc[index] == 1 else "FAKE")
    print("\n\n\n")


#raining #flooding #Florida #TampaBay #Tampa 18 or 19 days. I've lost count 
REAL




#Flood in Bago Myanmar #We arrived Bago
REAL




Police: Arsonist Deliberately Set Black Church In North CarolinaåÊAblaze http://t.co/pcXarbH9An
REAL




TRUCK ABLAZE : R21. VOORTREKKER AVE. OUTSIDE OR TAMBO INTL. CARGO SECTION. http://t.co/8kscqKfKkF
REAL




Accident center lane blocked in #SantaClara on US-101 NB before Great America Pkwy #BayArea #Traffic http://t.co/pmlOhZuRWR
REAL




I-77 Mile Marker 31 South Mooresville  Iredell Vehicle Accident Ramp Closed at 8/6 1:18 PM
REAL




I-77 Mile Marker 31 to 40 South Mooresville  Iredell Vehicle Accident Congestion at 8/6 1:18 PM
REAL




mom: 'we didn't get home as fast as we wished' 
me: 'why is that?'
mom: 'there was an accident and some truck spilt mayonnaise all over ??????
FAKE




;ACCIDENT PROPERTY DAMAGE; PINER RD/HORNDALE DR
REAL




???? it was an accident http://t.co/Oia5fxi4gM
FAKE






In [None]:
# Set up and test the OpenAI API
import openai
import time

while len(chatgpt_output) != len(dataset.test_data):
    try:
        index = len(chatgpt_output)
        text = dataset.test_data.iloc[index]['text']
        chatgpt_output.append(1 if get_chatgpts_opinion(text) == '1' else 0)
        if len(chatgpt_output) % 20 == 0:
            print("Finished with {} tweets ...".format(len(chatgpt_output)))
    except:
        print("Failed, will try again soon ...")
        print("Current output length: {}".format(len(chatgpt_output)))
        time.sleep(5)

print(len(chatgpt_output))
print(chatgpt_output[:10])
print(dataset.test_data.head(10)['text'])

In [None]:
# tfidf_output = tfidf_model.predict(dataset.test_encoded_text)

cnt_correct = 0
for i, x in enumerate(chatgpt_output):
    y = 0 if tfidf_output[i, 0] > tfidf_output[i, 1] else 1
    if x == y:
        cnt_correct += 1
        
# print(chatgpt_output)
# print(tfidf_output[i, 0])
        
print(cnt_correct / len(chatgpt_output))

In [None]:
legacy_chatgpt_output = chatgpt_output
print(legacy_chatgpt_output)
print(len(legacy_chatgpt_output))

In [None]:
predictions = pd.DataFrame({
    'id': test_data['id'],
    'target': chatgpt_output
})

predictions.to_csv("submission.csv", index = False)