In [1]:
%pip install pandas numpy tensorflow transformers scikit-learn matplotlib

Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Load the training data
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')



###########################################


import re # Regular Expression

def clean_text(text):
    text = re.sub(r'http\S+', '', text)  # Remove URLs
    text = re.sub(r'@\w+', '', text)     # Remove mentions
    text = re.sub(r'\d+', '', text)      # Remove numbers
    text = re.sub(r'[^\w\s#]', '', text)  # Remove punctuation except hashtags
    text = text.lower()                  # Convert to lowercase
    return text

train_data['clean_text'] = train_data['text'].apply(clean_text) # Apply the data cleaning process to training data
test_data['clean_text'] = test_data['text'].apply(clean_text)# Apply the data cleaning process to testing data



###########################################



from transformers import BertTokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Initialize the BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

# Example raw tweet
tweet = "There is a disaster happening"

# Tokenize the tweet
tokenized_tweet = tokenizer.encode(tweet, add_special_tokens=True)
print("Tokenized Tweet:", tokenized_tweet)


# Tokenize the clean text including hashtags
train_data['tokens'] = train_data['clean_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))
test_data['tokens'] = test_data['clean_text'].apply(lambda x: tokenizer.encode(x, add_special_tokens=True))



###########################################


# Define the custom maximum length for the sequences based on analysis
MAX_LEN = 64

# Pad the token sequences and truncate longer sequences
padded_train_tokens = pad_sequences(train_data['tokens'].tolist(), maxlen=MAX_LEN, padding='post', truncating='post')
padded_test_tokens = pad_sequences(test_data['tokens'].tolist(), maxlen=MAX_LEN, padding='post', truncating='post')

# Convert the 2D arrays to lists of lists
train_data['padded_tokens'] = list(padded_train_tokens)
test_data['padded_tokens'] = list(padded_test_tokens)

###########################################



from transformers import TFBertForSequenceClassification

model = TFBertForSequenceClassification.from_pretrained('bert-base-uncased', num_labels=2) # num_labels = 2 since this is a binary classification


###########################################


from tensorflow.keras.optimizers import Adam

optimizer = Adam(learning_rate=3e-5)
model.compile(optimizer=optimizer, loss=model.compute_loss, metrics=['accuracy'])

###########################################


from sklearn.model_selection import train_test_split
import tensorflow as tf

X_train, X_val, y_train, y_val = train_test_split(train_data['padded_tokens'], train_data['target'], test_size=0.1, random_state=42)

train_dataset = tf.data.Dataset.from_tensor_slices((X_train.tolist(), y_train.tolist())).shuffle(len(X_train)).batch(256)
val_dataset = tf.data.Dataset.from_tensor_slices((X_val.tolist(), y_val.tolist())).batch(256)


###########################################


history = model.fit(train_dataset, epochs=3, validation_data=val_dataset)


###########################################


test_dataset = tf.data.Dataset.from_tensor_slices(test_data['padded_tokens'].tolist()).batch(256)
predictions = model.predict(test_dataset).logits
predictions = tf.nn.softmax(predictions, axis=1)
predicted_labels = tf.argmax(predictions, axis=1).numpy()


###########################################


submission = pd.DataFrame({'id': test_data['id'], 'target': predicted_labels})
submission.to_csv('../data/submission.csv', index=False)

  from .autonotebook import tqdm as notebook_tqdm



Tokenized Tweet: [101, 2045, 2003, 1037, 7071, 6230, 102]



All PyTorch model weights were used when initializing TFBertForSequenceClassification.

Some weights or buffers of the TF 2.0 model TFBertForSequenceClassification were not initialized from the PyTorch model and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


AttributeError: in user code:

    File "c:\Users\Micha\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1401, in train_function  *
        return step_function(self, iterator)
    File "c:\Users\Micha\AppData\Local\Programs\Python\Python310\lib\site-packages\transformers\modeling_tf_utils.py", line 1588, in compute_loss  *
        return super().compute_loss(*args, **kwargs)
    File "c:\Users\Micha\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\training.py", line 1209, in compute_loss  **
        return self.compiled_loss(
    File "c:\Users\Micha\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\compile_utils.py", line 275, in __call__
        y_t, y_p, sw = match_dtype_and_rank(y_t, y_p, sw)
    File "c:\Users\Micha\AppData\Local\Programs\Python\Python310\lib\site-packages\keras\src\engine\compile_utils.py", line 854, in match_dtype_and_rank
        if (y_t.dtype.is_floating and y_p.dtype.is_floating) or (

    AttributeError: 'NoneType' object has no attribute 'dtype'


In [3]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn import feature_extraction, linear_model, model_selection, preprocessing

###########################################


train_df = pd.read_csv("/kaggle/input/nlp-getting-started/train.csv")
test_df = pd.read_csv("/kaggle/input/nlp-getting-started/test.csv")

###########################################


count_vectorizer = feature_extraction.text.CountVectorizer()

## let's get counts for the first 5 tweets in the data
example_train_vectors = count_vectorizer.fit_transform(train_df["text"][0:5])

###########################################


## we use .todense() here because these vectors are "sparse" (only non-zero elements are kept to save space)
print(example_train_vectors[0].todense().shape)
print(example_train_vectors[0].todense())

###########################################


train_vectors = count_vectorizer.fit_transform(train_df["text"])

## note that we're NOT using .fit_transform() here. Using just .transform() makes sure
# that the tokens in the train vectors are the only ones mapped to the test vectors - 
# i.e. that the train and test vectors use the same set of tokens.
test_vectors = count_vectorizer.transform(test_df["text"])



###########################################

## Our vectors are really big, so we want to push our model's weights
## toward 0 without completely discounting different words - ridge regression 
## is a good way to do this.
clf = linear_model.RidgeClassifier()

###########################################

scores = model_selection.cross_val_score(clf, train_vectors, train_df["target"], cv=3, scoring="f1")
scores

###########################################

clf.fit(train_vectors, train_df["target"])

sample_submission = pd.read_csv("/kaggle/input/nlp-getting-started/sample_submission.csv")

sample_submission["target"] = clf.predict(test_vectors)

sample_submission.head()

sample_submission.to_csv("submission.csv", index=False)