In [46]:
import pandas as pd 
import nltk 
from nltk.corpus import stopwords 
from nltk.tokenize import word_tokenize 
from nltk.stem import WordNetLemmatizer 
import string 
from sklearn.metrics import accuracy_score, precision_score, recall_score
from simpletransformers.classification import ClassificationModel, ClassificationArgs

In [None]:
# pip install simpletransformers

In [47]:
# Read the dataset from the CSV file.
df = pd.read_csv("train.csv")
df.drop(columns=['id','keyword','location'],inplace = True)
df.head()


Unnamed: 0,text,target
0,Our Deeds are the Reason of this #earthquake M...,1
1,Forest fire near La Ronge Sask. Canada,1
2,All residents asked to 'shelter in place' are ...,1
3,"13,000 people receive #wildfires evacuation or...",1
4,Just got sent this photo from Ruby #Alaska as ...,1


In [48]:
df.rename(columns={'text':'input_text','target':'target_text'},inplace=True)

In [38]:
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('wordnet')

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...


True

In [50]:
# Apply text preprocessing.

stop_words = set(stopwords.words('english')) # read all stopwords
lemmatizer = WordNetLemmatizer()

# function to apply pre-processing techniques on the words
def preprocess_text(text):
    # extract the words from the text
    tokens = word_tokenize(text)
    # remove punctuation from the text
    tokens = [word for word in tokens if word not in string.punctuation]
    # remove stopwords
    tokens = [word for word in tokens if word.lower() not in stop_words]
    # apply limatize on each word
    tokens = [lemmatizer.lemmatize(word) for word in tokens]
    # connect each word again in one string
    preprocessed_text = ' '.join(tokens)
    return preprocessed_text

# apply the preprocess frunction in all columns
df['input_text'] = df['input_text'].apply(preprocess_text)


In [51]:
df.head()

Unnamed: 0,input_text,target_text
0,Deeds Reason earthquake May ALLAH Forgive u,1
1,Forest fire near La Ronge Sask Canada,1
2,resident asked 'shelter place notified officer...,1
3,"13,000 people receive wildfire evacuation orde...",1
4,got sent photo Ruby Alaska smoke wildfire pour...,1


In [60]:
# Define hyperparameters and model arguments
model_args = ClassificationArgs()
model_args.num_train_epochs = 8
model_args.evaluate_during_training = False

In [61]:
model = ClassificationModel(
    "bert",  # Use BERT model for classification
    "bert-base-uncased",  # Pretrained BERT model
    args=model_args,
    use_cuda=True
)


Some weights of BertForSequenceClassification were not initialized from the model checkpoint at bert-base-uncased and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [64]:
model.train_model(df)




  0%|          | 0/7613 [00:00<?, ?it/s]

Epoch:   0%|          | 0/8 [00:00<?, ?it/s]

Running Epoch 0 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 1 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 2 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 3 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 4 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 5 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 6 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

Running Epoch 7 of 8:   0%|          | 0/952 [00:00<?, ?it/s]

(7616, 0.2907576467340495)

In [86]:
predictions, _ = model.predict(df['input_text'].tolist())

# Calculate accuracy
accuracy = accuracy_score(df['target_text'], predictions)

# Calculate precision and recall
precision = precision_score(df['target_text'], predictions)
recall = recall_score(df['target_text'], predictions)

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")

  0%|          | 0/7613 [00:00<?, ?it/s]

  0%|          | 0/952 [00:00<?, ?it/s]

Accuracy: 0.9566530933928806
Precision: 0.9531587057010786
Recall: 0.9455823907062061


In [71]:
model.save_model("bert")
