### Building a machine learning model that predicts which Tweets are about real disasters and which ones aren’t. A dataset of 10,000 hand-classified tweets is available.

In [14]:
import sys
# Display the Python version of the kernel this notebook is running on.
sys.version_info

## Note: tensorflow-text had to be installed from a source build; the pip install command did not work.

sys.version_info(major=3, minor=11, micro=6, releaselevel='final', serial=0)

In [15]:
!pip install keras-core --upgrade
!pip install -q keras-nlp --upgrade



In [16]:
import os

# Tell Keras to run on TensorFlow. This cell runs before the cell that imports
# keras_core, so the variable is already in place when that import happens.
os.environ.update(KERAS_BACKEND="tensorflow")

In [17]:
import numpy as np
import pandas as pd
import tensorflow as tf
import keras_core as keras
import keras_nlp
from sklearn.metrics import ConfusionMatrixDisplay, confusion_matrix
from sklearn.model_selection import train_test_split

import seaborn as sns
import matplotlib.pyplot as plt

# Record the library versions used for this run.
print(f"TensorFlow version: {tf.__version__}")
print(f"KerasNLP version: {keras_nlp.__version__}")

TensorFlow version: 2.15.0
KerasNLP version: 0.6.4


In [18]:
# Read the training and test CSV files from the current working directory.
df_train = pd.read_csv("train.csv")
df_test = pd.read_csv("test.csv")

# Report each frame's size as (rows,columns).
train_rows, train_cols = df_train.shape
test_rows, test_cols = df_test.shape
print(f"Training set shape : ({train_rows},{train_cols})")
print(f"Testing set shape : ({test_rows},{test_cols})")

Training set shape : (7613,5)
Testing set shape : (3263,4)


In [19]:
# Preview the first rows of the training data.
df_train.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [20]:
# Column labels of the training set.
df_train.columns

Index(['id', 'keyword', 'location', 'text', 'target'], dtype='object')

In [21]:
# Column labels of the test set, for comparison with the training set.
df_test.columns

Index(['id', 'keyword', 'location', 'text'], dtype='object')

## Exploring the dataset

In [22]:
# Per-column non-null counts and dtypes of the training set.
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 7613 entries, 0 to 7612
Data columns (total 5 columns):
 #   Column    Non-Null Count  Dtype 
---  ------    --------------  ----- 
 0   id        7613 non-null   int64 
 1   keyword   7552 non-null   object
 2   location  5080 non-null   object
 3   text      7613 non-null   object
 4   target    7613 non-null   int64 
dtypes: int64(2), object(3)
memory usage: 297.5+ KB


#### Checking for missing values 

In [23]:
def summary(df):
    """Build a per-column overview of dtypes and missing values.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to inspect.

    Returns
    -------
    pandas.DataFrame
        One row per column of ``df`` with three columns:
        ``dtypes`` (the column dtype), ``missing#`` (number of NA entries)
        and ``missing%`` (NA entries divided by ``len(df)``; note this is a
        fraction between 0 and 1, not a value scaled to 100).
    """
    # Count NA entries once and reuse the result for both derived columns.
    null_counts = df.isna().sum()
    return df.dtypes.to_frame(name="dtypes").assign(
        **{"missing#": null_counts, "missing%": null_counts / len(df)}
    )


In [24]:
# Missing-value report for the training set.
summary(df_train)

Unnamed: 0,dtypes,missing#,missing%
id,int64,0,0.0
keyword,object,61,0.008013
location,object,2533,0.33272
text,object,0,0.0
target,int64,0,0.0


Missing keyword/location values are not an issue, as long as we have text and target!

In [31]:
# Unlabelled tweets to predict on later.
X_test = df_test["text"]

# Model input is the tweet text; the label is the `target` column.
X = df_train["text"]
y = df_train["target"]

# Hold out 20% of the labelled tweets for validation; the fixed random_state
# keeps the split identical between runs.
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [32]:
# Seed the Python, NumPy and backend random number generators before the model
# is built, so that repeated runs of the notebook start from the same random
# state. 42 matches the random_state used for the train/validation split.
keras.utils.set_random_seed(42)

# Name of the pretrained DistilBERT preset to load.
preset = "distil_bert_base_en_uncased"

# Preprocessor built from the same preset, using a shorter sequence length.
preprocessor = keras_nlp.models.DistilBertPreprocessor.from_preset(
    preset,
    sequence_length=160,
    name="preprocessor_4_tweets",
)

# Pretrained classifier with a two-class output, fed by the preprocessor above.
classifier = keras_nlp.models.DistilBertClassifier.from_preset(
    preset,
    preprocessor=preprocessor,
    num_classes=2,
)

# Print the layer/parameter overview of the assembled model.
classifier.summary()



In [None]:
# Configure training: sparse categorical cross-entropy on raw logits, Adam with
# a learning rate of 1e-6, and accuracy as the reported metric.
# NOTE(review): the loss is taken from keras_core while the optimizer is taken
# from tf.keras — confirm both are compatible with the Keras implementation
# that keras_nlp is actually running on.
classifier.compile(
    optimizer=tf.keras.optimizers.legacy.Adam(1e-6),
    loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
    metrics=["accuracy"],
)

# Fine-tune for two epochs, evaluating on the held-out validation split.
history = classifier.fit(
    X_train,
    y_train,
    validation_data=(X_val, y_val),
    epochs=2,
    batch_size=32,
)

Epoch 1/2
 17/191 [=>............................] - ETA: 14:01 - loss: 0.6967 - accuracy: 0.4430