In [1]:
import pandas as pd
import numpy as np
import re
import string
import nltk
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras import models
from keras.layers import Embedding, Dense, Conv1D, MaxPooling1D, Flatten

In [2]:
# Load the tweet dataset from the CSV in the working directory and preview it.
# Note: the column names in this file carry a leading space (e.g. " Tweet Text").
data = pd.read_csv("combined_tweets.csv")
data.head()

Unnamed: 0,Tweet ID,Tweet Text,Information Source,Information Type,Informativeness
0,242883454050648064,"#earthquake M 3.3, Virgin Islands region: Sept...",Not labeled,Not labeled,Not related
1,242887379944366080,@EarthquakeTest update your #earthquake s more,Not labeled,Not labeled,Not related
2,242919634125328384,"RT @RedazioneWebAL: #Terremoto, Costi (Pd): ta...",Not labeled,Not labeled,Not related
3,242920737223106561,"#Earthquake M 2.6, Southern Alaska http://t.co...",Not labeled,Not labeled,Not related
4,242936558158757889,５年６ヶ月長期保存可能なえいようかん5本。http://t.co/ZlSVctfi #eqj...,Not labeled,Not labeled,Not applicable


In [3]:
# Keep only the tweet text and its informativeness label (the column names
# carry a leading space). The explicit .copy() makes `data` an independent
# frame, so assigning to its columns in later cells does not trigger pandas'
# SettingWithCopyWarning.
data = data[[" Tweet Text", " Informativeness"]].copy()
data.head()

Unnamed: 0,Tweet Text,Informativeness
0,"#earthquake M 3.3, Virgin Islands region: Sept...",Not related
1,@EarthquakeTest update your #earthquake s more,Not related
2,"RT @RedazioneWebAL: #Terremoto, Costi (Pd): ta...",Not related
3,"#Earthquake M 2.6, Southern Alaska http://t.co...",Not related
4,５年６ヶ月長期保存可能なえいようかん5本。http://t.co/ZlSVctfi #eqj...,Not applicable


In [4]:
def clean_text(text):
  """
  Normalise a raw tweet for tokenisation.

  Steps, in order: replace URLs with a space, replace ASCII punctuation with
  spaces, lower-case, delete digits, then collapse every run of whitespace
  into a single space and strip both ends.

  Parameters
  ----------
  text : str
      Raw tweet text. ``None``/NaN (missing values) are treated as an empty
      string; any other non-string value is converted with ``str()``.

  Returns
  -------
  str
      The cleaned text, with no leading/trailing or repeated whitespace.
  """
  if not isinstance(text, str):
    # `text != text` is only true for NaN, which pandas uses for missing cells.
    if text is None or text != text:
      return ""
    text = str(text)
  # Remove links (scheme://host/path...). Raw string avoids invalid-escape warnings.
  text = re.sub(r"\w+:\/{2}[\d\w-]+(\.[\d\w-]+)*(?:(?:\/[^\s/]*))*", " ", text)
  # Replace every ASCII punctuation character with a space.
  text = text.translate(str.maketrans(string.punctuation, ' '*len(string.punctuation)))
  # Lower-case.
  text = text.lower()
  # Remove digits.
  text = re.sub(r'\d+', '', text)
  # Collapse whitespace last, so gaps left by the steps above are cleaned up too.
  text = re.sub(r'\s+', ' ', text).strip()
  return text

In [5]:
# Clean every tweet; the function is passed directly, no lambda wrapper needed.
data[" Tweet Text"] = data[" Tweet Text"].apply(clean_text)
data.head()

Unnamed: 0,Tweet Text,Informativeness
0,earthquake m virgin islands region septem...,Not related
1,earthquaketest update your earthquake s more,Not related
2,rt redazionewebal terremoto costi pd ta...,Not related
3,earthquake m southern alaska,Not related
4,年ヶ月長期保存可能なえいようかん本。 eqjp quake saigai earth...,Not applicable


In [6]:
# Show how many tweets fall into each raw informativeness label.
data[" Informativeness"].value_counts()

Not related                      3586
Related and informative          2547
Related - but not informative    1210
Not applicable                    116
Name:  Informativeness, dtype: int64

In [7]:
# Binarise the label: 1 for "Related and informative", 0 for every other value.
data[" Informativeness"] = (
    data[" Informativeness"].eq("Related and informative").astype("int64")
)
data.head()

Unnamed: 0,Tweet Text,Informativeness
0,earthquake m virgin islands region septem...,0
1,earthquaketest update your earthquake s more,0
2,rt redazionewebal terremoto costi pd ta...,0
3,earthquake m southern alaska,0
4,年ヶ月長期保存可能なえいようかん本。 eqjp quake saigai earth...,0


In [8]:
# Split the data into train and test sets (20% held out, fixed seed).
train, test = train_test_split(data, random_state=42, test_size=0.2)
X_train, y_train = train[" Tweet Text"], train[" Informativeness"]
X_test, y_test = test[" Tweet Text"], test[" Informativeness"]

In [9]:
NB_WORDS = 10000  # number of words kept in the tokenizer dictionary
# Fit the vocabulary on the training texts only, then encode both splits as
# sequences of word indices.
# NOTE(review): the model loaded further down presumably expects this exact
# vocabulary -- confirm it was trained with the same split and settings.
tk = Tokenizer(split=" ", num_words=NB_WORDS)
tk.fit_on_texts(X_train)
X_train_seq, X_test_seq = (
    tk.texts_to_sequences(texts) for texts in (X_train, X_test)
)

In [10]:
MAX_LEN = 30  # maximum number of words in each sequence
# Bring every encoded tweet to exactly MAX_LEN entries.
X_train_seq_trunc, X_test_seq_trunc = (
    pad_sequences(seqs, maxlen=MAX_LEN) for seqs in (X_train_seq, X_test_seq)
)

In [11]:
# Split the training data further into train and validation sets (10% held out).
(X_train_emb, X_valid_emb,
 y_train_emb, y_valid_emb) = train_test_split(
    X_train_seq_trunc, y_train, random_state=42, test_size=0.1
)

LOAD MODEL

In [14]:
# Rebuild the saved model: architecture from model.json, weights from model.h5.
path = ""  # prefix prepended to both file names; "" means the working directory
# The context manager closes the file handle even if read() raises.
with open(path+'model.json', 'r') as json_file:
  loaded_model_json = json_file.read()
loaded_model = models.model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights(path+"model.h5")
print("Loaded model from disk")

Loaded model from disk


In [15]:
# `predict_classes` is deprecated (see the warning it prints) and is removed in
# newer TensorFlow/Keras releases, so derive the classes from `predict` with
# the same rule it used: argmax for multi-unit outputs, a 0.5 threshold for a
# single-unit output.
y_proba = loaded_model.predict(X_test_seq_trunc)
if y_proba.shape[-1] > 1:
  y_preds = y_proba.argmax(axis=-1)
else:
  y_preds = (y_proba > 0.5).astype("int32")
print(classification_report(y_test, y_preds))

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).
              precision    recall  f1-score   support

           0       0.93      0.93      0.93       972
           1       0.87      0.87      0.87       521

    accuracy                           0.91      1493
   macro avg       0.90      0.90      0.90      1493
weighted avg       0.91      0.91      0.91      1493

