# Detecting whether a tweet is about a disaster or not

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
pd.set_option('display.max_colwidth', None)

In [2]:
train = pd.read_csv('train.csv')
test = pd.read_csv('test.csv')

In [3]:
import nltk
from nltk.stem import PorterStemmer
from nltk.corpus import stopwords

## Removing Punctuation

In [4]:
import string
np.array(string.punctuation)

array('!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~', dtype='<U32')

In [5]:
def remove_punc(text):
    text = text.lower()
    tokens = nltk.word_tokenize(text)
    for i in string.punctuation:
        if i in tokens:
            tokens = list(filter(lambda x:x!=i,tokens))
    return tokens        

## Remove hyperlinks in the text

In [6]:
def remove_links(tokens):
    indices = [i for i, x in enumerate(tokens) if x == "http"]
    indices1 = [i for i, x in enumerate(tokens) if x == "https"]
    a = []
    for i in indices:
        a.append(tokens[i+1])
    for i in indices1:
        a.append(tokens[i+1])
    tokens = list(filter(lambda x:(x!='http') and (x!='https')and (x not in a) ,tokens))
    return tokens    

## Stemming the text

In [7]:
porter = PorterStemmer()
def stem_tokens(tokens, porter):
    stemmed = []
    for item in tokens:
        stemmed.append(porter.stem(item))
    return stemmed

## Removing Digits

In [8]:
def remove_digits(tokens):
    for i in tokens:
        if i.isdigit()==True:
            tokens.remove(i)
    return tokens        

## Removing Stopwords

In [9]:
stopwords_json = {"en":["a","a's","able","about","above","according","accordingly","across","actually","after","afterwards","again","against","ain't","all","allow","allows","almost","alone","along","already","also","although","always","am","among","amongst","an","and","another","any","anybody","anyhow","anyone","anything","anyway","anyways","anywhere","apart","appear","appreciate","appropriate","are","aren't","around","as","aside","ask","asking","associated","at","available","away","awfully","b","be","became","because","become","becomes","becoming","been","before","beforehand","behind","being","believe","below","beside","besides","best","better","between","beyond","both","brief","but","by","c","c'mon","c's","came","can","can't","cannot","cant","cause","causes","certain","certainly","changes","clearly","co","com","come","comes","concerning","consequently","consider","considering","contain","containing","contains","corresponding","could","couldn't","course","currently","d","definitely","described","despite","did","didn't","different","do","does","doesn't","doing","don't","done","down","downwards","during","e","each","edu","eg","eight","either","else","elsewhere","enough","entirely","especially","et","etc","even","ever","every","everybody","everyone","everything","everywhere","ex","exactly","example","except","f","far","few","fifth","first","five","followed","following","follows","for","former","formerly","forth","four","from","further","furthermore","g","get","gets","getting","given","gives","go","goes","going","gone","got","gotten","greetings","h","had","hadn't","happens","hardly","has","hasn't","have","haven't","having","he","he's","hello","help","hence","her","here","here's","hereafter","hereby","herein","hereupon","hers","herself","hi","him","himself","his","hither","hopefully","how","howbeit","however","i","i'd","i'll","i'm","i've","ie","if","ignored","immediate","in","inasmuch","inc","indeed","indicate","indicated","indicates","inner","insofar","instead","into","inward","is","isn't","it","it'd","it'll","it's","its","itself","j","just","k","keep","keeps","kept","know","known","knows","l","last","lately","later","latter","latterly","least","less","lest","let","let's","like","liked","likely","little","look","looking","looks","ltd","m","mainly","many","may","maybe","me","mean","meanwhile","merely","might","more","moreover","most","mostly","much","must","my","myself","n","name","namely","nd","near","nearly","necessary","need","needs","neither","never","nevertheless","new","next","nine","no","nobody","non","none","noone","nor","normally","not","nothing","novel","now","nowhere","o","obviously","of","off","often","oh","ok","okay","old","on","once","one","ones","only","onto","or","other","others","otherwise","ought","our","ours","ourselves","out","outside","over","overall","own","p","particular","particularly","per","perhaps","placed","please","plus","possible","presumably","probably","provides","q","que","quite","qv","r","rather","rd","re","really","reasonably","regarding","regardless","regards","relatively","respectively","right","s","said","same","saw","say","saying","says","second","secondly","see","seeing","seem","seemed","seeming","seems","seen","self","selves","sensible","sent","serious","seriously","seven","several","shall","she","should","shouldn't","since","six","so","some","somebody","somehow","someone","something","sometime","sometimes","somewhat","somewhere","soon","sorry","specified","specify","specifying","still","sub","such","sup","sure","t","t's","take","taken","tell","tends","th","than","thank","thanks","thanx","that","that's","thats","the","their","theirs","them","themselves","then","thence","there","there's","thereafter","thereby","therefore","therein","theres","thereupon","these","they","they'd","they'll","they're","they've","think","third","this","thorough","thoroughly","those","though","three","through","throughout","thru","thus","to","together","too","took","toward","towards","tried","tries","truly","try","trying","twice","two","u","un","under","unfortunately","unless","unlikely","until","unto","up","upon","us","use","used","useful","uses","using","usually","uucp","v","value","various","very","via","viz","vs","w","want","wants","was","wasn't","way","we","we'd","we'll","we're","we've","welcome","well","went","were","weren't","what","what's","whatever","when","whence","whenever","where","where's","whereafter","whereas","whereby","wherein","whereupon","wherever","whether","which","while","whither","who","who's","whoever","whole","whom","whose","why","will","willing","wish","with","within","without","won't","wonder","would","wouldn't","x","y","yes","yet","you","you'd","you'll","you're","you've","your","yours","yourself","yourselves","z","zero"]}
stopwords_json_en = set(stopwords_json['en'])
stopwords_nltk_en = set(stopwords.words('english'))
# Combine the stopwords. Its a lot longer so I'm not printing it out...
stoplist_combined = set.union(stopwords_json_en, stopwords_nltk_en)

In [10]:
def remove_stopwords(tokens):
    for i in tokens:
        if i in stoplist_combined:
            tokens.remove(i)
    return tokens        

Combining all above five functions into one tokenizer function

In [11]:
def tokenizer(text):
    a = remove_stopwords(remove_digits(remove_links(remove_punc(text))))
    #b  = stem_tokens(a,porter)
    return a

## TF-IDF tokenizer

In [12]:
from sklearn.feature_extraction.text import TfidfVectorizer

vectorizer = TfidfVectorizer(analyzer=tokenizer,ngram_range=(1,1)
                            ,min_df=0)
#matrix = vectorizer.fit_transform(list(train['text']))
from sklearn.model_selection import train_test_split
x,y = train_test_split(train,test_size=0.2)

x_train = vectorizer.fit_transform(list(x['text']))
y_train = x['target']

x_test = vectorizer.transform(list(y['text']))
y_test = y['target']

from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB() 

clf.fit(x_train,y_train) 

predictions_valid = clf.predict(x_test)
from sklearn.metrics import accuracy_score
print('Pizza reception accuracy = {}'.format(
        accuracy_score(predictions_valid, y_test) * 100)
     )


Pizza reception accuracy = 80.03939592908733


## Word2Vec embeddings

In [13]:
from gensim.models import Word2Vec, KeyedVectors
a  =train['text'].apply(tokenizer)
b = test['text'].apply(tokenizer)
df = pd.concat([a,b])
corpus = np.array(df)

model = Word2Vec(sentences=corpus,min_count=1,size = 100,window=5)

In [14]:
from keras.preprocessing.text import Tokenizer
tokenizer_obj = Tokenizer()
tokenizer_obj.fit_on_texts(corpus)

word_ind = tokenizer_obj.word_index

matrix  = pd.DataFrame(np.zeros((len(word_ind)+1,100)))
for word,i in word_ind.items():
    matrix.loc[i,:] = model[word]
    
matrix = matrix.drop([0],axis=0)
matrix = np.array(matrix)
max_length = max([len(i) for i in corpus])

Using TensorFlow backend.
  if __name__ == '__main__':


In [15]:
from keras.models import Sequential
from keras.layers import Dense,Embedding,LSTM,GRU
from keras.layers.embeddings import Embedding
from keras.initializers import Constant

model = Sequential()
embedding_layer = Embedding(len(word_ind),100,embeddings_initializer = Constant(matrix),input_length =max_length,trainable=False)
model.add(embedding_layer)
model.add(GRU(units=32,dropout=0.2,recurrent_dropout=0.2))
model.add(Dense(1,activation='sigmoid'))

model.compile(optimizer='adam',loss='binary_crossentropy',metrics = ['accuracy'])

In [16]:
model.summary()

Model: "sequential_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 24, 100)           2312400   
_________________________________________________________________
gru_1 (GRU)                  (None, 32)                12768     
_________________________________________________________________
dense_1 (Dense)              (None, 1)                 33        
Total params: 2,325,201
Trainable params: 12,801
Non-trainable params: 2,312,400
_________________________________________________________________


In [17]:
from keras.preprocessing.sequence import pad_sequences
train_tokens = tokenizer_obj.texts_to_sequences(train['text'])
x = pad_sequences(train_tokens,maxlen=max_length,padding='post')
y = train['target']

In [18]:
from sklearn.model_selection import train_test_split
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2)

In [19]:
model.fit(x_train,y_train,batch_size=32,epochs=25,validation_data =(x_test,y_test),verbose=2)

Train on 6090 samples, validate on 1523 samples
Epoch 1/25
 - 10s - loss: 0.7028 - accuracy: 0.5509 - val_loss: 0.6849 - val_accuracy: 0.5601
Epoch 2/25
 - 6s - loss: 0.6832 - accuracy: 0.5522 - val_loss: 0.6854 - val_accuracy: 0.5601
Epoch 3/25
 - 6s - loss: 0.6762 - accuracy: 0.5598 - val_loss: 0.6848 - val_accuracy: 0.5601
Epoch 4/25
 - 6s - loss: 0.6735 - accuracy: 0.5647 - val_loss: 0.6830 - val_accuracy: 0.5601
Epoch 5/25
 - 6s - loss: 0.6741 - accuracy: 0.5639 - val_loss: 0.6827 - val_accuracy: 0.5601
Epoch 6/25
 - 7s - loss: 0.6729 - accuracy: 0.5681 - val_loss: 0.6837 - val_accuracy: 0.5601
Epoch 7/25
 - 6s - loss: 0.6718 - accuracy: 0.5695 - val_loss: 0.6840 - val_accuracy: 0.5601
Epoch 8/25
 - 7s - loss: 0.6694 - accuracy: 0.5721 - val_loss: 0.6871 - val_accuracy: 0.5489
Epoch 9/25
 - 7s - loss: 0.6677 - accuracy: 0.5719 - val_loss: 0.6865 - val_accuracy: 0.5594
Epoch 10/25
 - 7s - loss: 0.6663 - accuracy: 0.5750 - val_loss: 0.6831 - val_accuracy: 0.5601
Epoch 11/25
 - 7s - 

<keras.callbacks.callbacks.History at 0x2513dc7a248>

In [21]:
test1 = vectorizer.transform(list(test['text']))
y_pred = clf.predict(test1)
samp = pd.read_csv('sample_submission.csv')
samp['target'] = y_pred
samp.to_csv('submission.csv',index=False)