In [1]:
from keras.layers import Input, Dense, Embedding, Convolution1D, MaxPooling1D, MaxPooling2D, Convolution2D, LSTM
from keras.layers import Reshape, Flatten, Dropout, Concatenate
from keras.layers import Lambda, GlobalAveragePooling1D, Dense, Embedding
from keras.regularizers import l2, l1
from keras import regularizers
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
from keras.models import Sequential
from sklearn.model_selection import train_test_split
import numpy as np
import matplotlib.pyplot as plt
from keras.preprocessing.text import one_hot as oneHOT
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [2]:
import pandas as pd
import numpy as np
df_tweet = pd.read_csv("data/processed_tweet2.csv")
import ast
df_tweet['tweet_pr_st1'] = df_tweet.apply(lambda x: ast.literal_eval(x.tweet_pr_st1),axis =1)
df_tweet['tweet_pr_st2'] = df_tweet.apply(lambda x: ast.literal_eval(x.tweet_pr_st2),axis =1)

df_tweet['subtask_a'] = df_tweet.apply(lambda x: 0 if x.subtask_a=='NOT' else 1,axis = 1)
df_tweet['subtask_b'] = df_tweet.apply(lambda x: 1 if x.subtask_b=='UNT' else 2 if x.subtask_b=='TIN' else 0,axis = 1)
df_tweet['subtask_c'] = df_tweet.apply(lambda x: 2 if x.subtask_c=='IND' else 3 if x.subtask_c=='GRP' else 4 if x.subtask_c =='OTH' else 0,axis = 1)



In [3]:
normalised_corpus = list(df_tweet['tweet_pr_st2'])
labels = list(df_tweet['subtask_a'])

In [4]:
normalised_corpus[:3]

[['ask', 'native', 'american', 'take'],
 ['go',
  'home',
  'drunk',
  'maga',
  'trump',
  'oncoming',
  'fist',
  'united',
  'state',
  'oncoming',
  'fist'],
 ['amazon',
  'investigate',
  'chinese',
  'employee',
  'sell',
  'internal',
  'data',
  'third',
  'party',
  'seller',
  'look',
  'edge',
  'competitive',
  'marketplace',
  'amazon',
  'maga',
  'kag',
  'china',
  'tcot']]

In [5]:
import collections, itertools
word_counter = collections.Counter(list(itertools.chain.from_iterable(normalised_corpus)))

In [6]:
word_ids = list(set(list(itertools.chain.from_iterable(normalised_corpus))))
len(word_ids)

13910

In [62]:
word2idx = {}
word2idx = {token:id+3 for id,token in enumerate(word_ids)}
word2idx['<PAD>'] = 0
word2idx['<START>'] = 1
word2idx['<UNK>'] = 2
word2idx['<UNUSED>'] = 3

# word_index = {k:(v+3) for k,v in word_index.items()}
# word_index["<PAD>"] = 0
# word_index["<START>"] = 1
# word_index["<UNK>"] = 2  
# word_index["<UNUSED>"] = 3

In [63]:
idx2word = {v: k for k,v in word2idx.items()}

In [64]:
sents_as_ids = []
for i in range(len(normalised_corpus)):
    sents_as_ids.append([word2idx[j] for j in normalised_corpus[i]])

In [65]:
normalised_corpus[0:3]

[['ask', 'native', 'american', 'take'],
 ['go',
  'home',
  'drunk',
  'maga',
  'trump',
  'oncoming',
  'fist',
  'united',
  'state',
  'oncoming',
  'fist'],
 ['amazon',
  'investigate',
  'chinese',
  'employee',
  'sell',
  'internal',
  'data',
  'third',
  'party',
  'seller',
  'look',
  'edge',
  'competitive',
  'marketplace',
  'amazon',
  'maga',
  'kag',
  'china',
  'tcot']]

In [66]:
sents_as_ids[0:3]

[[6980, 10805, 5741, 6546],
 [5262, 13760, 2911, 7884, 3089, 8992, 582, 9484, 7373, 8992, 582],
 [2956,
  2959,
  10462,
  6329,
  2507,
  8689,
  13136,
  9394,
  12842,
  9038,
  11515,
  13229,
  1089,
  998,
  2956,
  7884,
  3648,
  6382,
  11126]]

In [67]:
vocab_size = len(word_ids) + 3

In [42]:
# def one_hot(labels):
#     from sklearn.preprocessing import OneHotEncoder
#     encoder = OneHotEncoder()
#     return encoder.fit_transform(np.array(labels).reshape(-1,1)).toarray()

# labels = one_hot(labels)

In case you used a LabelEncoder before this OneHotEncoder to convert the categories to integers, then you can now use the OneHotEncoder directly.


In [7]:
def readGloveFile(gloveFile):
    with open(gloveFile, 'r') as f:
        wordToGlove = {}  
        wordToIndex = {}  
        indexToWord = {}  

        for line in f:
            record = line.strip().split()
            token = record[0] 
            wordToGlove[token] = np.array(record[1:], dtype=np.float64) 
            
        tokens = sorted(wordToGlove.keys())
        for idx, tok in enumerate(tokens):
            kerasIdx = idx + 1  
            wordToIndex[tok] = kerasIdx 
            indexToWord[kerasIdx] = tok 

    return wordToIndex, indexToWord, wordToGlove

In [8]:
def createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, isTrainable):
    vocabLen = len(wordToIndex) + 1  
    embDim = next(iter(wordToGlove.values())).shape[0]  
   
    embeddingMatrix = np.zeros((vocabLen, embDim))  
    for word, index in wordToIndex.items():
        embeddingMatrix[index, :] = wordToGlove[word] 

    embeddingLayer = Embedding(vocabLen, embDim, embeddings_initializer=Constant(embeddingMatrix), trainable=isTrainable)
    return embeddingLayer

In [None]:
train_data,test_data,train_labels,test_labels = train_test_split(sents_as_ids,labels,test_size=0.2,stratify=labels)

X_train_enc = keras.preprocessing.sequence.pad_sequences(train_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=50)
X_test_enc = keras.preprocessing.sequence.pad_sequences(test_data,
                                                        value=word_index["<PAD>"],
                                                        padding='post',
                                                        maxlen=50)


In [10]:
# put the code here
# !pip install chakin
# import chakin
# chakin.download(number=12, save_dir='embeddings/')

# import os
# os.system("unzip 'embeddings/glove.6B.zip' ")
from tensorflow.contrib.keras.api.keras.initializers import Constant
wordToIndex, indexToWord, wordToGlove = readGloveFile('embeddings/glove.6B.300d.txt')
embeddingLayer = createPretrainedEmbeddingLayer(wordToGlove, wordToIndex, isTrainable=True)
# os.system("rm 'embeddings/glove.6B.zip'")

You should consider upgrading via the 'pip install --upgrade pip' command.[0m


In [68]:
# from keras.preprocessing.sequence import pad_sequences
# from sklearn.model_selection import train_test_split
# MAXIMUM_LENGTH = 500
# train_data,test_data,train_labels,test_labels = train_test_split(sents_as_ids,labels,test_size=0.2,stratify=labels)

# preprocessed_train_data = pad_sequences(train_data,maxlen=MAXIMUM_LENGTH)
# processed_test_data = pad_sequences(test_data,maxlen=MAXIMUM_LENGTH)

In [72]:
# print('Length of sample train_data before preprocessing:', len(train_data[1]), type(train_data[1]))

Length of sample train_data before preprocessing: 27


In [71]:
# print('Length of sample train_data after preprocessing:', len(preprocessed_train_data[1]), type(train_data[1]))

Length of sample train_data after preprocessing: 500


In [12]:
model = Sequential()
EMBED_SIZE = 100
model.add(embeddingLayer)
# model.add(Embedding(vocab_size,EMBED_SIZE,input_length=MAXIMUM_LENGTH))
model.add(LSTM(100, activation='tanh'))
model.add(Dense(1,activation='sigmoid',input_shape=(1,)))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# put the code here
# model = Sequential()
# model.add(Embedding(vocab_size, 30, input_length=MAXIMUM_LENGTH))
# model.add(Convolution1D(64,5,activation="relu"))
# model.add(Dropout(0.5))
# model.add(Convolution1D(32,3,activation="relu"))
# model.add(Dropout(0.5))
# model.add(Convolution1D(16,3,activation="sigmoid"))
# model.add(MaxPooling1D(5))
# model.add(Flatten())
# model.add(Dense(train_labels.shape[1],activation='softmax'))
# model.compile(loss='categorical_crossentropy', optimizer='adam',metrics=['accuracy'])
model.summary()



_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, None, 300)         120000300 
_________________________________________________________________
lstm_2 (LSTM)                (None, 100)               160400    
_________________________________________________________________
dense_2 (Dense)              (None, 1)                 101       
Total params: 120,160,801
Trainable params: 120,160,801
Non-trainable params: 0
_________________________________________________________________


In [None]:
model.compile(optimizer='adam',
              loss='binary_crossentropy',
              metrics=['accuracy'])

# X_val = np.array(X_train_enc[:10000])
# partial_X_train = np.array(X_train_enc[10000:])

history = model.fit(train_data,
                    train_labels,
                    epochs=15,
                    batch_size=64,
                    validation_split=0.08,
                    verbose=1)

results = model.evaluate(X_test_enc, y_test)
print(results)