In [45]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Sanity check: show which files/directories are present under ../input.
print(os.listdir("../input"))

dtype = torch.float
# Use the first CUDA GPU when one is actually usable and fall back to the CPU
# otherwise, so the notebook also runs in CPU-only sessions. Every tensor and
# the model below are created on / moved to this device.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Any results you write to the current directory are saved as output.

['embeddings', 'train.csv', 'sample_submission.csv', 'test.csv']


In [46]:
# Show which pretrained embedding sets are available.
print(os.listdir("../input/embeddings/"))

['paragram_300_sl999', 'wiki-news-300d-1M', 'GoogleNews-vectors-negative300', 'glove.840B.300d']


In [49]:
class LSTMTagger(nn.Module):
    """Single-layer LSTM classifier over one sentence plus hand-crafted features.

    The sentence is fed to the LSTM one word vector at a time (batch size 1).
    The final hidden state is concatenated with `extra_features` and mapped to
    `tagset_size` log-probabilities by a linear layer followed by log-softmax.

    Args:
        embedding_dim: size of each input word vector.
        hidden_dim: size of the LSTM hidden state.
        extra_feature_dim: number of additional per-sentence features.
        tagset_size: number of output classes.
    """

    def __init__(self, embedding_dim, hidden_dim, extra_feature_dim, tagset_size):
        super(LSTMTagger, self).__init__()
        self.hidden_dim = hidden_dim

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim)

        # The linear layer maps [final hidden state ; extra features] to tag space.
        self.hidden2tag = nn.Linear(hidden_dim+extra_feature_dim, tagset_size)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        """Return a fresh zero (h_0, c_0) pair.

        The axes semantics are (num_layers, minibatch_size, hidden_dim). The
        tensors are created on the same device and with the same dtype as the
        model's parameters, so this keeps working after `.cuda()` / `.to(...)`
        without relying on any module-level `device` variable.
        """
        reference = next(self.parameters())
        return (torch.zeros(1, 1, self.hidden_dim,
                            device=reference.device, dtype=reference.dtype),
                torch.zeros(1, 1, self.hidden_dim,
                            device=reference.device, dtype=reference.dtype))

    def forward(self, sentence_vec, extra_features):
        """Score one sentence.

        Args:
            sentence_vec: tensor holding `len(sentence_vec)` word vectors of
                size `embedding_dim`; it is reshaped to (seq_len, 1, embedding_dim).
            extra_features: tensor of shape (1, extra_feature_dim).

        Returns:
            Tensor of shape (1, tagset_size) with log-probabilities.
        """
        # Spell out the embedding size instead of using -1: with zero words,
        # a -1 dimension cannot be inferred and the reshape would raise.
        inputs = sentence_vec.reshape(len(sentence_vec), 1, self.lstm.input_size)

        # The stored state may have been created before the model was moved to
        # another device; `.to` is a no-op when the devices already match.
        hidden = tuple(state.to(inputs.device) for state in self.hidden)

        # A sentence with no known words has nothing to feed the LSTM; in that
        # case classify from the current hidden state and the extra features.
        if inputs.shape[0] > 0:
            _, hidden = self.lstm(inputs, hidden)
        self.hidden = hidden

        final_hidden = self.hidden[0].view(1, -1)
        tag_space = self.hidden2tag(torch.cat((final_hidden, extra_features), dim=1))
        tag_scores = F.log_softmax(tag_space, dim=1)
        return tag_scores

In [50]:
# 300-d word vectors in, 200 hidden units, 16 additional per-question features,
# 2 output classes.
model = LSTMTagger(300, 200, 16, 2)
# Move the parameters to the selected device. Unlike an equality check against
# "cuda:0" this works for any device and is a no-op when already on the CPU.
model.to(device)

# The model outputs log-probabilities, so negative log-likelihood is the loss.
loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

In [51]:
def extract_features_and_target_from_sample(sample, embedding_dim=300, target_device=None):
    """Convert one preprocessed sample into the tensors used for training.

    Args:
        sample: dict with keys 'qfeatures' (itself a dict holding
            'word_vectors', a list of per-word vectors, and
            'additional_features', a flat list of numbers) and 'target'.
        embedding_dim: word-vector size used only when 'word_vectors' is
            empty, where the size cannot be inferred from the data.
        target_device: device to create the tensors on; defaults to the
            module-level `device`.

    Returns:
        Tuple (sentence, extra, target) with shapes (seq_len, 1, embedding
        size), (1, n_features) and (1,). `seq_len` is 0 for a sample without
        word vectors.
    """
    tensor_device = device if target_device is None else target_device

    sentence_vec=sample['qfeatures']['word_vectors']
    extra_features=sample['qfeatures']['additional_features']
    target=sample['target']

    if len(sentence_vec) > 0:
        tensor_sentence_vec=torch.tensor(
            sentence_vec, device=tensor_device, dtype=torch.float32
        ).view(len(sentence_vec),1,-1)
    else:
        # view(0, 1, -1) cannot infer the last dimension of an empty tensor,
        # so build the empty sequence with an explicit embedding size.
        tensor_sentence_vec=torch.zeros(
            (0, 1, embedding_dim), device=tensor_device, dtype=torch.float32)

    tensor_extra_feature=torch.tensor(
        extra_features, device=tensor_device, dtype=torch.float32).view(1,-1)
    tensor_target=torch.tensor([target], device=tensor_device, dtype=torch.long)
    return tensor_sentence_vec,tensor_extra_feature,tensor_target

In [52]:
def train_on_q_set(training_data):
    """Run one SGD update per sample in `training_data`.

    `training_data` maps a question id to a sample dict accepted by
    `extract_features_and_target_from_sample`. The module-level `model`,
    `loss_function` and `optimizer` are used and updated in place. A progress
    line is printed every 1000 samples.
    """
    for step, qid in enumerate(training_data):
        # Gradients accumulate in PyTorch, so reset them for every sample.
        model.zero_grad()

        # Start every sentence from a fresh hidden state so it is detached
        # from the history of the previous sample.
        model.hidden = model.init_hidden()

        # Turn the stored word vectors, extra features and label into tensors.
        sample_tensors = extract_features_and_target_from_sample(training_data[qid])
        sentence_vec, extra_features, targets = sample_tensors

        # Forward pass, loss, backward pass and parameter update.
        tag_scores = model(sentence_vec, extra_features)
        loss = loss_function(tag_scores, targets)
        loss.backward()
        optimizer.step()

        if step % 1000 == 0:
            print('trained upto sample %s' % step)

In [53]:
def train_on_epochs(epoch_num=10):
    """Call `train_on_q_set` on the module-level `train_set` `epoch_num` times."""
    completed = 0
    while completed < epoch_num:
        train_on_q_set(train_set)
        completed += 1

In [54]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:
# Progress marker printed before the (slow) imports of this cell run.
print('start1')

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim
import nltk
import re
from nltk.corpus import stopwords as stp
from textblob import TextBlob
import multiprocessing
from multiprocessing import Process
import json
import urllib
import bz2

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# Show the input directory, the available embedding sets and the files that
# ship with the GoogleNews word2vec embedding.
print(os.listdir("../input"))
print(os.listdir("../input/embeddings"))
print(os.listdir("../input/embeddings/GoogleNews-vectors-negative300"))



# Any results you write to the current directory are saved as output.

def create_directory(directory):
    """Create `directory` (with parents) unless the path already exists.

    Afterwards the contents of "../" are printed as a visual check.
    """
    path_is_missing = not os.path.exists(directory)
    if path_is_missing:
        os.makedirs(directory)
    parent_listing = os.listdir("../")
    print(parent_listing)
# Make sure the directory that receives the compressed feature batches exists,
# then show what is already stored in it.
create_directory("../clean_data/")
print(os.listdir("../clean_data/"))

# English stop words from NLTK, stored as a set for fast membership tests.
stop_words = set(stp.words('english'))
# Characters counted as punctuation (ASCII digits are deliberately included).
# Note the comma after '@': without it Python concatenates the adjacent string
# literals '@' and "=" into the single entry "@=", and neither character is
# recognised on its own.
punctuations= ["\"","(",")","*",",","-","_",".","~","%","^","&","!","#",'@',
               "=","\'","\\","+","/",":","[","]","«","»","،","؛","?","…","$",
               "|","{","}","٫",";",">","<","1","2","3","4","5","6","7","8","9","0"]

def load_data(filename):
    """Read `../input/<filename>` as CSV with pandas' python engine.

    Returns the resulting DataFrame.
    """
    csv_path = '../input/%s' % filename
    return pd.read_csv(csv_path, engine="python")

def load_google_vector():
    """Load the GoogleNews word2vec vectors (binary format) with gensim.

    Returns the object produced by `KeyedVectors.load_word2vec_format`.
    """
    vectors_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'
    return gensim.models.KeyedVectors.load_word2vec_format(vectors_path, binary=True)

def tweet2v(list_words, model):
    """Look up the embedding of every word in `list_words` that `model` knows.

    Words not contained in `model` are skipped. Each vector is converted with
    `.tolist()`, so the result is a list of plain lists (empty when no word is
    known or `list_words` is empty).
    """
    known_words = (word for word in list_words if word in model)
    return [model[word].tolist() for word in known_words]

def tweets2tokens(tweet_text,model):
    """Tokenize a text and flag whether it contains a URL-like token.

    The text is lower-cased, every run of characters that is neither
    whitespace nor a word character (and every underscore) is removed, and the
    remainder is tokenized with `nltk.word_tokenize`.

    Args:
        tweet_text: the raw text.
        model: unused; kept so existing callers keep working.

    Returns:
        Tuple (tokens, url). `tokens` is the full token list. `url` is 1 when
        any token starts with 'http' and 0 otherwise, including when there
        are no tokens at all.

    NOTE(review): an earlier version also built a filtered list (no '@',
    known to `model`, not a stop word, not punctuation) but never returned
    it. The unfiltered tokens are still returned to keep the produced
    features unchanged - confirm whether the filter was meant to apply.
    """
    cleaned_text = re.sub(r'([^\s\w]|_)+','', tweet_text.lower())
    tokens = nltk.word_tokenize(cleaned_text)
    # Look at every token: tracking the flag inside the loop used to keep only
    # the last token's status and left it undefined for empty input.
    url = 1 if any(token.startswith('http') for token in tokens) else 0
    return tokens,url

def tweet_text2features(tweet_text,model):
    """Build the feature dict for one text.

    Returns a dict with
      'word_vectors': output of `tweet2v` for the tokens of the text, and
      'additional_features': a flat list made of, in order, the values from
        `punctuationanalysis`, the negative and positive word counts, the
        capital ratio, the token count, the sentiment score, the values from
        `poscount` and the URL flag.
    """
    tokens, url = tweets2tokens(tweet_text, model)
    sentence_vec = tweet2v(tokens, model)

    features = list(punctuationanalysis(tweet_text))
    features += [
        negativewordcount(tokens),
        positivewordcount(tokens),
        capitalratio(tweet_text),
        contentlength(tokens),
        sentimentscore(tweet_text),
    ]
    features.extend(poscount(tweet_text))
    features.append(url)

    return {'word_vectors': sentence_vec, 'additional_features': features}

def _batch_to_clean_data(q_batch, model):
    """Map every row of `q_batch` to {'qfeatures': ..., 'target': ...} keyed by qid.

    `q_batch` must provide `iterrows()` yielding rows with the keys
    'question_text', 'qid' and 'target'.
    """
    batch_clean_data = {}
    for _, sample in q_batch.iterrows():
        qfeatures = tweet_text2features(sample['question_text'], model)
        batch_clean_data[sample['qid']] = {'qfeatures': qfeatures,
                                           'target': sample['target']}
    return batch_clean_data


def batch_of_items2json_files(q_batch,model,batch_number,run_id):
    """Compute the features of one batch and store them as bz2-compressed JSON.

    The file is written to '../clean_data/<run_id>-<batch_number>.json.tar.bz2',
    the same name `unzip_q_set` reads back. Previously this function only
    built the dict and never wrote it, so a worker process running it left
    nothing behind.

    Returns:
        The feature dict that was written (qid -> sample dict).
    """
    print('starting run:%s batch:%s' % (run_id,batch_number))
    batch_clean_data = _batch_to_clean_data(q_batch, model)

    with open('../clean_data/%s-%s.json.tar.bz2' % (run_id,batch_number), 'wb') as fp:
        s=json.dumps(batch_clean_data)
        fp.write(bz2.compress(s.encode()))

    print('Done batch %s'% batch_number)
    return batch_clean_data


def batch_of_items2features_dict(q_batch,model,batch_number,run_id):
    """Compute the features of one batch and return them without writing a file.

    Same feature extraction as `batch_of_items2json_files`, but the result is
    only returned (qid -> sample dict). Previously this function wrote a file
    and returned None, which made direct training on its result fail.
    """
    print('starting run:%s batch:%s' % (run_id,batch_number))
    batch_clean_data = _batch_to_clean_data(q_batch, model)
    print('Done batch %s'% batch_number)
    return batch_clean_data

#punctuations
def punctuationanalysis(tweet_text):
    """Count selected punctuation characters in `tweet_text`.

    Returns a 5-tuple: number of '?', of '!', of '.', of '*', and the number of
    characters that appear in the module-level `punctuations` collection.
    """
    def occurrences(symbol):
        # Number of characters in the text equal to `symbol`.
        return sum(1 for ch in tweet_text if ch == symbol)

    single_counts = tuple(occurrences(symbol) for symbol in ('?', '!', '.', '*'))
    punctuation_total = sum(1 for ch in tweet_text if ch in punctuations)
    return single_counts + (punctuation_total,)

def negativewordcount(tokens):
    """Return how many words of the fixed negative-word list occur in `tokens`.

    Each listed word counts at most once, no matter how often it appears.
    """
    negativeFeel = ['dick','penis','god']
    return sum(1 for word in negativeFeel if word in tokens)

def positivewordcount(tokens):
    """Return how many words of the positive-word list occur in `tokens`.

    The list is currently empty, so the result is always 0.
    """
    positivewords = []
    return sum(1 for word in positivewords if word in tokens)

def capitalratio(tweet_text):
    """Return the fraction of characters in `tweet_text` that are upper-case.

    An empty text yields 0.0 instead of raising ZeroDivisionError.
    """
    if not tweet_text:
        return 0.0
    upper_count = sum(1 for ch in tweet_text if ch.isupper())
    return upper_count / len(tweet_text)

def contentlength(words):
    """Return the number of items in `words`."""
    return len(words)

def sentimentscore(tweet_text):
    """Return the TextBlob sentiment polarity of `tweet_text`."""
    return TextBlob(tweet_text).sentiment.polarity

def poscount(tweet_text):
    """Count part-of-speech categories in `tweet_text`.

    Punctuation and underscores are stripped, the text is tokenized and tagged
    with `nltk.pos_tag`, and each tag is assigned to at most one category.

    Returns:
        List of five ints in the fixed order
        [Noun, Verb, Adjective, Pronoun, Adverb].

    The tag sets cover the Penn Treebank tags and keep the universal-tagset
    names ('ADJ', 'PRON', 'ADV') that were checked before. Compared to the
    previous version: adverbs ('RB', 'RBR', 'RBS') are now counted (only 'ADV'
    was tested, so the count was always 0 with Penn Treebank tags), 'To' is
    corrected to the actual tag 'TO', and comparative/superlative adjectives
    and possessive pronouns are included.
    """
    # Ordered (category, tags) pairs; the order defines the output order.
    categories = (
        ('Noun', {'NN','NNS','NNP','NNPS'}),
        ('Verb', {'VB','VBP','VBZ','VBN','VBG','VBD','TO'}),
        ('Adjective', {'JJ','JJR','JJS','ADJ'}),
        ('Pronoun', {'PRP','PRP$','PRON'}),
        ('Adverb', {'RB','RBR','RBS','ADV'}),
    )
    counts = {name: 0 for name, _ in categories}

    word_tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', tweet_text))
    for _, tag in nltk.pos_tag(word_tokens):
        for name, tags in categories:
            if tag in tags:
                counts[name] += 1
                break

    # Return a real list (indexable, JSON-serializable) in the documented order.
    return [counts[name] for name, _ in categories]

def store_features_for_data(model,data,run_id,batch_size=10000,start=0,stop=1):
    """Process selected batches of `data` in parallel worker processes.

    `data` is cut into consecutive slices of `batch_size` items. For every
    slice whose index i satisfies start <= i < stop, a `Process` running
    `batch_of_items2json_files(batch, model, i, run_id)` is started. The
    function returns after all started processes have been joined.
    """
    workers = []
    for batch_index, offset in enumerate(range(0, len(data), batch_size)):
        batch = data[offset:offset + batch_size]
        if batch_index < start or batch_index >= stop:
            continue
        worker = Process(target=batch_of_items2json_files,
                         args=(batch, model, batch_index, run_id))
        worker.start()
        workers.append(worker)

    # Wait until every worker has finished.
    for worker in workers:
        worker.join()

def get_features_for_data_and_train_on_every_batch(word2vec_model,data,run_id,batch_size=10000,start=0,stop=1):
    """Extract features batch by batch and train on each batch right away.

    `data` is cut into consecutive slices of `batch_size` items. For every
    slice whose index i satisfies start <= i < stop, the result of
    `batch_of_items2features_dict(batch, word2vec_model, i, run_id)` is passed
    to `train_on_q_set`.
    """
    for batch_index, offset in enumerate(range(0, len(data), batch_size)):
        batch = data[offset:offset + batch_size]
        if not (start <= batch_index < stop):
            continue
        q_set = batch_of_items2features_dict(batch, word2vec_model, batch_index, run_id)
        train_on_q_set(q_set)
        
def load_test_data():
    """Placeholder with no implementation yet; always returns None."""
    return None






start1
['embeddings', 'train.csv', 'sample_submission.csv', 'test.csv']
['paragram_300_sl999', 'wiki-news-300d-1M', 'GoogleNews-vectors-negative300', 'glove.840B.300d']
['GoogleNews-vectors-negative300.bin']
['lib', 'config', 'clean_data', 'input', 'working']
['ali-0.json.tar.bz2']


In [55]:
# Show which compressed feature batches are currently stored.
print(os.listdir("../clean_data/"))

['ali-0.json.tar.bz2']


In [9]:
# Load the GoogleNews word2vec vectors into `model1`; the prints bracket the
# load so its start and end are visible in the cell output.
print('start')
model1 = load_google_vector()
print("load_google_vector loaded!")




start
load_google_vector loaded!
[]


In [12]:
# Read ../input/train.csv into `data` and show its columns and first rows.
data =load_data('train.csv')
print(data.columns)
print(data.head())




Index(['qid', 'question_text', 'target'], dtype='object')
                    qid  ...   target
0  00002165364db923c7e6  ...        0
1  000032939017120e6e44  ...        0
2  0000412ca6e4628ce2cf  ...        0
3  000042bf85aa498cd78e  ...        0
4  0000455dfa3e01eae3af  ...        0

[5 rows x 3 columns]


In [14]:
# Process the training data under run id 'ali'; with the default
# start=0 / stop=1 only the first batch of 10000 rows is handled.
store_features_for_data(model1,data,'ali')

starting run:ali batch:0
Done batch 0


In [15]:
def unzip_q_set(run_id,batch_number):
    """Load one stored feature batch.

    Reads '../clean_data/<run_id>-<batch_number>.json.tar.bz2', bz2-decompresses
    it, prints the type of the decompressed payload and returns the object
    parsed from the JSON text.
    """
    archive_path = '../clean_data/%s-%s.json.tar.bz2' % (run_id,batch_number)
    with open(archive_path,'rb') as archive:
        decompressed = bz2.decompress(archive.read())
    print(type(decompressed))
    return json.loads(decompressed.decode())

In [56]:
# Load batch 0 of run 'ali' back from ../clean_data into `q_set`.
q_set=unzip_q_set('ali',0)

KeyboardInterrupt: 

In [57]:
# Inspect one stored sample: number of word vectors kept for the question and
# the keys of its feature dict.
print(len(q_set['00002165364db923c7e6']['qfeatures']['word_vectors']))
for t in q_set['00002165364db923c7e6']['qfeatures']:
    print(t)

11
word_vectors
additional_features


In [58]:
#train_on_q_set(q_set)
# The helper has no defaults for its first three parameters, so calling it
# without arguments raises TypeError. Pass the loaded word2vec vectors, the
# training DataFrame and the run id explicitly.
get_features_for_data_and_train_on_every_batch(model1, data, 'ali')

trained upto sample 0
trained upto sample 1000
trained upto sample 2000
trained upto sample 3000
trained upto sample 4000
trained upto sample 5000
trained upto sample 6000
trained upto sample 7000
trained upto sample 8000
trained upto sample 9000


0