In [2]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here are several helpful packages to load:

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import gensim
import nltk
import re
from nltk.corpus import stopwords as stp
from textblob import TextBlob
import multiprocessing
from multiprocessing import Process
import json

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
# List the contents of the input directory, its embeddings folder and the
# GoogleNews vectors folder, one listing per line.
for input_dir in (
    "../input",
    "../input/embeddings",
    "../input/embeddings/GoogleNews-vectors-negative300",
):
    print(os.listdir(input_dir))



# Any results you write to the current directory are saved as output.

['embeddings', 'README.MD', 'sample_submission.csv', 'test.csv', 'train.csv']
['GoogleNews-vectors-negative300', 'README.MD']
[]


In [3]:
# Inspect the directory that batch_of_items2json_files writes its JSON batches into.
print(os.listdir("../clean_data/"))

['README.MD']


In [4]:
def create_directory(directory):
    """Create `directory` (including missing parents) unless the path already exists.

    Afterwards the contents of "../" (the parent of the working directory) are
    printed.
    """
    already_there = os.path.exists(directory)
    if not already_there:
        os.makedirs(directory)
    parent_listing = os.listdir("../")
    print(parent_listing)

In [5]:
# English stop words from the NLTK corpus, held in a set for fast membership tests.
stop_words = {word for word in stp.words('english')}
# Single characters treated as punctuation by the feature extractors (digits are
# included too). Every entry must be followed by a comma: a missing comma after
# "@" previously merged it with "=" into one "@=" entry via implicit string
# concatenation, so neither character was recognised on its own.
punctuations = ["\"", "(", ")", "*", ",", "-", "_", ".", "~", "%", "^", "&", "!", "#", "@",
                "=", "\'", "\\", "+", "/", ":", "[", "]", "«", "»", "،", "؛", "?", ".", "…", "$",
                "|", "{", "}", "٫", ";", ">", "<", "1", "2", "3", "4", "5", "6", "7", "8", "9", "0"]

def load_data(filename):
    """Read `filename` from the ../input directory into a DataFrame.

    The file is parsed with pandas' "python" CSV engine.
    """
    csv_path = '../input/%s' % filename
    return pd.read_csv(csv_path, engine="python")

def load_google_vector():
    """Load the GoogleNews word2vec vectors (binary format) as gensim KeyedVectors."""
    # NOTE(review): the recorded cell outputs list this folder as empty and show a
    # FileNotFoundError for a '.bin' path -- confirm the file name before relying on it.
    vectors_path = '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin.gz'
    return gensim.models.KeyedVectors.load_word2vec_format(vectors_path, binary=True)

def tweet2v(list_words, model):
    """Return the vectors of all words in `list_words` that are present in `model`.

    Each vector is converted to a plain list with ``.tolist()``. Words missing
    from `model` are skipped, so the result can be shorter than `list_words`.
    """
    if len(list_words) == 0:
        return []
    return [model[token].tolist() for token in list_words if token in model]

def tweets2tokens(tweet_text,model):
    """Tokenize a text and flag whether it contains a URL-like token.

    The text is lower-cased, every run of characters that is neither a word
    character nor whitespace (and every underscore) is removed, and the result
    is split with ``nltk.word_tokenize``.

    Args:
        tweet_text: raw text to tokenize.
        model: kept for interface compatibility; it is not consulted here.

    Returns:
        ``(tokens, url)``: the list of tokens, and ``1`` if any token starts
        with ``'http'``, otherwise ``0``.
    """
    cleaned = re.sub(r'([^\s\w]|_)+','', tweet_text.lower())
    tokens = nltk.word_tokenize(cleaned)
    # The flag used to be reassigned for every token, so it described only the
    # last token and was unbound (UnboundLocalError) when the text produced no
    # tokens at all. It now defaults to 0 and is 1 if any token looks like a URL.
    url = int(any(token.startswith('http') for token in tokens))
    # NOTE: the earlier version also built a stop-word/punctuation-filtered word
    # list but never returned it; callers have always received the raw tokens,
    # and that contract is preserved.
    return tokens,url

def tweet_text2features(tweet_text,model):
    """Build the feature record for one text.

    Returns a dict with two entries:
      'word_vectors'        -- result of tweet2v for the text's tokens
      'additional_features' -- flat list holding, in order: the values returned
                               by punctuationanalysis, the negative and positive
                               word counts, the capital ratio, the token count,
                               the sentiment score, the values returned by
                               poscount, and the URL flag
    """
    tokens, url = tweets2tokens(tweet_text, model)
    sentence_vec = tweet2v(tokens, model)

    features = list(punctuationanalysis(tweet_text))
    features += [
        negativewordcount(tokens),
        positivewordcount(tokens),
        capitalratio(tweet_text),
        contentlength(tokens),
        sentimentscore(tweet_text),
    ]
    features.extend(poscount(tweet_text))
    features.append(url)

    return {'word_vectors': sentence_vec, 'additional_features': features}

def batch_of_items2json_files(q_batch,model,batch_number,run_id):
    """Extract features for every row of `q_batch` and dump them to one JSON file.

    Each row must provide 'question_text', 'qid' and 'target'. The output file
    '../clean_data/<run_id>-<batch_number>.json' maps every qid to a dict with
    the keys 'qfeatures' and 'target'. A progress line is printed before the
    batch starts and another after the file has been written.
    """
    print('starting run:%s batch:%s' % (run_id,batch_number))

    collected = {}
    for _, row in q_batch.iterrows():
        text = row['question_text']
        question_id = row['qid']
        label = row['target']
        collected[question_id] = {
            'qfeatures': tweet_text2features(text, model),
            'target': label,
        }

    out_path = '../clean_data/%s-%s.json' % (run_id,batch_number)
    with open(out_path, 'w') as out_file:
        json.dump(collected, out_file)
        print('Done batch %s'% batch_number)

#punctuations
def punctuationanalysis(tweet_text):
    """Count punctuation characters in `tweet_text`.

    Returns a 5-tuple: the number of '?', '!', '.' and '*' characters, followed
    by the number of characters found in the module-level `punctuations` list.
    """
    question_marks, exclamation_marks, periods, stars = (
        tweet_text.count(mark) for mark in ('?', '!', '.', '*')
    )
    punct_total = sum(1 for ch in tweet_text if ch in punctuations)
    return question_marks, exclamation_marks, periods, stars, punct_total

def negativewordcount(tokens):
    """Return how many of the hard-coded negative words occur in `tokens`.

    Each word contributes at most 1, however often it appears.
    """
    negative_words = ('dick', 'penis', 'god')
    return sum(1 for word in negative_words if word in tokens)

def positivewordcount(tokens):
    """Return how many of the positive words occur in `tokens`.

    The positive-word list is currently empty, so the result is always 0.
    """
    positive_words = ()
    return sum(1 for word in positive_words if word in tokens)

def capitalratio(tweet_text):
    """Return the fraction of characters in `tweet_text` that are uppercase.

    Args:
        tweet_text: the text to inspect.

    Returns:
        A float between 0 and 1. Empty text yields 0.0 rather than raising
        ZeroDivisionError.
    """
    total_chars = len(tweet_text)
    if total_chars == 0:
        # Nothing to measure; avoid dividing by zero.
        return 0.0
    upper_chars = sum(1 for ch in tweet_text if ch.isupper())
    return upper_chars / total_chars

def contentlength(words):
    """Return the number of items in `words`."""
    return len(words)

def sentimentscore(tweet_text):
    """Return the TextBlob sentiment polarity of `tweet_text`."""
    return TextBlob(tweet_text).sentiment.polarity

def poscount(tweet_text):
    """Count coarse part-of-speech categories in `tweet_text`.

    Punctuation and underscores are stripped, the remaining text is tokenized
    with ``nltk.word_tokenize`` and tagged with ``nltk.pos_tag``; each tag is
    then mapped onto one of five categories. Tags outside these categories are
    ignored.

    Args:
        tweet_text: the text to analyse.

    Returns:
        A list of five ints in the order Noun, Verb, Adjective, Pronoun, Adverb.
    """
    # nltk.pos_tag emits Penn Treebank tags by default, so adverbs arrive as
    # RB/RBR/RBS (checking only 'ADV' left the Adverb count at zero), and
    # comparative/superlative adjectives as JJR/JJS. The universal-tagset names
    # that were checked before (ADJ, PRON, ADV) are still accepted.
    # The former 'To' entry in the verb set is omitted: tags are upper-case, so
    # it could never match, and Verb counts are unchanged without it.
    tag_groups = (
        ('Noun', ('NN', 'NNS', 'NNP', 'NNPS')),
        ('Verb', ('VB', 'VBP', 'VBZ', 'VBN', 'VBG', 'VBD')),
        ('Adjective', ('JJ', 'JJR', 'JJS', 'ADJ')),
        ('Pronoun', ('PRP', 'PRP$', 'PRON')),
        ('Adverb', ('RB', 'RBR', 'RBS', 'ADV')),
    )
    category_of = {tag: category for category, tags in tag_groups for tag in tags}
    counts = {category: 0 for category, _ in tag_groups}

    word_tokens = nltk.word_tokenize(re.sub(r'([^\s\w]|_)+', '', tweet_text))
    for _, tag in nltk.pos_tag(word_tokens):
        category = category_of.get(tag)
        if category is not None:
            counts[category] += 1

    # Return a concrete list (same order as before) so the result can be
    # indexed and serialized, not only iterated.
    return list(counts.values())

In [6]:
def store_features_for_data(model,data,run_id,batch_size=3,max_batches=4):
    """Split `data` into consecutive batches and write each batch to a JSON file.

    Every batch is handed to batch_of_items2json_files together with its
    zero-based batch number.

    Args:
        model: word-vector model forwarded to batch_of_items2json_files.
        data: sliceable collection supporting len() (e.g. a DataFrame).
        run_id: label forwarded to batch_of_items2json_files.
        batch_size: number of rows per batch, must be >= 1. Defaults to 3, the
            value that used to be hard-coded.
        max_batches: stop after this many batches; ``None`` processes all of
            `data`. Defaults to 4, matching the former loop, which stopped
            after its fourth batch.

    Raises:
        ValueError: if `batch_size` is smaller than 1 or `max_batches` is negative.
    """
    from itertools import islice

    if batch_size < 1:
        raise ValueError('batch_size must be >= 1, got %r' % (batch_size,))

    batches = (
        data[start:start + batch_size]
        for start in range(0, len(data), batch_size)
    )
    # islice stops pulling batches once the limit is reached (None = no limit).
    for batch_number, batch in enumerate(islice(batches, max_batches)):
        batch_of_items2json_files(batch, model, batch_number, run_id)

def load_test_data():
    """Placeholder for loading the test set; not implemented yet, returns None."""
    return None


In [7]:

# Load the training file and print its columns, first rows and numeric summary.
data = load_data('train.csv')
print(data.columns, data.head(), data.describe(), sep='\n')


Index(['qid', 'question_text', 'target'], dtype='object')
                    qid                                      question_text  \
0  00002165364db923c7e6  How did Quebec nationalists see their province...   
1  000032939017120e6e44  Do you have an adopted dog, how would you enco...   
2  0000412ca6e4628ce2cf  Why does velocity affect time? Does velocity a...   
3  000042bf85aa498cd78e  How did Otto von Guericke used the Magdeburg h...   
4  0000455dfa3e01eae3af  Can I convert montra helicon D to a mountain b...   

   target  
0       0  
1       0  
2       0  
3       0  
4       0  
             target
count  1.306122e+06
mean   6.187018e-02
std    2.409197e-01
min    0.000000e+00
25%    0.000000e+00
50%    0.000000e+00
75%    0.000000e+00
max    1.000000e+00


In [8]:
# Load the word2vec model used by the feature extractors.
# NOTE(review): the recorded output of this cell is a FileNotFoundError for a
# '.bin' path, while load_google_vector points at '.bin.gz' -- the output looks
# stale; re-run to confirm.
model = load_google_vector()
print("load_google_vector loaded!")

FileNotFoundError: [Errno 2] No such file or directory: '../input/embeddings/GoogleNews-vectors-negative300/GoogleNews-vectors-negative300.bin'

In [33]:
# Loads the word2vec model again (same code as the preceding load cell).
# NOTE(review): the output recorded for this cell (small a/b/c/d data frames)
# does not correspond to this code -- clear the output and re-run.
model = load_google_vector()
print("load_google_vector loaded!")

          a         b         c         d
0  0.436794  0.669541  0.013012  0.998230
1  0.327544  0.505453  0.340200  0.228876
2  0.203800  0.263133  0.200873  0.115157
3  0.725138  0.428829  0.902273  0.371280
4  0.217536  0.738412  0.633183  0.332921
          a         b         c         d
5  0.916964  0.645652  0.975994  0.335463
6  0.754606  0.527807  0.015716  0.034400
7  0.985832  0.425420  0.243878  0.503374
8  0.190398  0.927735  0.711029  0.797765
9  0.148144  0.063372  0.463337  0.549564
           a         b         c         d
10  0.299028  0.522398  0.245013  0.654803
11  0.787374  0.998540  0.688639  0.772203
12  0.169016  0.712681  0.721149  0.522241
13  0.553292  0.819158  0.324067  0.692155
14  0.599879  0.475290  0.259008  0.904126
           a        b         c         d
15  0.056268  0.80973  0.948875  0.435467


In [None]:
# Write feature batches for the loaded training data, using 'ali' as the run id.
store_features_for_data(model,data,'ali')