In [1]:
import en_core_web_sm
spacy_nlp = en_core_web_sm.load()

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)

# Load the train and test splits; both are read for their `comment_text`
# column by the annotation loops further down.
train_df, test_df = (
    pd.read_csv(csv_path)
    for csv_path in ("../input/train.csv", "../input/test.csv")
)

# change ne to tag
def get_spacy_text(s, nlp=None):
    """Return the POS, tag and dependency label sequences of ``s`` as strings.

    The text is run through the pipeline once, and each token's ``pos_``,
    ``tag_`` and ``dep_`` attribute is appended to its own string. Every
    label is preceded by a single space, so a non-empty result starts with
    ``' '`` (e.g. ``' DET NOUN'``).

    Parameters
    ----------
    s : str
        Text to analyse. ``None`` or a float NaN (how pandas represents a
        missing CSV cell) is treated as text with no tokens.
    nlp : callable, optional
        Callable mapping a string to an iterable of tokens exposing
        ``pos_``, ``tag_`` and ``dep_``. Defaults to the module-level
        ``spacy_nlp``.

    Returns
    -------
    tuple of (str, str, str)
        ``(pos, tag, dep)``; three empty strings when there are no tokens.
    """
    # A missing cell cannot be parsed by the pipeline; emit empty features
    # for it instead of aborting a long-running annotation loop.
    if s is None or (isinstance(s, float) and s != s):
        return '', '', ''

    if nlp is None:
        nlp = spacy_nlp

    # Materialise the tokens once so the three passes below reuse one parse.
    tokens = list(nlp(s))

    # str.join avoids re-copying the growing string on every token.
    pos = ''.join(' ' + token.pos_ for token in tokens)
    tag = ''.join(' ' + token.tag_ for token in tokens)
    dep = ''.join(' ' + token.dep_ for token in tokens)

    return pos, tag, dep

import time


def _add_spacy_columns(frame, text_col="comment_text"):
    """Annotate ``frame[text_col]`` and store the results on ``frame``.

    Calls ``get_spacy_text`` on every value of ``text_col`` and writes the
    three returned strings to the new columns ``pos_txt``, ``tag_txt`` and
    ``dep_txt``. ``frame`` is modified in place; nothing is returned.
    """
    poss, tags, deps = [], [], []
    for text in frame[text_col].values:
        pos, tag, dep = get_spacy_text(text)
        poss.append(pos)
        tags.append(tag)
        deps.append(dep)
    frame['pos_txt'], frame['tag_txt'], frame['dep_txt'] = poss, tags, deps


# Same annotation for both splits; the elapsed seconds are printed after each
# one because this is the slow step of the notebook.
for _split_name, _split_df in (('train', train_df), ('test', test_df)):
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # system clock adjustments during a long run.
    start_t = time.perf_counter()
    _add_spacy_columns(_split_df)
    print(_split_name + ' done', time.perf_counter() - start_t)

KeyboardInterrupt: 

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
def _fit_transform_both(vectorizer, column):
    """Fit ``vectorizer`` on train+test ``column`` and vectorize both splits.

    The vocabulary is fitted on the concatenation of the train and test
    documents, so both returned matrices have the same columns.

    Parameters
    ----------
    vectorizer : CountVectorizer or TfidfVectorizer
        Unfitted vectorizer; it is fitted in place.
    column : str
        Name of the text column present in both ``train_df`` and ``test_df``.

    Returns
    -------
    tuple
        ``(train_matrix, test_matrix)`` as dense arrays (``.toarray()``).
    """
    train_docs = train_df[column].values.tolist()
    test_docs = test_df[column].values.tolist()
    vectorizer.fit(train_docs + test_docs)
    return (vectorizer.transform(train_docs).toarray(),
            vectorizer.transform(test_docs).toarray())


# NOTE(review): every vectorizer below uses the default `token_pattern`
# (r"(?u)\b\w\w+\b"), which keeps only runs of two or more word characters.
# Single-character or symbolic labels (e.g. 'X', '.', the '$' in 'PRP$') are
# therefore dropped or truncated -- confirm this is intended.

# counts on tag_txt (token.tag_), unigrams
c_vec3 = CountVectorizer(lowercase=False, ngram_range=(1, 1))
train_cvec3, test_cvec3 = _fit_transform_both(c_vec3, 'tag_txt')
print(train_cvec3.shape, test_cvec3.shape)

# counts on pos_txt (token.pos_), unigrams + bigrams
c_vec4 = CountVectorizer(lowercase=False, ngram_range=(1, 2))
train_cvec4, test_cvec4 = _fit_transform_both(c_vec4, 'pos_txt')
print(train_cvec4.shape, test_cvec4.shape)

# counts on dep_txt (token.dep_), unigrams
c_vec7 = CountVectorizer(lowercase=False, ngram_range=(1, 1))
train_cvec7, test_cvec7 = _fit_transform_both(c_vec7, 'dep_txt')
print(train_cvec7.shape, test_cvec7.shape)

# tf-idf on tag_txt (token.tag_), unigrams
tf_vec5 = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))
train_tf5, test_tf5 = _fit_transform_both(tf_vec5, 'tag_txt')
print(train_tf5.shape, test_tf5.shape)

# tf-idf on pos_txt (token.pos_), unigrams + bigrams
tf_vec6 = TfidfVectorizer(lowercase=False, ngram_range=(1, 2))
train_tf6, test_tf6 = _fit_transform_both(tf_vec6, 'pos_txt')
print(train_tf6.shape, test_tf6.shape)

# tf-idf on dep_txt (token.dep_), unigrams
tf_vec8 = TfidfVectorizer(lowercase=False, ngram_range=(1, 1))
train_tf8, test_tf8 = _fit_transform_both(tf_vec8, 'dep_txt')
print(train_tf8.shape, test_tf8.shape)

In [None]:
import os
import pickle

# Concatenate all feature matrices column-wise. The block order must be the
# same for train and test so that column i means the same feature in both.
all_nlp_train = np.hstack([train_cvec3, train_cvec4, train_tf5, train_tf6, train_cvec7, train_tf8])
all_nlp_test = np.hstack([test_cvec3, test_cvec4, test_tf5, test_tf6, test_cvec7, test_tf8])
assert all_nlp_train.shape[1] == all_nlp_test.shape[1], "train/test feature widths differ"
print('nlp feat done')

NLP_FEAT_PATH = '../features/nlp_feat.pkl'
# Create the output directory up front: without it open() raises
# FileNotFoundError and the expensive features computed above are lost.
os.makedirs(os.path.dirname(NLP_FEAT_PATH), exist_ok=True)
with open(NLP_FEAT_PATH, 'wb') as fout:
    # Stored as a two-element list: [train_matrix, test_matrix].
    pickle.dump([all_nlp_train, all_nlp_test], fout)