In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing import sequence, text
from keras.layers import Input, Embedding
import nltk
nltk.download('stopwords')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
from nltk import word_tokenize
from nltk.corpus import stopwords
from textblob import TextBlob
import string
from sentence_transformers import SentenceTransformer, LoggingHandler
import numpy as np
import logging

Using TensorFlow backend.
[nltk_data] Downloading package stopwords to /home/ryan/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ryan/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/ryan/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [137]:
# Unique English stop words (order is whatever the intermediate set yields)
# and the ASCII punctuation characters from the `string` module.
stop_words = [*set(stopwords.words('english'))]
punctuation = string.punctuation

In [138]:
# Load the raw train/test tables. index_col=False tells pandas not to use the
# first column as the index.
train, test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('../MachineHackElectronicPrice/Train.csv',
                     '../MachineHackElectronicPrice/Test.csv')
)

# BERT sentence embeddings

In [139]:
# Send INFO-and-above log records, timestamped, through the LoggingHandler
# imported from sentence_transformers.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(message)s',
    datefmt='%Y-%m-%d %H:%M:%S',
    handlers=[LoggingHandler()],
)

# Summarise printed NumPy arrays once they exceed 100 elements.
np.set_printoptions(threshold=100)

In [140]:
# Load the pretrained 'bert-base-nli-mean-tokens' sentence-embedding model
# (downloaded or read from the local cache, as the log output below shows).
model = SentenceTransformer('bert-base-nli-mean-tokens')

2020-06-08 03:31:19 - Load pretrained SentenceTransformer: bert-base-nli-mean-tokens
2020-06-08 03:31:19 - Did not find a '/' or '\' in the name. Assume to download model from server.
2020-06-08 03:31:20 - Load SentenceTransformer from folder: /home/ryan/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-mean-tokens.zip
2020-06-08 03:31:20 - loading configuration file /home/ryan/.cache/torch/sentence_transformers/public.ukp.informatik.tu-darmstadt.de_reimers_sentence-transformers_v0.2_bert-base-nli-mean-tokens.zip/0_BERT/config.json
2020-06-08 03:31:20 - Model config BertConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "bert",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "type

In [141]:
def _combined_text(frame):
    """Return one string per row: str(Model_Info) + ' ' + str(Additional_Description).

    Both values go through ``str`` first, so non-string entries (e.g. NaN)
    become their string representation rather than raising.
    """
    return [
        ' '.join((str(model_info), str(description)))
        for model_info, description in zip(frame['Model_Info'],
                                           frame['Additional_Description'])
    ]


train['text'] = _combined_text(train)
test['text'] = _combined_text(test)


In [142]:
# Pre-allocate one empty frame per split: one row per sample and 768
# integer-labelled columns (768 is the model's hidden_size in the config
# printed above).
columns = list(range(768))

index_train = range(len(train))
index_test = range(len(test))

emedding_train = pd.DataFrame(columns=columns, index=index_train)
emedding_test = pd.DataFrame(columns=columns, index=index_test)

In [8]:
# Plain Python lists of the combined text, in row order, for the encoder.
sentences_train = train['text'].tolist()
sentences_test = test['text'].tolist()

In [9]:
# Encode every training sentence with the SentenceTransformer model
# (about 1m38s for 291 batches in the run recorded below).
sentence_embeddings_train = model.encode(sentences_train)

Batches: 100%|██████████| 291/291 [01:38<00:00,  2.96it/s]


In [10]:
# Encode every test sentence with the same model
# (about 42s for 125 batches in the run recorded below).
sentence_embeddings_test = model.encode(sentences_test)

Batches: 100%|██████████| 125/125 [00:42<00:00,  2.97it/s]


In [11]:
# Build each embedding frame in a single constructor call instead of assigning
# one row at a time with .loc: that is far faster, gives numeric columns
# rather than object dtype, and raises if the number of embeddings does not
# match the number of rows (the previous zip() silently truncated).
# Rows are labelled 0..n-1 and columns 0..dim-1, as before.
emedding_train = pd.DataFrame(np.asarray(sentence_embeddings_train),
                              index=range(len(train)))
emedding_test = pd.DataFrame(np.asarray(sentence_embeddings_test),
                             index=range(len(test)))

In [12]:
# Persist both embedding tables without the row index.
for embedding_frame, csv_name in ((emedding_train, 'train_embedding.csv'),
                                  (emedding_test, 'test_embedding.csv')):
    embedding_frame.to_csv(csv_name, index=False)

# Topic modeling

In [8]:
# Bag-of-words counts over unigrams and bigrams: terms must occur in at least
# 4 training documents and the vocabulary is capped at 4000 entries. The
# vocabulary is learned from train only and then applied to test.
cvectorizer = CountVectorizer(
    ngram_range=(1, 2),
    max_features=4000,
    min_df=4,
)
cvz_train = cvectorizer.fit_transform(train['text'])
cvz_test = cvectorizer.transform(test['text'])

# 20-component Latent Dirichlet Allocation fitted on the training counts
# (online learning, 20 iterations, fixed seed); each document is mapped to its
# topic-weight vector.
lda_model = LatentDirichletAllocation(
    n_components=20,
    max_iter=20,
    learning_method='online',
    random_state=42,
)
train_topics = lda_model.fit_transform(cvz_train)
test_topics = lda_model.transform(cvz_test)

In [13]:
# Wrap the topic-weight matrices in DataFrames (default integer row and column labels).
train_topics_df, test_topics_df = (
    pd.DataFrame(topic_matrix) for topic_matrix in (train_topics, test_topics)
)

In [14]:
# Label the topic columns "<n>_topic". The frames are rebuilt from the raw
# topic matrices so that re-running this cell is idempotent; suffixing the
# already-suffixed frames would produce "<n>_topic_topic".
train_topics_df = pd.DataFrame(train_topics).add_suffix('_topic')
test_topics_df = pd.DataFrame(test_topics).add_suffix('_topic')

In [15]:
# Preview the first rows of the per-document topic weights.
train_topics_df.head()

Unnamed: 0,0_topic,1_topic,2_topic,3_topic,4_topic,5_topic,6_topic,7_topic,8_topic,9_topic,10_topic,11_topic,12_topic,13_topic,14_topic,15_topic,16_topic,17_topic,18_topic,19_topic
0,0.002778,0.149627,0.002778,0.002778,0.002778,0.002778,0.002778,0.002778,0.002778,0.131722,0.375614,0.186018,0.002778,0.002778,0.002778,0.002778,0.002778,0.115351,0.002778,0.002778
1,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.849281,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.003846,0.081488
2,0.028672,0.001163,0.075154,0.034648,0.001163,0.001163,0.001163,0.001163,0.210762,0.001163,0.001163,0.001163,0.001163,0.001163,0.041302,0.001163,0.593183,0.001163,0.001163,0.001163
3,0.00125,0.00125,0.00125,0.00125,0.185583,0.00125,0.00125,0.00125,0.361575,0.00125,0.270013,0.00125,0.00125,0.00125,0.162828,0.00125,0.00125,0.00125,0.00125,0.00125
4,0.001852,0.001852,0.001852,0.001852,0.27116,0.001852,0.001852,0.001852,0.060468,0.116126,0.164639,0.001852,0.001852,0.001852,0.249198,0.001852,0.001852,0.001852,0.001852,0.112483


In [16]:
# Persist both topic-weight tables without the row index.
for topic_frame, csv_name in ((train_topics_df, 'train_topic.csv'),
                              (test_topics_df, 'test_topic.csv')):
    topic_frame.to_csv(csv_name, index=False)

# Other NLP Features

In [16]:
# Character length of each raw text field. The vectorized .str.len() gives the
# same counts as apply(len) for strings, but yields NaN for a missing value
# instead of raising "object of type 'float' has no len()".
for frame in (train, test):
    frame['Model_Infolen'] = frame['Model_Info'].str.len()
    frame['Additional_Descriptionlen'] = frame['Additional_Description'].str.len()

In [17]:
# Size statistics of the combined text, computed identically for both splits:
#   char_count   - number of characters
#   word_count   - number of whitespace-separated tokens
#   word_density - characters per (word_count + 1); the +1 avoids division by zero
for frame in (train, test):
    frame['char_count'] = frame['text'].map(len)
    frame['word_count'] = frame['text'].map(lambda doc: len(doc.split()))
    frame['word_density'] = frame['char_count'] / (frame['word_count'] + 1)

2020-06-08 01:19:22 - NumExpr defaulting to 4 threads.


In [18]:
# Part-of-speech tag codes grouped into coarse word classes.
pos_dic = {
    'noun' : ['NN','NNS','NNP','NNPS'],
    'pron' : ['PRP','PRP$','WP','WP$'],
    'verb' : ['VB','VBD','VBG','VBN','VBP','VBZ'],
    'adj' :  ['JJ','JJR','JJS'],
    'adv' : ['RB','RBR','RBS','WRB']
}


def pos_check(x, flag):
    """Count the words of ``x`` whose part-of-speech tag belongs to class ``flag``.

    Parameters
    ----------
    x : str
        Text to tag with TextBlob. Non-string input (e.g. NaN) counts as 0.
    flag : str
        A key of ``pos_dic`` ('noun', 'pron', 'verb', 'adj' or 'adv').

    Returns
    -------
    int
        Number of tagged words whose tag is listed under ``pos_dic[flag]``;
        0 when the text cannot be tagged.

    Raises
    ------
    KeyError
        If ``flag`` is not a key of ``pos_dic``. Previously this was swallowed
        and reported as a count of 0, which hid typos in the class name.
    """
    wanted_tags = frozenset(pos_dic[flag])

    if not isinstance(x, str):
        return 0

    try:
        tagged_words = TextBlob(x).tags
    except Exception as exc:
        # Stay best-effort (count 0), but no longer swallow KeyboardInterrupt /
        # SystemExit and no longer hide the cause: a missing tagger corpus, for
        # example, would otherwise silently turn every count into 0.
        logging.getLogger(__name__).warning(
            "pos_check: could not tag text (%s: %s); counting 0",
            type(exc).__name__, exc)
        return 0

    return sum(1 for _word, tag in tagged_words if tag in wanted_tags)

In [19]:
# Add one "<class>_count" column per word class to each split, counting the
# words of that class in the combined text via pos_check.
for frame in (train, test):
    for word_class in ('noun', 'verb', 'adj', 'adv'):
        frame[word_class + '_count'] = frame['text'].apply(
            lambda doc: pos_check(doc, word_class)
        )


In [20]:
# List the columns now present on train (raw fields plus the derived features).
train.columns


Index(['Brand', 'Model_Info', 'Additional_Description', 'Locality', 'City',
       'State', 'Price', 'text', 'Model_Infolen', 'Additional_Descriptionlen',
       'char_count', 'word_count', 'word_density', 'noun_count', 'verb_count',
       'adj_count', 'adv_count'],
      dtype='object')

In [21]:
# The hand-crafted text features, selected in the same order for both splits.
nlp_feature_columns = [
    'Model_Infolen', 'Additional_Descriptionlen',
    'char_count', 'word_count', 'word_density',
    'noun_count', 'verb_count', 'adj_count', 'adv_count',
]

trainNLP = train[nlp_feature_columns]
testNLP = test[nlp_feature_columns]

In [22]:
# Persist both text-feature tables without the row index.
for nlp_frame, csv_name in ((trainNLP, 'train_NLP.csv'),
                            (testNLP, 'test_NLP.csv')):
    nlp_frame.to_csv(csv_name, index=False)

# Aggregation Features

In [95]:
# Reload the raw tables, replacing the frames that the cells above extended
# with text and NLP columns. index_col=False tells pandas not to use the first
# column as the index.
train, test = (
    pd.read_csv(csv_path, index_col=False)
    for csv_path in ('../MachineHackElectronicPrice/Train.csv',
                     '../MachineHackElectronicPrice/Test.csv')
)

In [96]:
# Independent copies of the freshly loaded tables.
sf_train, sf_test = train.copy(), test.copy()

In [97]:
# Mean / median / sum of the training Price per State, with the columns named
# state_mean, state_median and state_sum.
Stateaggtrain = (
    train.groupby(['State'])['Price']
    .agg(['mean', 'median', 'sum'])
    .add_prefix('state_')
)



In [98]:
# Attach the per-State price statistics to every row. A left join keeps all
# rows of train/test: a State that never occurs in train (possible for test)
# gets NaN statistics instead of having its rows dropped, which would break
# the row-for-row alignment the final column-wise concat relies on.
# NOTE(review): each training row's own Price contributes to its group
# statistic; consider out-of-fold encoding if that leakage matters.
train_sf = train.join(Stateaggtrain, on=['State'], how="left")

test_sf = test.join(Stateaggtrain, on=['State'], how="left")

In [99]:
# Mean / median / sum of the training Price per City, with the columns named
# city_mean, city_median and city_sum.
Cityagg = (
    train.groupby(['City'])['Price']
    .agg(['mean', 'median', 'sum'])
    .add_prefix('city_')
)

In [100]:
# Attach the per-City price statistics. A left join keeps every row; a City
# unseen in train gets NaN statistics instead of having its rows dropped, so
# the frames stay aligned row-for-row with train/test.
train_sf = train_sf.join(Cityagg, on=['City'], how="left")

test_sf = test_sf.join(Cityagg, on=['City'], how="left")

In [101]:
# Mean / median / sum of the training Price per Locality, with the columns
# named locality_mean, locality_median and locality_sum.
Localityagg = (
    train.groupby(['Locality'])['Price']
    .agg(['mean', 'median', 'sum'])
    .add_prefix('locality_')
)

In [102]:
# Attach the per-Locality price statistics. A left join keeps every row; a
# Locality unseen in train gets NaN statistics instead of having its rows
# dropped, so the frames stay aligned row-for-row with train/test.
train_sf = train_sf.join(Localityagg, on=['Locality'], how="left")

test_sf = test_sf.join(Localityagg, on=['Locality'], how="left")

In [103]:
# Mean / median / sum of the training Price per Brand, with the columns named
# brand_mean, brand_median and brand_sum.
brandagg = (
    train.groupby('Brand')['Price']
    .agg(['mean', 'median', 'sum'])
    .add_prefix('brand_')
)

In [104]:
# Attach the per-Brand price statistics. A left join keeps every row; a Brand
# unseen in train gets NaN statistics instead of having its rows dropped, so
# the frames stay aligned row-for-row with train/test.
train_sf = train_sf.join(brandagg, on=['Brand'], how="left")

test_sf = test_sf.join(brandagg, on=['Brand'], how="left")

In [105]:
# Median / mean / sum per (Brand, State) pair. The aggregated columns are
# selected explicitly: aggregating the whole frame relied on older pandas
# silently dropping the non-numeric columns, whereas pandas >= 2.0 raises a
# TypeError for them. The result has the same two-level columns as before.
target = (
    train.groupby(['Brand', 'State'])[['Locality', 'City', 'Price']]
    .agg(['median', 'mean', 'sum'])
    .reset_index()
)
# Preview only the first rows rather than dumping the whole table.
target.head(10)

Unnamed: 0_level_0,Brand,State,Locality,Locality,Locality,City,City,City,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,median,mean,sum,median,mean,sum,median,mean,sum
0,0,0,580.0,596.181818,6558,4,4.0,44,7500,8359.0,91949
1,0,1,459.5,494.0,1976,0,0.0,0,6600,6612.5,26450
2,0,2,886.0,712.75,8553,8,8.0,96,10500,9441.666667,113300
3,0,3,575.5,598.75,2395,1,1.0,4,9000,10000.0,40000
4,0,4,761.0,755.076923,19632,11,11.0,286,6100,9330.615385,242596
5,0,5,597.0,581.636364,6398,13,13.545455,149,10000,21218.090909,233399
6,0,6,749.0,758.0,4548,2,2.0,12,7000,6499.833333,38999
7,0,7,547.0,567.181818,6239,10,10.0,110,7500,8645.272727,95098
8,1,0,491.0,540.253219,125879,4,4.0,932,17500,25768.781116,6004126
9,1,1,366.0,423.032922,102797,0,0.131687,32,19500,25831.786008,6277124


In [106]:
# Flatten the two-level column index of `target` into single names:
# the group keys keep their name, every other (variable, stat) pair becomes
# 'state<variable>_<stat>'. The names are derived from the actual column
# tuples, in column order, rather than from MultiIndex.levels, whose ordering
# is an implementation detail and need not match the column order.
group_keys = ['Brand', 'State']

columns = [
    var if var in group_keys else 'state%s_%s' % (var, stat)
    for var, stat in target.columns
]

In [107]:
# Show the flattened column names.
columns

['Brand',
 'State',
 'stateLocality_median',
 'stateLocality_mean',
 'stateLocality_sum',
 'stateCity_median',
 'stateCity_mean',
 'stateCity_sum',
 'statePrice_median',
 'statePrice_mean',
 'statePrice_sum']

In [108]:
# Replace the two-level column index with the flat names in `columns`,
# then preview the result.
target.columns = columns
target.head()

Unnamed: 0,Brand,State,stateLocality_median,stateLocality_mean,stateLocality_sum,stateCity_median,stateCity_mean,stateCity_sum,statePrice_median,statePrice_mean,statePrice_sum
0,0,0,580.0,596.181818,6558,4,4.0,44,7500,8359.0,91949
1,0,1,459.5,494.0,1976,0,0.0,0,6600,6612.5,26450
2,0,2,886.0,712.75,8553,8,8.0,96,10500,9441.666667,113300
3,0,3,575.5,598.75,2395,1,1.0,4,9000,10000.0,40000
4,0,4,761.0,755.076923,19632,11,11.0,286,6100,9330.615385,242596


In [109]:
# Keep only the group keys and the Price statistics per (Brand, State).
ntarget = target[['Brand','State','statePrice_median','statePrice_mean','statePrice_sum']]


In [110]:
# Attach the per-(Brand, State) price statistics. A left merge keeps every row
# of the left frame in its original order; a pair that never occurs in train
# (possible for test) gets NaN instead of having its rows dropped, so the
# result still lines up row-for-row with train/test.
train_sf = train_sf.merge(ntarget, on=['Brand', 'State'], how='left')

test_sf = test_sf.merge(ntarget, on=['Brand', 'State'], how='left')

In [111]:
# Median / mean / sum per (Brand, City) pair. The aggregated columns are
# selected explicitly: aggregating the whole frame relied on older pandas
# silently dropping the non-numeric columns, whereas pandas >= 2.0 raises a
# TypeError for them. The result has the same two-level columns as before.
target = (
    train.groupby(['Brand', 'City'])[['Locality', 'State', 'Price']]
    .agg(['median', 'mean', 'sum'])
    .reset_index()
)
# Preview only the first rows rather than dumping the whole table.
target.head(10)

Unnamed: 0_level_0,Brand,City,Locality,Locality,Locality,State,State,State,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,median,mean,sum,median,mean,sum,median,mean,sum
0,0,0,459.5,494.0,1976,1,1,4,6600,6612.5,26450
1,0,1,575.5,598.75,2395,3,3,12,9000,10000.0,40000
2,0,2,749.0,758.0,4548,6,6,36,7000,6499.833333,38999
3,0,4,580.0,596.181818,6558,0,0,0,7500,8359.0,91949
4,0,8,886.0,712.75,8553,2,2,24,10500,9441.666667,113300
5,0,10,547.0,567.181818,6239,7,7,77,7500,8645.272727,95098
6,0,11,761.0,755.076923,19632,4,4,104,6100,9330.615385,242596
7,0,13,619.0,636.0,5088,5,5,40,8450,18449.875,147599
8,0,15,503.0,436.666667,1310,5,5,15,15000,28600.0,85800
9,1,0,366.0,422.377593,101793,1,1,241,19500,25890.556017,6239624


In [112]:
# Flatten the two-level column index of `target` into single names:
# the group keys keep their name, every other (variable, stat) pair becomes
# 'city<variable>_<stat>'. The names are derived from the actual column
# tuples, in column order, rather than from MultiIndex.levels, whose ordering
# is an implementation detail and need not match the column order.
group_keys = ['Brand', 'City']

columns = [
    var if var in group_keys else 'city%s_%s' % (var, stat)
    for var, stat in target.columns
]

In [113]:
# Replace the two-level column index with the flat names in `columns`,
# then preview the result.
target.columns = columns
target.head()

Unnamed: 0,Brand,City,cityLocality_median,cityLocality_mean,cityLocality_sum,cityState_median,cityState_mean,cityState_sum,cityPrice_median,cityPrice_mean,cityPrice_sum
0,0,0,459.5,494.0,1976,1,1,4,6600,6612.5,26450
1,0,1,575.5,598.75,2395,3,3,12,9000,10000.0,40000
2,0,2,749.0,758.0,4548,6,6,36,7000,6499.833333,38999
3,0,4,580.0,596.181818,6558,0,0,0,7500,8359.0,91949
4,0,8,886.0,712.75,8553,2,2,24,10500,9441.666667,113300


In [114]:
# Keep only the group keys and the Price statistics per (Brand, City).
ntarget = target[['Brand','City','cityPrice_median','cityPrice_mean','cityPrice_sum']]

In [115]:
# Attach the per-(Brand, City) price statistics. A left merge keeps every row
# of the left frame in its original order; a pair that never occurs in train
# (possible for test) gets NaN instead of having its rows dropped, so the
# result still lines up row-for-row with train/test.
train_sf = train_sf.merge(ntarget, on=['Brand', 'City'], how='left')

test_sf = test_sf.merge(ntarget, on=['Brand', 'City'], how='left')

In [116]:
# Median / mean / sum per (Brand, Locality) pair. The aggregated columns are
# selected explicitly: aggregating the whole frame relied on older pandas
# silently dropping the non-numeric columns, whereas pandas >= 2.0 raises a
# TypeError for them. The result has the same two-level columns as before.
target = (
    train.groupby(['Brand', 'Locality'])[['City', 'State', 'Price']]
    .agg(['median', 'mean', 'sum'])
    .reset_index()
)
target.head()

Unnamed: 0_level_0,Brand,Locality,City,City,City,State,State,State,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,median,mean,sum,median,mean,sum,median,mean,sum
0,0,0,0.0,0.0,0,1.0,1.0,1,6000.0,6000.0,6000
1,0,60,13.0,13.0,13,5.0,5.0,5,6200.0,6200.0,6200
2,0,79,11.0,11.0,11,4.0,4.0,4,5299.0,5299.0,5299
3,0,85,8.0,8.0,8,2.0,2.0,2,6200.0,6200.0,6200
4,0,122,10.0,10.0,10,7.0,7.0,7,9200.0,9200.0,9200


In [117]:
# Flatten the two-level column index of `target` into single names:
# the group keys keep their name, every other (variable, stat) pair becomes
# 'locality<variable>_<stat>'. The names are derived from the actual column
# tuples, in column order, rather than from MultiIndex.levels, whose ordering
# is an implementation detail and need not match the column order.
group_keys = ['Brand', 'Locality']

columns = [
    var if var in group_keys else 'locality%s_%s' % (var, stat)
    for var, stat in target.columns
]

In [118]:
# Replace the two-level column index with the flat names in `columns`,
# then preview the result.
target.columns = columns
target.head()

Unnamed: 0,Brand,Locality,localityCity_median,localityCity_mean,localityCity_sum,localityState_median,localityState_mean,localityState_sum,localityPrice_median,localityPrice_mean,localityPrice_sum
0,0,0,0.0,0.0,0,1.0,1.0,1,6000.0,6000.0,6000
1,0,60,13.0,13.0,13,5.0,5.0,5,6200.0,6200.0,6200
2,0,79,11.0,11.0,11,4.0,4.0,4,5299.0,5299.0,5299
3,0,85,8.0,8.0,8,2.0,2.0,2,6200.0,6200.0,6200
4,0,122,10.0,10.0,10,7.0,7.0,7,9200.0,9200.0,9200


In [119]:
# Keep only the group keys and the Price statistics per (Brand, Locality).
ntarget = target[['Brand','Locality','localityPrice_median','localityPrice_mean','localityPrice_sum']]

In [120]:
# Attach the per-(Brand, Locality) price statistics. A left merge keeps every
# row of the left frame in its original order; a pair that never occurs in
# train (possible for test) gets NaN instead of having its rows dropped, so
# the result still lines up row-for-row with train/test.
train_sf = train_sf.merge(ntarget, on=['Brand', 'Locality'], how='left')

test_sf = test_sf.merge(ntarget, on=['Brand', 'Locality'], how='left')

In [121]:
# Median / mean / sum per (State, City) pair. The aggregated columns are
# selected explicitly: aggregating the whole frame relied on older pandas
# silently dropping the non-numeric columns, whereas pandas >= 2.0 raises a
# TypeError for them. The result has the same two-level columns as before.
target = (
    train.groupby(['State', 'City'])[['Brand', 'Locality', 'Price']]
    .agg(['median', 'mean', 'sum'])
    .reset_index()
)
target.head()

Unnamed: 0_level_0,State,City,Brand,Brand,Brand,Locality,Locality,Locality,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,median,mean,sum,median,mean,sum,median,mean,sum
0,0,4,1,1.023346,263,493,546.365759,140416,16000.0,23984.330739,6163973
1,1,0,1,1.019763,258,366,419.146245,106044,18500.0,24890.802372,6297373
2,1,16,1,1.0,2,502,502.0,1004,18750.0,18750.0,37500
3,2,8,1,1.041667,275,695,643.068182,169770,18000.0,25613.621212,6761996
4,3,1,1,1.028571,216,601,562.371429,118098,20950.0,28488.352381,5982554


In [122]:
# Flatten the two-level column index of `target` into single names:
# the group keys keep their name, every other (variable, stat) pair becomes
# 'sc<variable>_<stat>'. The names are derived from the actual column tuples,
# in column order, rather than from MultiIndex.levels, whose ordering is an
# implementation detail and need not match the column order.
group_keys = ['State', 'City']

columns = [
    var if var in group_keys else 'sc%s_%s' % (var, stat)
    for var, stat in target.columns
]

In [123]:
# Replace the two-level column index with the flat names in `columns`,
# then preview the result.
target.columns = columns
target.head()

Unnamed: 0,State,City,scBrand_median,scBrand_mean,scBrand_sum,scLocality_median,scLocality_mean,scLocality_sum,scPrice_median,scPrice_mean,scPrice_sum
0,0,4,1,1.023346,263,493,546.365759,140416,16000.0,23984.330739,6163973
1,1,0,1,1.019763,258,366,419.146245,106044,18500.0,24890.802372,6297373
2,1,16,1,1.0,2,502,502.0,1004,18750.0,18750.0,37500
3,2,8,1,1.041667,275,695,643.068182,169770,18000.0,25613.621212,6761996
4,3,1,1,1.028571,216,601,562.371429,118098,20950.0,28488.352381,5982554


In [124]:
# Keep only the group keys and the Price statistics per (State, City).
ntarget = target[['State','City','scPrice_median','scPrice_mean','scPrice_sum']]

In [125]:
# Attach the per-(State, City) price statistics. A left merge keeps every row
# of the left frame in its original order; a pair that never occurs in train
# (possible for test) gets NaN instead of having its rows dropped, so the
# result still lines up row-for-row with train/test.
train_sf = train_sf.merge(ntarget, on=['State', 'City'], how='left')

test_sf = test_sf.merge(ntarget, on=['State', 'City'], how='left')

In [126]:
# Median / mean / sum per (City, Locality) pair. The aggregated columns are
# selected explicitly: aggregating the whole frame relied on older pandas
# silently dropping the non-numeric columns, whereas pandas >= 2.0 raises a
# TypeError for them. The result has the same two-level columns as before.
target = (
    train.groupby(['City', 'Locality'])[['Brand', 'State', 'Price']]
    .agg(['median', 'mean', 'sum'])
    .reset_index()
)
target.head()

Unnamed: 0_level_0,City,Locality,Brand,Brand,Brand,State,State,State,Price,Price,Price
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,median,mean,sum,median,mean,sum,median,mean,sum
0,0,0,1.0,0.923077,12,1,1,13,17000.0,28384.461538,368998
1,0,27,1.0,1.0,4,1,1,4,14500.0,15258.75,61035
2,0,31,1.0,1.222222,11,1,1,9,15000.0,19944.222222,179498
3,0,33,1.0,1.0,2,1,1,2,20749.5,20749.5,41499
4,0,50,1.0,1.090909,12,1,1,11,13999.0,19008.636364,209095


In [127]:
# Flatten the two-level column index of `target` into single names:
# the group keys keep their name, every other (variable, stat) pair becomes
# 'cl<variable>_<stat>'. The names are derived from the actual column tuples,
# in column order, rather than from MultiIndex.levels, whose ordering is an
# implementation detail and need not match the column order.
group_keys = ['City', 'Locality']

columns = [
    var if var in group_keys else 'cl%s_%s' % (var, stat)
    for var, stat in target.columns
]

In [128]:
# Replace the two-level column index with the flat names in `columns`,
# then preview the result.
target.columns = columns
target.head()

Unnamed: 0,City,Locality,clBrand_median,clBrand_mean,clBrand_sum,clState_median,clState_mean,clState_sum,clPrice_median,clPrice_mean,clPrice_sum
0,0,0,1.0,0.923077,12,1,1,13,17000.0,28384.461538,368998
1,0,27,1.0,1.0,4,1,1,4,14500.0,15258.75,61035
2,0,31,1.0,1.222222,11,1,1,9,15000.0,19944.222222,179498
3,0,33,1.0,1.0,2,1,1,2,20749.5,20749.5,41499
4,0,50,1.0,1.090909,12,1,1,11,13999.0,19008.636364,209095


In [129]:
# Show the flattened column names.
columns

['City',
 'Locality',
 'clBrand_median',
 'clBrand_mean',
 'clBrand_sum',
 'clState_median',
 'clState_mean',
 'clState_sum',
 'clPrice_median',
 'clPrice_mean',
 'clPrice_sum']

In [130]:
# Keep only the group keys and the Price statistics per (City, Locality).
ntarget = target[['City','Locality','clPrice_median','clPrice_mean','clPrice_sum']]

In [131]:
# Attach the per-(City, Locality) price statistics. A left merge keeps every
# row of the left frame in its original order; a pair that never occurs in
# train (possible for test) gets NaN instead of having its rows dropped, so
# the result still lines up row-for-row with train/test.
train_sf = train_sf.merge(ntarget, on=['City', 'Locality'], how='left')

test_sf = test_sf.merge(ntarget, on=['City', 'Locality'], how='left')

In [132]:
# List the raw columns plus all aggregate features now present on train_sf.
train_sf.columns

Index(['Brand', 'Model_Info', 'Additional_Description', 'Locality', 'City',
       'State', 'Price', 'state_mean', 'state_median', 'state_sum',
       'city_mean', 'city_median', 'city_sum', 'locality_mean',
       'locality_median', 'locality_sum', 'brand_mean', 'brand_median',
       'brand_sum', 'statePrice_median', 'statePrice_mean', 'statePrice_sum',
       'cityPrice_median', 'cityPrice_mean', 'cityPrice_sum',
       'localityPrice_median', 'localityPrice_mean', 'localityPrice_sum',
       'scPrice_median', 'scPrice_mean', 'scPrice_sum', 'clPrice_median',
       'clPrice_mean', 'clPrice_sum'],
      dtype='object')

In [133]:
# Keep only the aggregate price features, in the same order for both splits.
agg_feature_columns = [
    # single-key statistics
    'state_mean', 'state_median', 'state_sum',
    'city_mean', 'city_median', 'city_sum',
    'locality_mean', 'locality_median', 'locality_sum',
    'brand_mean', 'brand_median', 'brand_sum',
    # two-key statistics
    'statePrice_median', 'statePrice_mean', 'statePrice_sum',
    'cityPrice_median', 'cityPrice_mean', 'cityPrice_sum',
    'localityPrice_median', 'localityPrice_mean', 'localityPrice_sum',
    'scPrice_median', 'scPrice_mean', 'scPrice_sum',
    'clPrice_median', 'clPrice_mean', 'clPrice_sum',
]

train_sf = train_sf[agg_feature_columns]

test_sf = test_sf[agg_feature_columns]

In [134]:
# Persist both aggregate-feature tables without the row index.
for agg_frame, csv_name in ((train_sf, 'train_AGGp.csv'),
                            (test_sf, 'test_AGGp.csv')):
    agg_frame.to_csv(csv_name, index=False)

In [135]:
# Combine the raw columns with every feature block, column-wise.
# pd.concat(axis=1) aligns on the row index, so a block with missing or
# re-indexed rows would silently produce NaN-padded, misaligned rows; check
# that every block shares train's index before concatenating.
train_parts = [train, train_sf, trainNLP, train_topics_df, emedding_train]
for position, part in enumerate(train_parts):
    if not part.index.equals(train.index):
        raise ValueError(
            "train feature block #%d has %d rows / a different index than "
            "train (%d rows); column-wise concat would misalign rows"
            % (position, len(part), len(train))
        )

train_full = pd.concat(train_parts, axis=1)

NameError: name 'trainNLP' is not defined

In [53]:
# Combine the raw columns with every feature block, column-wise.
# pd.concat(axis=1) aligns on the row index, so a block with missing or
# re-indexed rows (e.g. test rows dropped by an inner join on categories
# unseen in train) would silently produce NaN-padded, misaligned rows; check
# that every block shares test's index before concatenating.
test_parts = [test, test_sf, testNLP, test_topics_df, emedding_test]
for position, part in enumerate(test_parts):
    if not part.index.equals(test.index):
        raise ValueError(
            "test feature block #%d has %d rows / a different index than "
            "test (%d rows); column-wise concat would misalign rows"
            % (position, len(part), len(test))
        )

test_full = pd.concat(test_parts, axis=1)

In [54]:
# Remove the two raw text fields from the combined tables.
raw_text_columns = ['Model_Info', 'Additional_Description']

train_full = train_full.drop(columns=raw_text_columns)
test_full = test_full.drop(columns=raw_text_columns)

In [55]:
# Persist the final combined feature tables without the row index.
for full_frame, csv_name in ((train_full, 'train_FULL.csv'),
                             (test_full, 'test_FULL.csv')):
    full_frame.to_csv(csv_name, index=False)