## using more categories of features
## sparse matrix calculations inspired by dune_dweller

In [9]:
%matplotlib inline
import numpy as np
import pandas as pd
from scipy.sparse import csr_matrix, hstack
from scipy import io
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder

In [10]:
# Labelled (train) and unlabelled (test) device tables, both keyed by device_id.
train = pd.read_csv('gender_age_train.csv').set_index('device_id')
test = pd.read_csv('gender_age_test.csv').set_index('device_id')

In [11]:
# Per-event app records, loaded with pandas' default CSV settings.
app_events = pd.read_csv(filepath_or_buffer='app_events.csv')

In [12]:
# Read the remaining tables in the same order as before and bind them to the
# same three names.
app_labels, events, categories = (
    pd.read_csv(csvName)
    for csvName in ('app_labels.csv', 'events.csv', 'label_categories.csv')
)

In [13]:
# Device -> (phone_brand, device_model) table.
phone = pd.read_csv(filepath_or_buffer='phone_brand_device_model.csv')

In [14]:
# Positional row ids (0..n-1); they serve later as the row coordinates of the
# sparse feature matrices.
train['trainrow'] = np.arange(len(train))
test['testrow'] = np.arange(len(test))

# one-hot all phone brands and devices using sparse matrix

In [15]:
# Keep only the first record seen for each device, then key the table by device_id.
phone = phone.drop_duplicates(subset='device_id', keep='first')
phone = phone.set_index('device_id')

In [16]:
# Replace the brand and model strings with integer codes, using a fresh
# LabelEncoder for each column.
# NOTE(review): device_model is encoded on its own, so two models with the same
# name under different brands would share one code -- confirm this is intended.
for textCol in ('phone_brand', 'device_model'):
    phone[textCol] = LabelEncoder().fit_transform(phone[textCol])
phone.head()

Unnamed: 0_level_0,phone_brand,device_model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1
-8890648629457979026,51,1517
1277779817574759137,51,749
5137427614288105724,15,560
3669464369358936369,9,1503
-5019277647504317457,15,536


In [17]:
# Index-aligned assignment: each device_id row picks up its brand code from `phone`.
brandCodes = phone['phone_brand']
train['brand'] = brandCodes
test['brand'] = brandCodes

In [18]:
# Preview the first five training rows, now including `brand`.
train.head(5)

Unnamed: 0_level_0,gender,age,group,trainrow,brand
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
-8076087639492063270,M,35,M32-38,0,51
-2897161552818060146,M,35,M32-38,1,51
-8260683887967679142,M,35,M32-38,2,51
-4938849341048082022,M,30,M29-31,3,51
245133531816851882,M,30,M29-31,4,51


In [19]:
# Index-aligned assignment: each device_id row picks up its model code from `phone`.
modelCodes = phone['device_model']
train['model'] = modelCodes
test['model'] = modelCodes

In [20]:
# Preview the first five training rows, now including `model`.
train.head(5)

Unnamed: 0_level_0,gender,age,group,trainrow,brand,model
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
-8076087639492063270,M,35,M32-38,0,51,749
-2897161552818060146,M,35,M32-38,1,51,749
-8260683887967679142,M,35,M32-38,2,51,749
-4938849341048082022,M,30,M29-31,3,51,1524
245133531816851882,M,30,M29-31,4,51,753


In [21]:
#create sparse matrices

# One-hot encode the brand: row i holds a single 1 in column brand[i].
# The column count is pinned explicitly to the largest code seen in either set.
# If it is left to csr_matrix to infer, each matrix is only as wide as the
# largest code present in that set, so train and test can end up with different
# widths and no longer line up column-for-column.
numBrands = int(max(train.brand.max(), test.brand.max())) + 1
train_brand_sparse = csr_matrix((np.ones(train.shape[0]), (train.trainrow, train.brand)),
                                shape=(train.shape[0], numBrands))
test_brand_sparse = csr_matrix((np.ones(test.shape[0]), (test.testrow, test.brand)),
                               shape=(test.shape[0], numBrands))

In [22]:
# One-hot encode the device model: row i holds a single 1 in column model[i].
# The width is fixed to the largest code seen in either set so the train and
# test matrices always share the same columns, instead of each being inferred
# from its own data.
numModels = int(max(train.model.max(), test.model.max())) + 1
train_model_sparse = csr_matrix((np.ones(train.shape[0]), (train.trainrow, train.model)),
                                shape=(train.shape[0], numModels))
test_model_sparse = csr_matrix((np.ones(test.shape[0]), (test.testrow, test.model)),
                               shape=(test.shape[0], numModels))

In [23]:
# Sanity check: each train/test pair should report the same number of columns.
for featureBlock in (train_brand_sparse, test_brand_sparse,
                     train_model_sparse, test_model_sparse):
    print(featureBlock.shape)

(74645, 131)
(112071, 131)
(74645, 1599)
(112071, 1599)


# One hot encode apps

In [24]:
# Fit the app-id encoder and apply it in one pass; the fitted encoder is kept
# because it is reused on app_labels further down.
appEncoder = LabelEncoder()
app_events['app'] = appEncoder.fit_transform(app_events['app_id'])

In [25]:
# Number of distinct apps = width of the app feature block.
numApps = appEncoder.classes_.shape[0]

In [26]:
# Attach the originating device_id to every app event by joining on event_id.
eventDevices = events.loc[:, ['event_id', 'device_id']]
app_events = pd.merge(app_events, eventDevices, how='left', on='event_id')

In [27]:
# Preview the first five app events after the device_id join.
app_events.head(5)

Unnamed: 0,event_id,app_id,is_installed,is_active,app,device_id
0,2,5927333115845830913,1,1,15408,-6401643145415154744
1,2,-5720078949152207372,1,0,3384,-6401643145415154744
2,2,-1633887856876571208,1,0,7620,-6401643145415154744
3,2,-653184325010919369,1,1,8902,-6401643145415154744
4,2,8693964245073640147,1,1,18686,-6401643145415154744


In [28]:
# Look up each event's row position in train and in test. A device absent from
# a set gets NaN in that column (left join).
app_events = app_events.merge(train[['trainrow']], left_on='device_id',
                              right_index=True, how='left')
app_events = app_events.merge(test[['testrow']], left_on='device_id',
                              right_index=True, how='left')
app_events.head()

Unnamed: 0,event_id,app_id,is_installed,is_active,app,device_id,trainrow,testrow
0,2,5927333115845830913,1,1,15408,-6401643145415154744,,68691.0
1,2,-5720078949152207372,1,0,3384,-6401643145415154744,,68691.0
2,2,-1633887856876571208,1,0,7620,-6401643145415154744,,68691.0
3,2,-653184325010919369,1,1,8902,-6401643145415154744,,68691.0
4,2,8693964245073640147,1,1,18686,-6401643145415154744,,68691.0


In [29]:
def _appMatrix(rowCol, numRows):
    """Build the (numRows x numApps) matrix from events that have a value in rowCol.

    csr_matrix sums repeated (row, app) coordinates, so each entry is the number
    of app_events rows for that device/app pair rather than a 0/1 flag.
    """
    known = app_events.dropna(subset=[rowCol])
    ones = np.ones(known.shape[0])
    return csr_matrix((ones, (known[rowCol], known.app)), shape=(numRows, numApps))


train_apps_sparse = _appMatrix('trainrow', train.shape[0])
test_apps_sparse = _appMatrix('testrow', test.shape[0])
test_apps_sparse.shape

(112071, 19237)

In [30]:
# Both app matrices should be numApps columns wide.
for appBlock in (train_apps_sparse, test_apps_sparse):
    print(appBlock.shape)

(74645, 19237)
(112071, 19237)


# One hot encode app labels

In [31]:
# (rows, columns) of the enriched app_events table.
(len(app_events.index), len(app_events.columns))

(32473067, 8)

In [32]:
# Keep only the labels of apps that actually occur in app_events (the ids that
# appEncoder was fitted on). The explicit .copy() makes the result an
# independent frame, so the columns added to it in later cells are plain
# assignments and do not raise pandas' SettingWithCopyWarning.
app_labels = app_labels.loc[app_labels.app_id.isin(app_events.app_id.unique())].copy()

In [33]:
# Map app_id to the same integer codes that app_events uses.
app_labels = app_labels.assign(app=appEncoder.transform(app_labels['app_id']))

In [34]:
# Integer-encode label_id; the number of classes is the width of the label block.
labelEncoder = LabelEncoder()
app_labels['label'] = labelEncoder.fit_transform(app_labels['label_id'])
numLabels = len(labelEncoder.classes_)

In [35]:
# Expand every app event into one row per label of that app (inner join on the
# encoded app id). trainrow/testrow come along from app_events.
app_event_labels = pd.merge(
    app_events[['device_id', 'app', 'trainrow', 'testrow']],
    app_labels[['app', 'label']],
    on='app',
)

In [36]:
# Preview the first ten event/label rows.
app_event_labels.head(n=10)

Unnamed: 0,device_id,app,trainrow,testrow,label
0,-6401643145415154744,15408,,68691.0,250
1,-6401643145415154744,15408,,68691.0,276
2,-6401643145415154744,15408,,68691.0,270
3,-6401643145415154744,15408,,68691.0,249
4,-6401643145415154744,15408,,68691.0,138
5,1476664663289716375,15408,51154.0,,250
6,1476664663289716375,15408,51154.0,,276
7,1476664663289716375,15408,51154.0,,270
8,1476664663289716375,15408,51154.0,,249
9,1476664663289716375,15408,51154.0,,138


In [37]:
# Drop the two tables before building the label matrices.
del phone, app_events

In [38]:
def _labelMatrix(rowCol, numRows):
    """Build the (numRows x numLabels) matrix from rows that have a value in rowCol.

    csr_matrix sums repeated (row, label) coordinates, so each entry counts how
    many app_event_labels rows carry that label for the device.
    """
    known = app_event_labels.dropna(subset=[rowCol])
    ones = np.ones(known.shape[0])
    return csr_matrix((ones, (known[rowCol], known.label)), shape=(numRows, numLabels))


train_labels_sparse = _labelMatrix('trainrow', train.shape[0])
test_labels_sparse = _labelMatrix('testrow', test.shape[0])
train_labels_sparse.shape

(74645, 492)

In [39]:
# Both label matrices should be numLabels columns wide.
for labelBlock in (train_labels_sparse, test_labels_sparse):
    print(labelBlock.shape)

(74645, 492)
(112071, 492)


In [40]:
# The expanded event/label table is not used after this point.
del app_event_labels

## write files out now to test against dune_dweller's script

# Column-wise concatenation of the four sparse blocks: brand | model | apps | labels.
trainExtsparsemin = hstack(
    [train_brand_sparse, train_model_sparse, train_apps_sparse, train_labels_sparse],
    format='csr')
testExtsparsemin = hstack(
    [test_brand_sparse, test_model_sparse, test_apps_sparse, test_labels_sparse],
    format='csr')

# Persist each CSR matrix as its raw components (data / indices / indptr / shape).
for outName, outMatrix in (('trainExtsparsemin', trainExtsparsemin),
                           ('testExtsparsemin', testExtsparsemin)):
    np.savez(outName, data=outMatrix.data, indices=outMatrix.indices,
             indptr=outMatrix.indptr, shape=outMatrix.shape)

# bring in features from original preproc

In [41]:
# start with events; it has the most interesting feature possibilities

# Number of events per device, as a one-column frame named 'totalEvents'.
# count() followed by to_frame(name) replaces the dict-renaming form
# aggregate({'totalEvents': 'count'}), which newer pandas rejects with a
# SpecificationError (nested renamer).
eventsByDevice = events.groupby('device_id')
totalEvents = eventsByDevice.event_id.count().to_frame('totalEvents')
totalEvents.head()

Unnamed: 0_level_0,totalEvents
device_id,Unnamed: 1_level_1
-9222956879900151005,65
-9222661944218806987,8
-9222399302879214035,10
-9221825537663503111,99
-9221767098072603291,8


In [42]:
#num with location off

# An event counts as "location off" when both coordinates are exactly 0.
# count() followed by to_frame(name) replaces the dict-renaming form of
# aggregate(), which newer pandas rejects with a SpecificationError.
locationOff = (events.longitude == 0) & (events.latitude == 0)
numLocOff = (events[locationOff]
             .groupby('device_id')
             .event_id.count()
             .to_frame('numEventsLocOff'))
numLocOff.head()

Unnamed: 0_level_0,numEventsLocOff
device_id,Unnamed: 1_level_1
-9222956879900151005,13
-9222661944218806987,8
-9222399302879214035,10
-9221825537663503111,1
-9221767098072603291,8


In [43]:
# Per-device max / min / mean of latitude and longitude over events where both
# coordinates are non-zero.
# NOTE(review): events with exactly one zero coordinate fall in neither this
# set nor the "location off" set -- confirm that is intended.
#
# Columns are selected with a list and aggregated with a list of function
# names; the tuple-style selection and renaming dict are not accepted by newer
# pandas. agg([...]) yields columns as (coordinate, statistic), so the levels
# are swapped and sorted to give (statistic, coordinate) keys such as
# ('max', 'latitude'), which the following cells index by.
locOn = (events[(events.longitude != 0) & (events.latitude != 0)]
         .groupby('device_id')[['latitude', 'longitude']]
         .agg(['max', 'min', 'mean'])
         .swaplevel(0, 1, axis=1)
         .sort_index(axis=1))

In [44]:
# Spread of each coordinate per device: max minus min.
for coordName, rangeCol in (('latitude', 'latRange'), ('longitude', 'lonRange')):
    locOn[rangeCol] = locOn[('max', coordName)] - locOn[('min', coordName)]
locOn.head()

Unnamed: 0_level_0,max,max,mean,mean,min,min,latRange,lonRange
Unnamed: 0_level_1,latitude,longitude,latitude,longitude,latitude,longitude,Unnamed: 7_level_1,Unnamed: 8_level_1
device_id,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
-9222956879900151005,23.19,113.24,23.19,113.24,23.19,113.24,0.0,0.0
-9221825537663503111,34.92,113.77,34.204592,113.446735,33.46,113.36,1.46,0.41
-9221026417907250887,30.89,114.37,30.871515,114.362348,30.87,114.36,0.02,0.01
-9220452176650064280,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
-9220061629197656378,46.65,124.91,46.618974,124.888462,46.6,124.85,0.05,0.06


In [45]:
# Collapse the two-level column labels into single strings, e.g.
# ('max', 'latitude') -> 'max latitude'. The strip() removes the trailing space
# left by columns whose second level is empty.
flatNames = []
for levelParts in locOn.columns.values:
    flatNames.append(' '.join(levelParts).strip())
locOn.columns = flatNames
locOn.head()

Unnamed: 0_level_0,max latitude,max longitude,mean latitude,mean longitude,min latitude,min longitude,latRange,lonRange
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
-9222956879900151005,23.19,113.24,23.19,113.24,23.19,113.24,0.0,0.0
-9221825537663503111,34.92,113.77,34.204592,113.446735,33.46,113.36,1.46,0.41
-9221026417907250887,30.89,114.37,30.871515,114.362348,30.87,114.36,0.02,0.01
-9220452176650064280,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0
-9220061629197656378,46.65,124.91,46.618974,124.888462,46.6,124.85,0.05,0.06


In [46]:
# Parse the timestamp strings, then derive day-of-week (pandas numbering:
# Monday=0 ... Sunday=6) and hour-of-day for every event.
events['timestamp'] = pd.to_datetime(events['timestamp'])
eventTime = events['timestamp'].dt
events['dayOfWeek'] = eventTime.dayofweek
events['hourOfDay'] = eventTime.hour
events.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,dayOfWeek,hourOfDay
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,6,0
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,6,0
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,6,0
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,6,0
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,6,0


In [47]:
# One indicator column per weekday: day_0 ... day_6 ('_' is the default separator).
dayOneHot = pd.get_dummies(events['dayOfWeek'], prefix='day')

In [48]:
# One indicator column per hour: hour_0 ... hour_23 ('_' is the default separator).
hourOneHot = pd.get_dummies(events['hourOfDay'], prefix='hour')

In [49]:
# Place the indicator columns side by side with the original event columns.
eventFrames = [events, dayOneHot, hourOneHot]
eventsExt = pd.concat(eventFrames, axis=1)
eventsExt.head()

Unnamed: 0,event_id,device_id,timestamp,longitude,latitude,dayOfWeek,hourOfDay,day_0,day_1,day_2,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
0,1,29182687948017175,2016-05-01 00:55:25,121.38,31.24,6,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,2,-6401643145415154744,2016-05-01 00:54:12,103.65,30.97,6,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,3,-4833982096941402721,2016-05-01 00:08:05,106.6,29.7,6,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,4,-6815121365017318426,2016-05-01 00:06:40,104.27,23.28,6,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5,-5373797595892518570,2016-05-01 00:07:18,115.88,28.66,6,0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [50]:
# Sum the day_* / hour_* indicators per device: how many of its events fell on
# each weekday and in each hour.
activityCols = [name for name in eventsExt.columns
                if ('day_' in name) or ('hour_' in name)]
timeActivity = eventsExt.groupby('device_id')[activityCols].sum()
timeActivity.head()

Unnamed: 0_level_0,day_0,day_1,day_2,day_3,day_4,day_5,day_6,hour_0,hour_1,hour_2,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-9222956879900151005,0.0,0.0,0.0,0.0,30.0,35.0,0.0,0.0,0.0,0.0,...,5.0,22.0,0.0,0.0,0.0,0.0,4.0,6.0,0.0,3.0
-9222661944218806987,1.0,1.0,1.0,1.0,1.0,1.0,2.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.0,4.0,1.0,0.0,1.0,1.0,0.0
-9222399302879214035,2.0,2.0,5.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,3.0
-9221825537663503111,17.0,0.0,4.0,32.0,12.0,5.0,29.0,0.0,0.0,0.0,...,3.0,0.0,0.0,0.0,0.0,2.0,0.0,4.0,0.0,0.0
-9221767098072603291,1.0,1.0,3.0,1.0,0.0,0.0,2.0,0.0,0.0,0.0,...,1.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [51]:
# Left-join the per-device event features onto train by device_id, in the order
# totalEvents, numLocOff, location columns, timeActivity. Devices without
# events get NaN in the joined columns.
locationCols = ['latRange', 'lonRange', 'mean latitude', 'mean longitude']
trainExt = train
for deviceFeatures in (totalEvents, numLocOff, locOn[locationCols], timeActivity):
    trainExt = trainExt.merge(deviceFeatures, left_index=True, right_index=True, how='left')
trainExt.head()

Unnamed: 0_level_0,gender,age,group,trainrow,brand,model,totalEvents,numEventsLocOff,latRange,lonRange,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-8076087639492063270,M,35,M32-38,0,51,749,,,,,...,,,,,,,,,,
-2897161552818060146,M,35,M32-38,1,51,749,,,,,...,,,,,,,,,,
-8260683887967679142,M,35,M32-38,2,51,749,1.0,1.0,,,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-4938849341048082022,M,30,M29-31,3,51,1524,,,,,...,,,,,,,,,,
245133531816851882,M,30,M29-31,4,51,753,,,,,...,,,,,,,,,,


In [52]:
# Left-join the same per-device event features onto test, in the same order as
# for train.
locationCols = ['latRange', 'lonRange', 'mean latitude', 'mean longitude']
testExt = test
for deviceFeatures in (totalEvents, numLocOff, locOn[locationCols], timeActivity):
    testExt = testExt.merge(deviceFeatures, left_index=True, right_index=True, how='left')
testExt.head()

Unnamed: 0_level_0,testrow,brand,model,totalEvents,numEventsLocOff,latRange,lonRange,mean latitude,mean longitude,day_0,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1002079943728939269,0,51,1482,7.0,7.0,,,,,1.0,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,2.0,0.0
-1547860181818787117,1,51,1519,8.0,8.0,,,,,2.0,...,3.0,0.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0,0.0
7374582448058474277,2,31,1371,5.0,5.0,,,,,2.0,...,0.0,0.0,2.0,0.0,0.0,2.0,0.0,1.0,0.0,0.0
-6220210354783429585,3,31,1544,9.0,9.0,,,,,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
-5893464122623104785,4,51,749,,,,,,,,...,,,,,,,,,,


In [53]:
# Devices with no matching events have NaN in every joined column; replace all
# of them with 0 (this includes the mean latitude / longitude columns).
trainExt, testExt = trainExt.fillna(0), testExt.fillna(0)
trainExt.head()

Unnamed: 0_level_0,gender,age,group,trainrow,brand,model,totalEvents,numEventsLocOff,latRange,lonRange,...,hour_14,hour_15,hour_16,hour_17,hour_18,hour_19,hour_20,hour_21,hour_22,hour_23
device_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
-8076087639492063270,M,35,M32-38,0,51,749,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-2897161552818060146,M,35,M32-38,1,51,749,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-8260683887967679142,M,35,M32-38,2,51,749,1.0,1.0,0.0,0.0,...,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
-4938849341048082022,M,30,M29-31,3,51,1524,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
245133531816851882,M,30,M29-31,4,51,753,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [54]:
# List the column names of both extended frames for comparison.
for extFrame in (trainExt, testExt):
    print(extFrame.columns)

Index(['gender', 'age', 'group', 'trainrow', 'brand', 'model', 'totalEvents',
       'numEventsLocOff', 'latRange', 'lonRange', 'mean latitude',
       'mean longitude', 'day_0', 'day_1', 'day_2', 'day_3', 'day_4', 'day_5',
       'day_6', 'hour_0', 'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5',
       'hour_6', 'hour_7', 'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12',
       'hour_13', 'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18',
       'hour_19', 'hour_20', 'hour_21', 'hour_22', 'hour_23'],
      dtype='object')
Index(['testrow', 'brand', 'model', 'totalEvents', 'numEventsLocOff',
       'latRange', 'lonRange', 'mean latitude', 'mean longitude', 'day_0',
       'day_1', 'day_2', 'day_3', 'day_4', 'day_5', 'day_6', 'hour_0',
       'hour_1', 'hour_2', 'hour_3', 'hour_4', 'hour_5', 'hour_6', 'hour_7',
       'hour_8', 'hour_9', 'hour_10', 'hour_11', 'hour_12', 'hour_13',
       'hour_14', 'hour_15', 'hour_16', 'hour_17', 'hour_18', 'hour_19',
       'hour_20', 'hour_2

In [55]:
# Dense per-device features with the target and row-number columns removed.
# NOTE(review): 'brand' and 'model' stay in the dense part as integer codes, in
# addition to their one-hot blocks below -- confirm this is intended.
trainDense = trainExt.drop(['gender', 'age', 'group', 'trainrow'], axis=1)
testDense = testExt.drop('testrow', axis=1)

# Final layout: dense features | brand | model | apps | labels.
trainExtsparse = hstack(
    [trainDense, train_brand_sparse, train_model_sparse, train_apps_sparse, train_labels_sparse],
    format='csr')
testExtsparse = hstack(
    [testDense, test_brand_sparse, test_model_sparse, test_apps_sparse, test_labels_sparse],
    format='csr')

In [56]:
# Persist each CSR matrix as its raw components (data / indices / indptr / shape).
for outName, outMatrix in (('trainExtsparse', trainExtsparse),
                           ('testExtsparse', testExtsparse)):
    np.savez(outName, data=outMatrix.data, indices=outMatrix.indices,
             indptr=outMatrix.indptr, shape=outMatrix.shape)

#to restore sparse
def load_sparse_csr(filename):
    """Rebuild a CSR matrix from an archive written by the np.savez calls above.

    Parameters
    ----------
    filename : str
        Path of the .npz archive holding the 'data', 'indices', 'indptr' and
        'shape' arrays. np.savez appends '.npz' to names that lack it, so pass
        the name including that extension.

    Returns
    -------
    scipy.sparse.csr_matrix
        The matrix described by the stored components.
    """
    # The context manager closes the archive once the arrays have been read.
    with np.load(filename) as loader:
        return csr_matrix((loader['data'], loader['indices'], loader['indptr']),
                          shape=tuple(loader['shape']))

In [None]:
# Also export both matrices in Matrix Market format.
for mtxPath, mtxData in (("trainExtsparse2.mtx", trainExtsparse),
                         ('testExtsparse2.mtx', testExtsparse)):
    io.mmwrite(mtxPath, mtxData)