# In [None]:
import pandas as pd
import numpy as np
import matplotlib 
import seaborn as sns
import matplotlib.pyplot as plt
import os
from sklearn.preprocessing import LabelEncoder
from scipy.sparse import csr_matrix, hstack
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import StratifiedKFold
from sklearn.metrics import log_loss
import datetime as dt
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve

# Directory holding the input CSV files.  Set the GENDER_AGE_DATA_DIR
# environment variable to read them from another location; when it is unset
# the original hard-coded path is used, so existing setups keep working.
datadir = os.environ.get(
    'GENDER_AGE_DATA_DIR',
    'C:/Users/Haisu Cai/Desktop/Downloads/machine learning',
)

# Train / test tables, indexed by device_id.
gatrain = pd.read_csv(os.path.join(datadir, 'gender_age_train.csv'),
                      index_col='device_id')
gatest = pd.read_csv(os.path.join(datadir, 'gender_age_test.csv'),
                     index_col='device_id')

# Phone brand / model per device.  Duplicate device ids are dropped (first
# occurrence kept) so that device_id can serve as a unique index.
phone = pd.read_csv(os.path.join(datadir, 'phone_brand_device_model.csv'))
phone = phone.drop_duplicates('device_id', keep='first').set_index('device_id')

# Events, indexed by event_id, with the timestamp column parsed as datetimes.
events = pd.read_csv(os.path.join(datadir, 'events.csv'),
                     parse_dates=['timestamp'], index_col='event_id')

# App usage per event; only the three columns needed below are loaded.
appevents = pd.read_csv(os.path.join(datadir, 'app_events.csv'),
                        usecols=['event_id', 'app_id', 'is_active'],
                        dtype={'is_active': bool})

# Labels attached to each app.
applabels = pd.read_csv(os.path.join(datadir, 'app_labels.csv'))


# Give every device a 0-based row position.  These positions are used further
# down as the row coordinates of the sparse feature matrices.
gatrain['trainrow'] = np.arange(len(gatrain))
gatest['testrow'] = np.arange(len(gatest))

# ---- Phone-brand features (one column per brand) ----
brandencoder = LabelEncoder().fit(phone.phone_brand)
phone['brand'] = brandencoder.transform(phone['phone_brand'])
# Column assignment aligns on the index: gatrain, gatest and phone are all
# indexed by device_id, so each device picks up its own brand code even though
# the frames have different numbers of rows.
gatrain['brand'] = phone['brand']
gatest['brand'] = phone['brand']

# Pass the shape explicitly.  Without it csr_matrix infers the column count
# from the largest brand code present, so the train and test matrices could
# end up with different widths.
nbrands = len(brandencoder.classes_)
Xtr_brand = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.brand)),
                       shape=(gatrain.shape[0], nbrands))
Xte_brand = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.brand)),
                       shape=(gatest.shape[0], nbrands))
print('Brand features: train shape {}, test shape {}'.format(Xtr_brand.shape, Xte_brand.shape))

# ---- Device-model features (one column per brand+model string) ----
# The model name is prefixed with the brand before encoding, so the encoded
# value identifies the brand/model combination.
m = phone.phone_brand.str.cat(phone.device_model)
modelencoder = LabelEncoder().fit(m)
phone['model'] = modelencoder.transform(m)
# Index-aligned assignment: every frame involved is indexed by device_id.
gatrain['model'] = phone['model']
gatest['model'] = phone['model']

# Explicit shape keeps the train and test matrices the same width even when
# the highest model code does not occur in one of the two sets.
nmodels = len(modelencoder.classes_)
Xtr_model = csr_matrix((np.ones(gatrain.shape[0]),
                        (gatrain.trainrow, gatrain.model)),
                       shape=(gatrain.shape[0], nmodels))
Xte_model = csr_matrix((np.ones(gatest.shape[0]),
                        (gatest.testrow, gatest.model)),
                       shape=(gatest.shape[0], nmodels))
print('Model features: train shape {}, test shape {}'.format(Xtr_model.shape, Xte_model.shape))


# ---- App features (one column per app) ----
# Encode app ids as consecutive integers; they become column indices.
appencoder = LabelEncoder()
appevents['app'] = appencoder.fit_transform(appevents.app_id)
napps = len(appencoder.classes_)

# Attach the device to every app event (event_id -> device_id lookup).
events_with_device = appevents.merge(events[['device_id']], how='left',
                                     left_on='event_id', right_index=True)
# Count rows per (device, app) pair; the result has one row per pair.
apps_per_device = (events_with_device
                   .groupby(by=['device_id', 'app'])['app']
                   .agg(['size']))
# Left-join the train / test row positions.  A device that is missing from a
# set gets NaN in the corresponding column.
deviceapps = (apps_per_device
              .merge(gatrain[['trainrow']], how='left',
                     left_index=True, right_index=True)
              .merge(gatest[['testrow']], how='left',
                     left_index=True, right_index=True)
              .reset_index())

# Keep only the pairs that belong to a training device and mark them with 1.
d = deviceapps.dropna(subset=['trainrow'])
Xtr_app = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.app)),
                     shape=(gatrain.shape[0], napps))
# Same for the test devices.
d = deviceapps.dropna(subset=['testrow'])
Xte_app = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.app)),
                     shape=(gatest.shape[0], napps))
print('Apps data: train shape {}, test shape {}'.format(Xtr_app.shape, Xte_app.shape))
# ---- App-label features (one column per label) ----
# Keep only the labels of apps that occur in appevents.  The explicit .copy()
# makes applabels an independent frame, so adding columns to it below does not
# operate on a slice of the original (pandas SettingWithCopyWarning).
applabels = applabels.loc[applabels.app_id.isin(appevents.app_id.unique())].copy()
applabels['app'] = appencoder.transform(applabels.app_id)
labelencoder = LabelEncoder().fit(applabels.label_id)
applabels['label'] = labelencoder.transform(applabels.label_id)
nlabels = len(labelencoder.classes_)

# Map every (device, app) pair to the labels of that app, count rows per
# (device, label), then left-join the train / test row positions (NaN when the
# device is not part of that set).
devicelabels = (deviceapps[['device_id', 'app']]
                .merge(applabels[['app', 'label']], on='app')
                .groupby(['device_id', 'label'])['app'].agg(['size'])
                .merge(gatrain[['trainrow']], how='left', left_index=True, right_index=True)
                .merge(gatest[['testrow']], how='left', left_index=True, right_index=True)
                .reset_index())

# Training devices: mark each (row, label) pair with 1.
d = devicelabels.dropna(subset=['trainrow'])
Xtr_label = csr_matrix((np.ones(d.shape[0]), (d.trainrow, d.label)),
                       shape=(gatrain.shape[0], nlabels))
# Test devices.
d = devicelabels.dropna(subset=['testrow'])
Xte_label = csr_matrix((np.ones(d.shape[0]), (d.testrow, d.label)),
                       shape=(gatest.shape[0], nlabels))
print('Labels data: train shape {}, test shape {}'.format(Xtr_label.shape, Xte_label.shape))




# ---- Encode the event hour into 6 four-hour groups, ignoring the date ----
#
#   hour 0-3   -> 4        hour 12-15 -> 1
#   hour 4-7   -> 5        hour 16-19 -> 2
#   hour 8-11  -> 0        hour 20-23 -> 3
#
# np.select evaluates the conditions in order and takes the first match, which
# reproduces the original if/elif chain for the whole column at once instead
# of looping over every event in Python.  Anything matching no condition
# (including a missing timestamp) falls through to the default group 3, as the
# final `else` branch did.
hour = events['timestamp'].dt.hour
events['timestamp_new'] = np.select(
    [hour <= 3, hour <= 7, hour <= 11, hour <= 15, hour <= 19],
    [4, 5, 0, 1, 2],
    default=3,
).astype(np.int64)

# ---- Time-group features (one column per time group) ----
# The encoder is only used to count how many distinct time groups exist.
timeencoder = LabelEncoder().fit(events.timestamp_new)
ntimestamp = len(timeencoder.classes_)

# Count events per (device, time group), then left-join the training row
# position of each device (NaN for devices outside the training set).
events_per_group = (events
                    .groupby(by=['device_id', 'timestamp_new'])['timestamp']
                    .agg(['size']))
devicetimestamp = (events_per_group
                   .merge(gatrain[['trainrow']], how='left',
                          left_index=True, right_index=True)
                   .reset_index())

# trainrow pins each device id to its fixed row position in the matrix.
d = devicetimestamp.dropna(subset=['trainrow'])
Xtr_timestamp = csr_matrix(
    (np.ones(d.shape[0]), (d.trainrow, d.timestamp_new)),
    shape=(gatrain.shape[0], ntimestamp),
)

# Concatenate all feature matrices column-wise into one CSR training matrix.
Xtrain = hstack(
    (Xtr_brand, Xtr_model, Xtr_app, Xtr_label, Xtr_timestamp),
    format='csr',
)


# ---- Age distribution ----
# Print the summary statistics explicitly: a bare expression in the middle of
# a script (or notebook cell) is evaluated and then silently discarded.
print(gatrain['age'].describe())
# Histogram of the raw ages, 20 bins, non-cumulative.
plt.hist(gatrain['age'], bins=20, cumulative=False)

# ---- Convert age into 6 ranges ----
#
#   age < 18        -> 0        29 <= age < 36 -> 3
#   18 <= age < 25  -> 1        36 <= age < 50 -> 4
#   25 <= age < 29  -> 2        age >= 50      -> 5
#
# np.digitize returns, for each age, the number of bin edges that are <= age,
# which is exactly the range index above (a missing age lands in the last
# range, as it did in the original if/elif chain).  The result is assigned
# positionally, so the device_id index does not need to be reset and restored.
age_bin_edges = [18, 25, 29, 36, 50]
gatrain['new_age'] = np.digitize(gatrain['age'].values, age_bin_edges)


# ---- Target: gender ----
# Encode the gender column as integer class labels.
targetencoder = LabelEncoder()
y = targetencoder.fit_transform(gatrain.gender)


# ---- Alternative target: age range ----
# To predict the age range instead, replace the target above with:
#y= gatrain['new_age']

# ---- Neural-network (MLP) model ----
# Hold out 30% of the rows for evaluation; the fixed random_state keeps the
# split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    Xtrain,
    y,
    test_size=0.3,
    random_state=42,
)

# NOTE(review): hidden_layer_sizes=1 appears to request a single hidden unit;
# confirm this is intended.
ANNModel = MLPClassifier(activation='relu', hidden_layer_sizes=1).fit(X_train, y_train)

# Score of the fitted model on the held-out rows.
score = ANNModel.score(X_test, y_test)
print(score)

# Hard class predictions for the held-out rows.
y_pred = ANNModel.predict(X_test)

# ---- Confusion matrix and ROC curve ----
# Store the result under its own name: rebinding `confusion_matrix` would
# shadow the imported scikit-learn function and make any later call to it
# (for example when re-running this code in the same session) fail.
conf_matrix = confusion_matrix(y_test, y_pred)
print(conf_matrix)

# Predicted probability of the class in column 1 of predict_proba.
y_pred_proba = ANNModel.predict_proba(X_test)[:, 1]

# Compute the AUC from the same probability scores that are used for the
# plotted curve; using the hard 0/1 predictions would report the area of a
# different (single-threshold) curve than the one shown.
ANN_roc_auc = roc_auc_score(y_test, y_pred_proba)
fpr, tpr, thr = roc_curve(y_test, y_pred_proba)

plt.figure()
plt.plot(fpr, tpr, label='ANN model (area = %0.2f)' % ANN_roc_auc)
# Diagonal reference line from (0, 0) to (1, 1).
plt.plot([0, 1], [0, 1], 'r--')
plt.xlim([0.0, 1.0])
plt.ylim([0.0, 1.05])
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.title('Gender Prediction')
plt.legend(loc="lower right")
plt.savefig('ANN_ROC')
plt.show()
