In [1]:
import pandas as pd
import numpy as np

In [2]:
# Read the training and test CSV files with pandas, parsing the 'Dates' column as datetimes.
train, test = (
    pd.read_csv(csv_path, parse_dates=['Dates'])
    for csv_path in ('train.csv', 'test.csv')
)

In [3]:
# Preview only the first rows rather than rendering the whole test frame.
test.head(10)


Unnamed: 0,Id,Dates,DayOfWeek,PdDistrict,Address,X,Y
0,0,2015-05-10 23:59:00,Sunday,BAYVIEW,2000 Block of THOMAS AV,-122.399588,37.735051
1,1,2015-05-10 23:51:00,Sunday,BAYVIEW,3RD ST / REVERE AV,-122.391523,37.732432
2,2,2015-05-10 23:50:00,Sunday,NORTHERN,2000 Block of GOUGH ST,-122.426002,37.792212
3,3,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
4,4,2015-05-10 23:45:00,Sunday,INGLESIDE,4700 Block of MISSION ST,-122.437394,37.721412
5,5,2015-05-10 23:40:00,Sunday,TARAVAL,BROAD ST / CAPITOL AV,-122.459024,37.713172
6,6,2015-05-10 23:30:00,Sunday,INGLESIDE,100 Block of CHENERY ST,-122.425616,37.739351
7,7,2015-05-10 23:30:00,Sunday,INGLESIDE,200 Block of BANKS ST,-122.412652,37.739750
8,8,2015-05-10 23:10:00,Sunday,MISSION,2900 Block of 16TH ST,-122.418700,37.765165
9,9,2015-05-10 23:10:00,Sunday,CENTRAL,TAYLOR ST / GREEN ST,-122.413935,37.798886


In [4]:
#有六个特征同时出现在两个数据中，仔细观察会发现x，y成对的跟地址绑定在一起，所以我们只有4个特征可以使用

In [5]:
#他们是Dates 、DayOfWeek 、PdDistrict、 Address

In [7]:
# Print summary statistics (count / unique / top / freq) for each candidate feature column.
# The second tuple element is the trailing separator string printed after each summary.
for column_name, trailer in (
    ('Address', '\n '),
    ('PdDistrict', '\n'),
    ('DayOfWeek', '\n'),
    ('Dates', '\n'),
):
    print(train[column_name].describe(), trailer)

count                     878049
unique                     23228
top       800 Block of BRYANT ST
freq                       26533
Name: Address, dtype: object 
 
count       878049
unique          10
top       SOUTHERN
freq        157182
Name: PdDistrict, dtype: object 

count     878049
unique         7
top       Friday
freq      133734
Name: DayOfWeek, dtype: object 

count                  878049
unique                 389257
top       2011-01-01 00:01:00
freq                      185
first     2003-01-06 00:01:00
last      2015-05-13 23:53:00
Name: Dates, dtype: object 



In [8]:
#Address看来不能使用了，太多了，会引起矩阵维数爆炸，我对date进行特殊的处理使其能够使用，下文会讲

In [9]:
#除了id以外没有可以直接使用的数值特征，所以我们要进行特征预处理，因为要用朴素贝叶斯来分类，这里我采用one-hot的编码方式来应对大量的字符串分类

In [10]:
#sklearn的preprocessing模块里有一个LabelEncoder可以给类别编号

In [11]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing

# Encode each crime category string as an integer class id with LabelEncoder.
leCrime = preprocessing.LabelEncoder()
crime = leCrime.fit_transform(train.Category)



In [12]:
#我觉得日期对结果可能影响不大所以只取时间点，也就是小时，这样的话就不会出现维数爆炸的情况了。将街区，星期，和小时分别因子化
#这里使用Pandas的方法get_dummies()来使其因子化

In [13]:
# One-hot encode the police-district and weekday columns of the training data.
district = pd.get_dummies(train['PdDistrict'])
days = pd.get_dummies(train['DayOfWeek'])
# Keep only the hour of each timestamp and one-hot encode it as well.
hour = pd.get_dummies(train['Dates'].dt.hour)

In [14]:
# Combine the one-hot feature frames column-wise and append the encoded label as 'crime'.
traindata = pd.concat([days, district, hour], axis=1)
traindata['crime'] = crime
# Show only the first rows instead of rendering the full frame.
traindata.head(10)


Unnamed: 0,Friday,Monday,Saturday,Sunday,Thursday,Tuesday,Wednesday,BAYVIEW,CENTRAL,INGLESIDE,...,15,16,17,18,19,20,21,22,23,crime
0,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,37
1,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,21
2,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,21
3,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,16
4,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,16
5,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,16
6,0,0,0,0,0,0,1,0,0,1,...,0,0,0,0,0,0,0,0,1,36
7,0,0,0,0,0,0,1,1,0,0,...,0,0,0,0,0,0,0,0,1,36
8,0,0,0,0,0,0,1,0,0,0,...,0,0,0,0,0,0,0,0,1,16
9,0,0,0,0,0,0,1,0,1,0,...,0,0,0,0,0,0,0,0,1,16


In [15]:
# Apply the same one-hot encoding to the test data.
days = pd.get_dummies(test.DayOfWeek)
district = pd.get_dummies(test.PdDistrict)

hour = test.Dates.dt.hour
hour = pd.get_dummies(hour)

testData = pd.concat([hour, days, district], axis=1)

# get_dummies only creates columns for values present in the frame it is given,
# so add an all-zero column for any training feature that is absent from the
# test set; otherwise selecting the training feature columns from testData
# later would raise a KeyError.
for column in traindata.columns:
    if column != 'crime' and column not in testData.columns:
        testData[column] = 0

# Show only the first rows instead of rendering the full frame.
testData.head(10)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,BAYVIEW,CENTRAL,INGLESIDE,MISSION,NORTHERN,PARK,RICHMOND,SOUTHERN,TARAVAL,TENDERLOIN
0,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
5,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1,0
6,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
7,0,0,0,0,0,0,0,0,0,0,...,0,0,1,0,0,0,0,0,0,0
8,0,0,0,0,0,0,0,0,0,0,...,0,0,0,1,0,0,0,0,0,0
9,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0


In [19]:
# train_test_split lives in sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
from sklearn.model_selection import train_test_split
from sklearn import preprocessing
from sklearn.metrics import log_loss
from sklearn.naive_bayes import BernoulliNB
from sklearn.linear_model import LogisticRegression
import time

# Every column except the trailing 'crime' label is a model feature.
features = traindata.columns.tolist()[:-1]

# Hold out 1/5 of the labelled data as a validation set (4/5 is used for fitting).
# The fixed random_state makes the split, and therefore the reported losses, reproducible.
training, validation = train_test_split(traindata, train_size=.80, random_state=42)

# Fit a Bernoulli Naive Bayes classifier and time the fit.
model0 = BernoulliNB()
nbStart = time.time()
model0.fit(training[features], training['crime'])
nbCostTime = time.time() - nbStart

# Class probabilities for the validation rows; columns follow model0.classes_.
predicted = model0.predict_proba(validation[features])
print ("朴素贝叶斯建模耗时 %f 秒" %(nbCostTime))
# Pass the model's classes explicitly so the loss is still computable when a
# rare category happens to be absent from the validation split.
print ("朴素贝叶斯log损失为 %f" %(log_loss(validation['crime'], predicted, labels=model0.classes_)))



朴素贝叶斯建模耗时 1.671531 秒
朴素贝叶斯log损失为 2.584372


In [22]:
# Fit logistic regression on the same training split and report fit time and log loss.
model1 = LogisticRegression(C=.01)
lrStart = time.time()
model1.fit(training[features], training['crime'])
lrCostTime = time.time() - lrStart

# Class probabilities for the validation rows; columns follow model1.classes_.
predicted = model1.predict_proba(validation[features])
# Evaluate the loss once (it was previously computed twice, with the first
# result discarded) and pass the model's classes explicitly so the loss is
# still computable when a rare category is absent from the validation split.
lrLogLoss = log_loss(validation['crime'], predicted, labels=model1.classes_)
print ("逻辑回归建模耗时 %f 秒" %(lrCostTime))
print( "逻辑回归log损失为 %f" %(lrLogLoss))

逻辑回归建模耗时 124.033358 秒
逻辑回归log损失为 2.590564


In [23]:
#可以看到，如果选用逻辑回归，运算时间会长得多（约124秒，而朴素贝叶斯只需约1.7秒），而log损失并没有更好

In [24]:
# Predict class probabilities for every test row with the Naive Bayes model,
# selecting the test columns by the training feature names.
test_predicted = np.array(
    model0.predict_proba(testData.loc[:, features])
)

In [25]:
# Display the array of predicted class probabilities for the test set.
test_predicted 

array([[6.34135218e-03, 1.27731981e-01, 2.36411627e-05, ...,
        1.20179275e-01, 3.92967493e-02, 2.38106060e-02],
       [6.34135218e-03, 1.27731981e-01, 2.36411627e-05, ...,
        1.20179275e-01, 3.92967493e-02, 2.38106060e-02],
       [1.80715134e-03, 8.78723326e-02, 3.64666033e-05, ...,
        8.64556044e-02, 3.51762326e-02, 8.57051279e-03],
       ...,
       [2.51315381e-03, 1.11299580e-01, 1.51228165e-03, ...,
        8.17241473e-02, 2.62318083e-02, 1.33736792e-02],
       [5.91353362e-03, 1.15706805e-01, 1.50101470e-03, ...,
        5.55168832e-02, 4.21181579e-02, 1.88462871e-02],
       [2.21236876e-03, 8.35286288e-02, 2.24063762e-03, ...,
        6.41818724e-02, 2.02528035e-02, 7.55767932e-03]])

In [26]:
# Name the probability columns after the classes the model actually learned.
# model0.classes_ holds the encoded class ids in predict_proba column order,
# and leCrime.inverse_transform maps them back to the category strings, so the
# names always line up with the prediction columns, even if a rare category
# was missing from the training split.
col_names = leCrime.inverse_transform(model0.classes_)

In [30]:
# Display the crime-category names used as the result columns.
col_names

array(['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
       'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
       'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
       'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
       'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
       'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
       'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
       'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
       'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
       'WARRANTS', 'WEAPON LAWS'], dtype=object)

In [27]:
# Assemble the predicted probabilities into a DataFrame with one column per crime category.
result = pd.DataFrame(test_predicted, columns=col_names)

In [28]:
result['Id'] = test['Id'].astype(int) 
# Cast the test Ids with astype(int) and append them to result as the last column, 'Id'.
# NOTE(review): the original note said this turns int64 into int32; the integer width
# produced by astype(int) is presumably platform-dependent — confirm if the dtype matters.

In [29]:
# Preview only the first rows rather than rendering the whole result frame.
result.head(10)

Unnamed: 0,ARSON,ASSAULT,BAD CHECKS,BRIBERY,BURGLARY,DISORDERLY CONDUCT,DRIVING UNDER THE INFLUENCE,DRUG/NARCOTIC,DRUNKENNESS,EMBEZZLEMENT,...,STOLEN PROPERTY,SUICIDE,SUSPICIOUS OCC,TREA,TRESPASS,VANDALISM,VEHICLE THEFT,WARRANTS,WEAPON LAWS,Id
0,0.006341,0.127732,0.000024,0.000932,0.029455,0.001788,0.006808,0.033324,0.005623,0.000224,...,0.003560,0.000151,0.029830,6.861297e-07,0.003419,0.087856,0.120179,0.039297,0.023811,0
1,0.006341,0.127732,0.000024,0.000932,0.029455,0.001788,0.006808,0.033324,0.005623,0.000224,...,0.003560,0.000151,0.029830,6.861297e-07,0.003419,0.087856,0.120179,0.039297,0.023811,1
2,0.001807,0.087872,0.000036,0.000221,0.039563,0.003543,0.008465,0.027495,0.007239,0.000271,...,0.005971,0.000231,0.020151,7.209486e-08,0.003763,0.075365,0.086456,0.035176,0.008571,2
3,0.002635,0.120118,0.000023,0.001068,0.027390,0.001706,0.008452,0.018413,0.004871,0.000193,...,0.003441,0.000299,0.026040,9.554799e-08,0.002561,0.098865,0.172954,0.023927,0.016518,3
4,0.002635,0.120118,0.000023,0.001068,0.027390,0.001706,0.008452,0.018413,0.004871,0.000193,...,0.003441,0.000299,0.026040,9.554799e-08,0.002561,0.098865,0.172954,0.023927,0.016518,4
5,0.002412,0.093762,0.000036,0.000371,0.036588,0.001929,0.010825,0.015020,0.008612,0.000303,...,0.002865,0.000353,0.030583,1.207044e-07,0.003032,0.112118,0.141276,0.019214,0.009709,5
6,0.002635,0.120118,0.000023,0.001068,0.027390,0.001706,0.008452,0.018413,0.004871,0.000193,...,0.003441,0.000299,0.026040,9.554799e-08,0.002561,0.098865,0.172954,0.023927,0.016518,6
7,0.002635,0.120118,0.000023,0.001068,0.027390,0.001706,0.008452,0.018413,0.004871,0.000193,...,0.003441,0.000299,0.026040,9.554799e-08,0.002561,0.098865,0.172954,0.023927,0.016518,7
8,0.001297,0.105920,0.000025,0.001024,0.020293,0.008832,0.010279,0.049554,0.014816,0.000204,...,0.004432,0.000206,0.020808,1.477095e-07,0.004256,0.062176,0.086517,0.046333,0.013423,8
9,0.001649,0.091518,0.000050,0.000217,0.036954,0.005233,0.006326,0.012900,0.014431,0.000475,...,0.005126,0.000255,0.022297,2.191497e-07,0.005745,0.076336,0.070837,0.025828,0.006658,9


In [None]:
#上面就是测试集每一个样本各种Category的概率