In [35]:
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn import preprocessing
from sklearn.model_selection import cross_val_score
from __future__ import division

# Load the raw training data from the working directory.
train = pd.read_csv('train.csv')

# Label encodings. Weekdays get a fixed Monday..Sunday numbering; the other
# categorical columns are numbered in order of first appearance in `train`.
weekdays = {name: float(pos) for pos, name in enumerate(
    ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'])}
categories = dict((label, code) for code, label in enumerate(train['Category'].unique()))
resolutions = dict((label, code) for code, label in enumerate(train['Resolution'].unique()))
# Reverse lookup: code -> category name
cat_rev = dict(enumerate(train['Category'].unique()))
districts = dict((label, code) for code, label in enumerate(train['PdDistrict'].unique()))
# Reverse lookup: code -> district name
dis_rev = dict(enumerate(train['PdDistrict'].unique()))

# Derive numeric time features from the 'Dates' strings, which are split as
# "<year>-<month>-<day> <hour>:<minute>:..." below.
train['Hour'] = [float(int(stamp.split(' ')[1].split(':')[0]))
                 for stamp in train.Dates]

train['Minute'] = [float(int(stamp.split(' ')[1].split(':')[1]))
                   for stamp in train.Dates]

train['Month'] = [float(stamp.split(' ')[0].split('-')[1]) for stamp in train.Dates]

# Year is stored as an offset from 2003.
train['Year'] = [float(stamp.split(' ')[0].split('-')[0]) - 2003. for stamp in train.Dates]


train['Day'] = [float(stamp.split(' ')[0].split('-')[2]) for stamp in train.Dates]

# Numeric encodings of the categorical columns (stored as floats).
train['Day_Num'] = [float(weekdays[day_name]) for day_name in train.DayOfWeek]

train['District_Num'] = [float(districts[district]) for district in train.PdDistrict]

train['Category_Num'] = [float(categories[category]) for category in train.Category]

train['Resolution_Num'] = [float(resolutions[resolution]) for resolution in train.Resolution]

# Shift the X,Y coordinates by fixed offsets, then pass them through
# preprocessing.scale
train['X'] = preprocessing.scale([x_coord + 122.4194 for x_coord in train.X])
train['Y'] = preprocessing.scale([y_coord - 37.7749 for y_coord in train.Y])

# Assign binary value to address by type
def define_address(addr):
    """Encode an address string as a binary address-type flag.

    Address types:
      Intersection: 1.  (contains '/' and does not contain the substring 'of')
      Residence:    0.  (everything else)

    Parameters
    ----------
    addr : str
        Raw address text.

    Returns
    -------
    float
        1. for an intersection, 0. otherwise.
    """
    # The substring test is case-sensitive, exactly as written: only a
    # lowercase 'of' disqualifies an address from being an intersection.
    is_intersection = '/' in addr and 'of' not in addr
    return 1. if is_intersection else 0.
    
# Define address feature (1. = intersection, 0. = other) from the raw address text
train['Address_Num'] = [define_address(address) for address in train.Address]

# Feature selection: column groups that are concatenated into the model input
X_loc = ['X', 'Y', 'District_Num', 'Address_Num']
X_time = ['Minute', 'Hour']
X_date = ['Year','Month', 'Day', 'Day_Num']
X_resol = ['Resolution_Num']
X_all = X_date + X_time + X_loc + X_resol

# Category column we want to predict
y = 'Category_Num'

# Single-argument print(...) behaves the same under Python 2 and Python 3.
print(type(train))
print(train[X_all])



<class 'pandas.core.frame.DataFrame'>
        Year  Month   Day  Day_Num  Minute  Hour         X         Y  \
0       12.0    5.0  13.0      2.0    53.0  23.0 -0.107902  0.007832   
1       12.0    5.0  13.0      2.0    53.0  23.0 -0.107902  0.007832   
2       12.0    5.0  13.0      2.0    33.0  23.0 -0.057541  0.064335   
3       12.0    5.0  13.0      2.0    30.0  23.0 -0.144262  0.065338   
4       12.0    5.0  13.0      2.0    30.0  23.0 -0.531112  0.001140   
5       12.0    5.0  13.0      2.0    30.0  23.0  0.637950 -0.126046   
6       12.0    5.0  13.0      2.0    30.0  23.0 -0.023408 -0.100422   
7       12.0    5.0  13.0      2.0    30.0  23.0  1.691468 -0.095113   
8       12.0    5.0  13.0      2.0     0.0  23.0 -2.819355  0.012215   
9       12.0    5.0  13.0      2.0     0.0  23.0  0.116256  0.080503   
10      12.0    5.0  13.0      2.0    58.0  22.0  0.116256  0.080503   
11      12.0    5.0  13.0      2.0    30.0  22.0 -2.153504 -0.073001   
12      12.0    5.0  13.0 

In [36]:
# Pull the selected feature columns out of the DataFrame as a NumPy array.
train_value = train[X_all].values
print(type(train_value))
print(train_value.shape)

<type 'numpy.ndarray'>
(878049, 11)


In [40]:
# Dump the feature matrix to "crime.data": one sample per line, values
# rendered with str() and separated by single spaces (no trailing space).
# The `with` block closes the file even if a write fails part-way through.
with open("crime.data", "w") as f_data:
    for row in train_value:
        f_data.write(" ".join(str(d) for d in row) + "\n")

In [13]:
# Number of training samples in the feature matrix.
print(len(train_value))

878049


In [2]:
# Sanity checks: the category -> code mapping and the dataset sizes.
# NOTE(review): `test` is not created in any cell visible here; it has to be
# loaded and given the same feature columns as `train` before this cell runs.
print(categories)
print(len(categories))
print(test.shape)
print(train.shape)

{'KIDNAPPING': 19, 'WEAPON LAWS': 8, 'SECONDARY CODES': 15, 'WARRANTS': 0, 'PROSTITUTION': 23, 'EMBEZZLEMENT': 29, 'LOITERING': 31, 'SUICIDE': 30, 'DRIVING UNDER THE INFLUENCE': 21, 'SEX OFFENSES FORCIBLE': 22, 'ROBBERY': 6, 'BURGLARY': 9, 'SUSPICIOUS OCC': 10, 'FAMILY OFFENSES': 26, 'BRIBERY': 28, 'FORGERY/COUNTERFEITING': 12, 'BAD CHECKS': 35, 'DRUNKENNESS': 11, 'GAMBLING': 34, 'OTHER OFFENSES': 1, 'RECOVERED VEHICLE': 37, 'FRAUD': 18, 'ARSON': 25, 'DRUG/NARCOTIC': 13, 'TRESPASS': 16, 'LARCENY/THEFT': 2, 'VANDALISM': 4, 'NON-CRIMINAL': 5, 'EXTORTION': 33, 'PORNOGRAPHY/OBSCENE MAT': 38, 'LIQUOR LAWS': 27, 'SEX OFFENSES NON FORCIBLE': 32, 'TREA': 36, 'VEHICLE THEFT': 3, 'STOLEN PROPERTY': 14, 'ASSAULT': 7, 'MISSING PERSON': 17, 'DISORDERLY CONDUCT': 24, 'RUNAWAY': 20}
39
(884262, 15)
(878049, 18)


In [13]:
# Create random forest classifier
# NOTE(review): no random_state is passed, so the fitted forest and the
# scores below can differ from run to run.
clf = RandomForestClassifier(max_features="log2", max_depth=11, n_estimators=24,
                             min_samples_split=1000, oob_score=True)
# Fit on the full training set, then predict on the test features:
# `pred` holds per-class probabilities, `pred_c` the predicted class codes.
clf.fit(train[X_all], train[y])
pred = clf.predict_proba(test[X_all])
pred_c = clf.predict(test[X_all])
# 5-fold cross-validation on the training data
score = cross_val_score(clf, train[X_all], train[y], cv = 5)
print(pred_c)
print(score)


[ 7.  1.  2. ...,  1.  1.  2.]
[ 0.20686121  0.23875795  0.22734908  0.18254709  0.07783776]


In [18]:
# Shape of the predicted probability matrix.
print(pred.shape)

(884262, 39)


In [7]:
# Random forest with default hyper-parameters (this trains a classifier; it
# does not generate random predictions).
clf_r = RandomForestClassifier()
# Fit on the training set and predict class probabilities for the test set
clf_r.fit(train[X_all], train[y])
pred_r = clf_r.predict_proba(test[X_all])
# 5-fold cross-validation scores on the training data
score_r = cross_val_score(clf_r, train[X_all], train[y], cv = 5)

NameError: name 'pred' is not defined

In [8]:
# Probabilities and cross-validation scores of the default random forest.
print(pred_r)
print(score_r)

[[ 0.          0.1         0.1        ...,  0.          0.          0.        ]
 [ 0.          0.61        0.05       ...,  0.          0.          0.        ]
 [ 0.          0.          0.4        ...,  0.          0.          0.        ]
 ..., 
 [ 0.05        0.15        0.03333333 ...,  0.          0.          0.        ]
 [ 0.          0.          0.3        ...,  0.          0.          0.        ]
 [ 0.          0.2         0.         ...,  0.          0.          0.        ]]
[ 0.17111459  0.13444335  0.12097192  0.10030523  0.06556527]


In [11]:
def bayes_naive_gaussien(X,Y,test):
    """Train a Gaussian naive Bayes model and predict on ``test``.

    The model is fitted on (X, Y); 5-fold cross-validation scores are then
    computed on the same (X, Y) data, and finally class probabilities are
    predicted for ``test``.

    Returns
    -------
    tuple
        (cross-validation scores, predicted class probabilities for ``test``)
    """
    from sklearn.naive_bayes import GaussianNB
    model = GaussianNB()
    model.fit(X, Y)
    cv_scores = cross_val_score(model, X, Y, cv=5)
    test_proba = model.predict_proba(test)
    return cv_scores, test_proba

def svm(X,Y,test):
    """Train a support-vector classifier and predict on ``test``.

    The model is fitted on (X, Y); 5-fold cross-validation scores are then
    computed on the same (X, Y) data, and finally class probabilities are
    predicted for ``test``.

    Returns
    -------
    tuple
        (cross-validation scores, predicted class probabilities for ``test``)
    """
    # Import the class directly so no local name shadows this function or
    # the sklearn.svm module.
    from sklearn.svm import SVC
    # probability=True is required for predict_proba; with the default
    # (probability=False) the predict_proba call below raises.
    model = SVC(probability=True)
    model.fit(X, Y)
    score = cross_val_score(model, X, Y, cv = 5)
    pred_svm = model.predict_proba(test)
    return score,pred_svm

In [4]:
# Gaussian naive Bayes: cross-validation scores and test-set probabilities.
score_nb,pred_nb = bayes_naive_gaussien(train[X_all],train[y],test[X_all])
print(score_nb)
print(pred_nb)


[ 0.10414235  0.08313735  0.0891459   0.08649006  0.07873186]
[[  1.09150613e-02   2.73882921e-02   3.53557350e-02 ...,   2.80142461e-01
    8.14209648e-03   3.35422644e-06]
 [  3.28967096e-02   1.53752135e-01   1.25261263e-01 ...,   0.00000000e+00
    2.14410205e-03   1.92851973e-06]
 [  8.68579741e-03   2.54102238e-02   5.80040528e-02 ...,   1.03344375e-01
    4.41717798e-03   8.33454289e-06]
 ..., 
 [  5.67698914e-03   2.70438629e-02   3.70605393e-02 ...,   1.75547061e-08
    2.19603526e-03   7.90806695e-06]
 [  4.43135658e-03   1.73011816e-02   1.99516189e-02 ...,   3.84687563e-07
    2.80302125e-03   3.43720474e-05]
 [  5.57683657e-03   2.54984850e-02   2.57728904e-02 ...,   4.94041386e-11
    1.27784205e-03   3.12735445e-05]]


In [None]:
# Support-vector classifier: cross-validation scores and test-set probabilities.
score_svm,pred_svm = svm(train[X_all],train[y],test[X_all])
print(score_svm)
print(pred_svm)

In [5]:
# Create submission
# Column p of the probability matrix is labelled with category cat_rev[p].
# Building the frame straight from the matrix avoids a Python-level lookup
# for every single (row, column) element.
submission_nb = pd.DataFrame(pred_nb,
                             columns=[cat_rev[p] for p in range(len(pred_nb[0]))])

# Row number doubles as the submission Id
submission_nb['Id'] = np.arange(len(submission_nb))

# 'Id' first, then the category columns in alphabetical order
submission_nb = submission_nb[['Id'] + sorted(train['Category'].unique())]
print(submission_nb.head())

# Write submission
submission_nb.to_csv('submission_nb.csv.gz', index=False, compression='gzip')

   Id     ARSON   ASSAULT    BAD CHECKS   BRIBERY      BURGLARY  \
0   0  0.013109  0.050605  1.911876e-06  0.003740  3.525824e-02   
1   1  0.006535  0.047812  2.857791e-09  0.000859  2.843887e-19   
2   2  0.009020  0.056024  1.590421e-05  0.001727  7.380018e-02   
3   3  0.011636  0.076732  8.805935e-06  0.003356  1.070643e-01   
4   4  0.011636  0.076732  8.805935e-06  0.003356  1.070643e-01   

   DISORDERLY CONDUCT  DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  \
0            0.000203                     0.006973       0.075879   
1            0.000500                     0.053793       0.145836   
2            0.000130                     0.014233       0.153403   
3            0.000225                     0.006413       0.017823   
4            0.000225                     0.006413       0.017823   

   DRUNKENNESS     ...       SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  \
0     0.019005     ...                        0.000061         0.001521   
1     0.036277     ...          

In [None]:
# Create submission
# Column p of the probability matrix is labelled with category cat_rev[p].
# Building the frame straight from the matrix avoids a Python-level lookup
# for every single (row, column) element.
submission_svm = pd.DataFrame(pred_svm,
                              columns=[cat_rev[p] for p in range(len(pred_svm[0]))])

# Row number doubles as the submission Id. The length is taken from this
# frame itself rather than from the unrelated naive-Bayes submission.
submission_svm['Id'] = np.arange(len(submission_svm))

# 'Id' first, then the category columns in alphabetical order
submission_svm = submission_svm[['Id'] + sorted(train['Category'].unique())]
print(submission_svm.head())

# Write submission
submission_svm.to_csv('submission_svm.csv.gz', index=False, compression='gzip')

In [3]:
# Random-guess baseline: draw one class code per training row uniformly at
# random and measure how often it matches the true label.
# np.random.randint excludes its upper bound, so the bound must be the number
# of classes for every code 0..len(categories)-1 to be drawable.
random_pre = np.random.randint(0, len(categories), size=len(train))

print(len(random_pre))
tot = len(train)
# Element-wise comparison against the encoded labels, in row order
correct = int((random_pre == train[y].values).sum())
print("OK : "+str(correct)+" / "+str(tot)+ " -> "+ str(float(correct)/tot))

878049
OK : 23106 / 878049 -> 0.0263151600879


In [3]:
# Random probability baseline: one row per test sample, one column per class.
# Every one of the 39 columns gets a uniform random draw (including the last
# one), then each row is rescaled so that it sums to 1.
random_pre = np.random.random((len(test), 39))
random_pre = random_pre / random_pre.sum(axis=1, keepdims=True)

In [4]:
# The random probability matrix should have one row per test sample.
print(test.shape)
print(random_pre.shape)

(884262, 15)
(884262, 39)


In [5]:
# Create submission
# Column p of the probability matrix is labelled with category cat_rev[p].
# Building the frame straight from the matrix avoids a Python-level lookup
# for every single (row, column) element.
submission_rp = pd.DataFrame(random_pre,
                             columns=[cat_rev[p] for p in range(len(random_pre[0]))])

# Row number doubles as the submission Id
submission_rp['Id'] = np.arange(len(submission_rp))

# 'Id' first, then the category columns in alphabetical order
submission_rp = submission_rp[['Id'] + sorted(train['Category'].unique())]
print(submission_rp.head())

# Write submission
submission_rp.to_csv('submission_rp.csv.gz', index=False, compression='gzip')

   Id     ARSON   ASSAULT  BAD CHECKS   BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0  0.014551  0.021662    0.006229  0.033790  0.044332            0.035805   
1   1  0.033426  0.037216    0.038787  0.012907  0.046216            0.050442   
2   2  0.042736  0.043795    0.020108  0.036097  0.033364            0.013380   
3   3  0.025049  0.053034    0.016354  0.049401  0.053860            0.019421   
4   4  0.043367  0.011791    0.014072  0.036352  0.024347            0.034878   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS     ...       \
0                     0.039160       0.044120     0.012730     ...        
1                     0.052692       0.041891     0.004321     ...        
2                     0.042091       0.037877     0.001251     ...        
3                     0.019014       0.050570     0.056801     ...        
4                     0.057149       0.002597     0.050819     ...        

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY   SUICIDE  SUSPI

In [None]:
# Create submission
# Column p of the probability matrix is labelled with category cat_rev[p].
# Building the frame straight from the matrix avoids a Python-level lookup
# for every single (row, column) element.
submission = pd.DataFrame(pred,
                          columns=[cat_rev[p] for p in range(len(pred[0]))])

# Row number doubles as the submission Id
submission['Id'] = np.arange(len(submission))

# 'Id' first, then the category columns in alphabetical order
submission = submission[['Id'] + sorted(train['Category'].unique())]
print(submission.head())

# Write submission
submission.to_csv('submission.csv.gz', index=False, compression='gzip')

In [10]:
# Create submission
# Column p of the probability matrix is labelled with category cat_rev[p].
# Building the frame straight from the matrix avoids a Python-level lookup
# for every single (row, column) element.
submission_r = pd.DataFrame(pred_r,
                            columns=[cat_rev[p] for p in range(len(pred_r[0]))])

# Row number doubles as the submission Id
submission_r['Id'] = np.arange(len(submission_r))

# 'Id' first, then the category columns in alphabetical order
submission_r = submission_r[['Id'] + sorted(train['Category'].unique())]
print(submission_r.head())

# Write submission
submission_r.to_csv('submission_r.csv.gz', index=False, compression='gzip')

   Id  ARSON   ASSAULT  BAD CHECKS  BRIBERY  BURGLARY  DISORDERLY CONDUCT  \
0   0    0.0  0.166667         0.0      0.0       0.1                 0.0   
1   1    0.0  0.150000         0.0      0.0       0.0                 0.0   
2   2    0.0  0.200000         0.0      0.0       0.2                 0.0   
3   3    0.0  0.150000         0.0      0.0       0.0                 0.0   
4   4    0.0  0.150000         0.0      0.0       0.0                 0.0   

   DRIVING UNDER THE INFLUENCE  DRUG/NARCOTIC  DRUNKENNESS     ...       \
0                          0.0           0.00          0.0     ...        
1                          0.0           0.04          0.0     ...        
2                          0.0           0.00          0.0     ...        
3                          0.0           0.05          0.0     ...        
4                          0.0           0.05          0.0     ...        

   SEX OFFENSES NON FORCIBLE  STOLEN PROPERTY  SUICIDE  SUSPICIOUS OCC  TREA  \
0     