In [1]:
# Author: 1621430024
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
import lightgbm as lgb

Load data

In [2]:
# Read the zipped competition CSVs. 'Dates' is parsed to datetime64 on load
# so the calendar parts can be pulled out with the .dt accessor afterwards.
csv_options = dict(parse_dates=['Dates'])
train_data = pd.read_csv('../input/sf-crime/train.csv.zip', **csv_options)
test_data = pd.read_csv('../input/sf-crime/test.csv.zip', **csv_options)

Data info

In [3]:
# Print the column/dtype/null-count summary of the train table, then the test table.
for frame in (train_data, test_data):
    frame.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 878049 entries, 0 to 878048
Data columns (total 9 columns):
Dates         878049 non-null datetime64[ns]
Category      878049 non-null object
Descript      878049 non-null object
DayOfWeek     878049 non-null object
PdDistrict    878049 non-null object
Resolution    878049 non-null object
Address       878049 non-null object
X             878049 non-null float64
Y             878049 non-null float64
dtypes: datetime64[ns](1), float64(2), object(6)
memory usage: 60.3+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 884262 entries, 0 to 884261
Data columns (total 7 columns):
Id            884262 non-null int64
Dates         884262 non-null datetime64[ns]
DayOfWeek     884262 non-null object
PdDistrict    884262 non-null object
Address       884262 non-null object
X             884262 non-null float64
Y             884262 non-null float64
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 47.2+ MB


Transform data

In [4]:
# Build one feature matrix for train + test so that one-hot columns, scaling
# and PCA are fitted on the union and are therefore consistent across both.
#
# Outputs of this cell used downstream:
#   train_features / test_features : float ndarray slices of the matrix
#   train_labels                   : integer class index per training row
#   num_outputs / num_inputs       : number of classes / feature columns

# Select by name rather than by position so that a change in column order in
# either CSV cannot silently feed the wrong column into the model.
feature_columns = ['Dates', 'DayOfWeek', 'PdDistrict', 'Address', 'X', 'Y']
feature_frame = pd.concat(
    (train_data[feature_columns], test_data[feature_columns]),
    sort=False,
    ignore_index=True)  # unique row labels; train rows first, then test rows

num_train = train_data.shape[0]

# Class index = position of the category in the sorted list of category names
# (the same ordering pd.get_dummies would give to its columns), computed
# without materialising a dense rows-by-classes one-hot matrix.
train_labels, category_names = pd.factorize(train_data['Category'], sort=True)
num_outputs = len(category_names)

# Calendar parts of the timestamp.
timestamps = feature_frame['Dates'].dt
feature_frame['year'] = timestamps.year
feature_frame['month'] = timestamps.month
feature_frame['day'] = timestamps.day
feature_frame['hour'] = timestamps.hour
# Binary flags: January/February, and 18:00 or later.
feature_frame['new_year'] = feature_frame['month'].isin([1, 2]).astype(int)
feature_frame['evening'] = (feature_frame['hour'] >= 18).astype(int)

wkm = {
    'Monday': 0,
    'Tuesday': 1,
    'Wednesday': 2,
    'Thursday': 3,
    'Friday': 4,
    'Saturday': 5,
    'Sunday': 6
}
day_codes = feature_frame['DayOfWeek'].map(wkm)
if day_codes.isna().any():
    # .map() yields NaN for unknown names; fail loudly instead of training on NaN.
    unknown_days = sorted(
        feature_frame.loc[day_codes.isna(), 'DayOfWeek'].astype(str).unique())
    raise KeyError('Unexpected DayOfWeek value(s): %s' % unknown_days)
feature_frame['DayOfWeek'] = day_codes.astype(int)
# Weekend is Saturday and Sunday. The previous check used codes 4 and 5,
# which under `wkm` are Friday and Saturday, so Sunday was never flagged.
feature_frame['weekend'] = feature_frame['DayOfWeek'].isin(
    [wkm['Saturday'], wkm['Sunday']]).astype(int)

# 1 when the address text contains "block" (case-insensitive), else 0.
feature_frame['block'] = feature_frame['Address'].str.lower().str.contains(
    'block', regex=False).astype(int)

# Three feature groups, each with its own preprocessing.
PCA_features = feature_frame[['X', 'Y']].values
Standard_features = feature_frame[['DayOfWeek', 'year', 'month', 'day',
                                   'hour']].values.astype(np.float64)
# Cast the indicator block to float explicitly: depending on the pandas
# version get_dummies returns uint8 or bool columns, and mixing bool with the
# int flags would otherwise produce an object-dtype array.
OneHot_features = pd.concat(
    [
        pd.get_dummies(feature_frame['PdDistrict']),
        feature_frame[['new_year', 'evening', 'weekend', 'block']],
    ],
    axis=1).astype(np.float64).values

# Zero-mean / unit-variance scaling of the ordinal calendar columns.
scaler = StandardScaler()
Standard_features = scaler.fit_transform(Standard_features)

# Two components on two inputs: centres and rotates the coordinates onto
# their principal axes without dropping any dimension.
pca = PCA(n_components=2)
pca.fit(PCA_features)
PCA_features = pca.transform(PCA_features)

all_features = np.concatenate(
    (PCA_features, Standard_features, OneHot_features), axis=1)

# Split back into train / test by row position (train rows were stacked first).
train_features = all_features[:num_train]
num_inputs = train_features.shape[1]
test_features = all_features[num_train:]

In [5]:
# Wrap the training matrix and its integer class labels in a LightGBM Dataset.
data_train = lgb.Dataset(data=train_features, label=train_labels)

Train the LightGBM model and write the submission

In [6]:
# Multiclass gradient-boosted trees trained on the GPU; the model outputs one
# probability per class (num_class columns) for every row.
params = {
    'boosting': 'gbdt',
    'objective': 'multiclass',
    'metrics': 'multi_logloss',
    'num_class': num_outputs,
    'verbosity': 1,
    'device_type': 'gpu',
    'gpu_platform_id': 0,
    'gpu_device_id': 0,
    'max_depth': 6,
    'num_leaves': 51,
    'min_data_in_leaf': 25,
    'feature_fraction': 0.79,
    'learning_rate': 0.01,
}
gbm = lgb.train(params, data_train, num_boost_round=2000)
gbm.save_model('../working/gbm(v2).txt')

# Class probabilities for the test rows, in the same row order as test_data.
testResult = gbm.predict(test_features)

# The sample submission is read only for its header: every column after the
# first is used as a class-probability column name.
# NOTE(review): this assumes the header lists the categories in the same
# (sorted) order as the class indices in train_labels - confirm.
sampleSubmission = pd.read_csv('../input/sf-crime/sampleSubmission.csv.zip')
Result_pd = pd.DataFrame(
    testResult,
    # Label each prediction with the Id of the test row it was computed from,
    # instead of relying on the sample file's positional 0..n-1 index
    # happening to equal the Id values.
    index=pd.Index(test_data['Id'].values, name='Id'),
    columns=sampleSubmission.columns[1:])
Result_pd.to_csv('../working/sampleSubmission(gbmv2).csv', index_label='Id')