# 필요한 모듈 임포트

In [1]:
import pandas as pd
import lightgbm as lgb
import warnings
warnings.filterwarnings('ignore')
pd.set_option('display.max_colwidth', -1)
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

# 데이터 불러오기

In [2]:
train = pd.read_csv('../../week02/Eunhee/input_training.csv')
test = pd.read_csv('../../week02/Eunhee/input_testing.csv')

# 라벨 인코딩

In [3]:
def LabelEncoding(train_df,test_df,max_levels=2):
    for col in train_df:
        if train_df[col].dtype == 'object':
            if len(list(train_df[col].unique())) <= max_levels:
                le = preprocessing.LabelEncoder()
                le.fit(train_df[col])
                train_df[col]=le.transform(train_df[col])
                test_df[col]=le.transform(test_df[col])
    return [train_df,test_df]

def readInputAndEncode(input_path,train_file,test_file,target_column):
    training=pd.read_csv(input_path+train_file)
    testing=pd.read_csv(input_path+test_file)

    training,testing=LabelEncoding(training,testing)

    #print("Training Data Shape after Encoding ",training.shape)
    #print("Testing Data Shape after Encoding ",testing.shape)
    #Check if all train columns are there in test data. If not add the column to test data and replace it with zero
    train_cols=training.columns.tolist()
    test_cols=testing.columns.tolist()
    col_in_train_not_test=set(train_cols)-set(test_cols)
    for col in col_in_train_not_test:
        if col!=target_column:
            testing[col]=0
    col_in_test_not_train=set(test_cols)-set(train_cols)
    for col in col_in_test_not_train:
        training[col]=0
    print("Training Data Shape after Processing ",training.shape)
    print("Testing Data Shape after Processing ",testing.shape)
    return [training,testing]

# 분석에 필요없는 컬럼 삭제하기

In [4]:
train,test=readInputAndEncode("../../week02/Eunhee/",'input_training.csv','input_testing.csv','log_trip_duration')
train.drop(['pickup_date'],axis=1,inplace=True)
test.drop(['pickup_date'],axis=1,inplace=True)
train.drop(['pickup_datetime'],axis=1,inplace=True)
test.drop(['pickup_datetime'],axis=1,inplace=True)
test_id=test['id']
train.drop(['id'],axis=1,inplace=True)
test.drop(['id'],axis=1,inplace=True)
train.drop(['vendor_id'], axis=1, inplace=True)
test.drop(['vendor_id'], axis=1, inplace=True)

Training Data Shape after Processing  (1458644, 18)
Testing Data Shape after Processing  (625134, 17)


## 범주형 변수화

In [5]:
train['store_and_fwd_flag'] = train['store_and_fwd_flag'].astype('category')
train['pickup_neighbourhood'] = train['pickup_neighbourhood'].astype('category')
train['dropoff_neighbourhood'] = train['dropoff_neighbourhood'].astype('category')

test['store_and_fwd_flag'] = test['store_and_fwd_flag'].astype('category')
test['pickup_neighbourhood'] = test['pickup_neighbourhood'].astype('category')
test['dropoff_neighbourhood'] = test['dropoff_neighbourhood'].astype('category')

In [6]:
train.info()
test.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1458644 entries, 0 to 1458643
Data columns (total 14 columns):
 #   Column                    Non-Null Count    Dtype   
---  ------                    --------------    -----   
 0   passenger_count           1458644 non-null  int64   
 1   store_and_fwd_flag        1458644 non-null  category
 2   pickup_day                1458644 non-null  int64   
 3   pickup_hour               1458644 non-null  int64   
 4   pickup_day_of_week        1458644 non-null  int64   
 5   pickup_latitude_round3    1458644 non-null  float64 
 6   pickup_longitude_round3   1458644 non-null  float64 
 7   dropoff_latitude_round3   1458644 non-null  float64 
 8   dropoff_longitude_round3  1458644 non-null  float64 
 9   trip_distance             1458644 non-null  float64 
 10  bearing                   1458644 non-null  float64 
 11  pickup_neighbourhood      1458644 non-null  category
 12  dropoff_neighbourhood     1458644 non-null  category
 13  log_trip_dur

In [7]:
train.corr().style.background_gradient()

Unnamed: 0,passenger_count,pickup_day,pickup_hour,pickup_day_of_week,pickup_latitude_round3,pickup_longitude_round3,dropoff_latitude_round3,dropoff_longitude_round3,trip_distance,bearing,log_trip_duration
passenger_count,1.0,0.002014,0.009101,0.005948,-0.005112,0.002167,-0.00276,-0.00034,0.010306,-0.000651,0.021124
pickup_day,0.002014,1.0,0.000414,0.002893,-0.006493,-0.000871,-0.005562,-0.000456,0.005982,-0.000226,0.010385
pickup_hour,0.009101,0.000414,1.0,0.021308,0.010639,0.01015,0.013633,-0.022454,-0.015813,-0.021959,0.039107
pickup_day_of_week,0.005948,0.002893,0.021308,1.0,-0.000947,-0.017992,-0.006674,-0.008958,-0.022942,0.000621,0.025439
pickup_latitude_round3,-0.005112,-0.006493,0.010639,-0.000947,1.0,0.022823,0.494087,0.114941,-0.210145,-0.111362,-0.144064
pickup_longitude_round3,0.002167,-0.000871,0.01015,-0.017992,0.022823,1.0,0.100219,0.783599,0.259735,-0.117929,0.110324
dropoff_latitude_round3,-0.00276,-0.005562,0.013633,-0.006674,0.494087,0.100219,1.0,0.124836,-0.142139,0.049923,-0.123296
dropoff_longitude_round3,-0.00034,-0.000456,-0.022454,-0.008958,0.114941,0.783599,0.124836,1.0,0.134047,0.170872,0.071418
trip_distance,0.010306,0.005982,-0.015813,-0.022942,-0.210145,0.259735,-0.142139,0.134047,1.0,-1.9e-05,0.572128
bearing,-0.000651,-0.000226,-0.021959,0.000621,-0.111362,-0.117929,0.049923,0.170872,-1.9e-05,1.0,-0.004829


승하차의 위도간 상관관계는 약 0.49, 경도간 상관관계는 약 0.78으로 택시는 비슷한 경도에서 남북을 오가는 경향이 있는 것으로 보인다.

# Train set과 test set 분리하기

In [8]:
raw_x = train[['passenger_count','store_and_fwd_flag','pickup_day','pickup_hour','pickup_day_of_week','pickup_latitude_round3','pickup_longitude_round3','dropoff_latitude_round3','dropoff_longitude_round3','trip_distance','bearing','pickup_neighbourhood','dropoff_neighbourhood']]
raw_y = train[['log_trip_duration']]

In [9]:
train_x,test_x, train_y, test_y = train_test_split(raw_x, raw_y, test_size=0.2, random_state=42)

# LGBM regressor + GridSearchCV

In [10]:
model_lgb = lgb.LGBMRegressor()
parameters = {'n_estimators':[100,200,300,400,500,600],
              'objective':['regression'],
              'learning_rate':[0.01,0.05,0.1],
              'max_depth':[8,10,12,14,16,18,20,22],
              'num_leaves':[7,15,20,30,40],
              'num_iterations':[1000,1500,2000]}
grid_lgb = GridSearchCV(model_lgb, param_grid=parameters, cv=3, scoring='neg_mean_absolute_error', refit=True)
grid_lgb.fit(train_x, train_y)
# em = grid_lgb.best_estimator_
# pred = em.predict(test_x)
# accuracy_score(test_y, pred)

best parameters :  {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 100, 'num_iterations': 2000, 'num_leaves': 40, 'objective': 'regression'}
best score :  -0.2422678823911808


In [12]:
print('best parameters : ', grid_lgb.best_params_)
print('mean absolute error : ', -grid_lgb.best_score_)

best parameters :  {'learning_rate': 0.1, 'max_depth': 20, 'n_estimators': 100, 'num_iterations': 2000, 'num_leaves': 40, 'objective': 'regression'}
mean absolute error :  0.2422678823911808
