In [1]:
import random
import numpy as np
import time
import lightgbm as lgb

from sklearn import svm
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import linear_kernel
from sklearn import preprocessing
from sklearn.metrics import f1_score

from xgboost import XGBClassifier

import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import StratifiedKFold



In [2]:
# Number of "d2v" embedding columns per text field (description/title) and per
# side (source/target). It bounds the index range used when the embedding
# feature-column names are generated for the model input.
# NOTE(review): "d2v" presumably stands for doc2vec and 30 for the embedding
# dimension used when the CSVs were produced -- confirm against that pipeline.
N_VECTORIZATION = 30

## Preparation of the data

In [3]:
# Folder (relative to the notebook) that holds the pre-computed feature CSVs.
path_to_data = "data/"

# Read the training and testing frames in one pass, in that order.
training_set, testing_set = (
    pd.read_csv(path_to_data + csv_name)
    for csv_name in ("improved_training_set.csv", "improved_testing_set.csv")
)

In [4]:
# Embedding columns come first, in this order: description/source,
# description/target, title/source, title/target -- each block running over
# indices 0 .. N_VECTORIZATION - 1.
selected_features = [
    text_field + "_d2v_" + str(idx) + "_" + side
    for text_field in ("description", "title")
    for side in ("source", "target")
    for idx in range(N_VECTORIZATION)
]

# Followed by the individually named columns.
selected_features.extend([
    "common_neighbor",
    "same_cluster",
    "jaccard",
    "diff_in_bc",
    "diff_in_inlinks",
    "diff_in_year",
    "author_nb_common",
    "author_is_one_common",
    "common_classification",
    "title_is_one_common",
    "title_nb_common_word",
    "cos_similarity_title",
    "cos_similarity_description",
    "target_eccentricty",
    "inlinks_target",
    "betweenness_author_target",
    "inlinks_author_target",
    "cos_similarity_tf_title",
    "cos_similarity_tf_description",
])

# Sanity check: total number of feature columns handed to the model.
print(len(selected_features))


139


In [5]:
# Model inputs are the selected columns of each frame; the target is the
# `label` column of the training frame.
X_train = training_set[selected_features]
X_test = testing_set[selected_features]
Y_train = training_set.label

# Remove the full-frame names from the namespace.
del training_set, testing_set

## Classification

In [None]:
# LightGBM training configuration (binary classification, AUC as eval metric).
#
# NOTE(review): per the LightGBM parameter docs, 'application' is an alias of
# 'objective', so the first two entries express the same setting.
# NOTE(review): 'num_iterations' is an alias for the number of boosting rounds;
# if the training call also passes num_boost_round, check which of the two the
# installed LightGBM version honours.
parameters = dict(
    application='binary',
    objective='binary',
    metric='auc',
    is_unbalance='true',
    boosting='gbdt',  # alternative tried: 'dart'
    num_leaves=80,
    feature_fraction=0.7,
    min_data_in_leaf=500,
    learning_rate=0.1,
    num_iterations=500,
    max_bin=255,
    verbosity=-2,
)

In [None]:
# Stratified K-fold cross-validation.
# Each fold trains a LightGBM model on part of the training data, scores it on
# the held-out part (F1 at a 0.5 threshold) and predicts probabilities for
# X_test. The per-fold test probabilities are averaged with equal weight and
# thresholded at 0.5 to produce the final labels in `prediction`.
average_score = 0
n_splits = 5

# Running sum of the per-fold probabilities on X_test. Starting from zeros
# (instead of aliasing the first fold's array and then adding it again) keeps
# every fold's contribution equal.
test_proba_sum = np.zeros(len(X_test))

skf = StratifiedKFold(n_splits=n_splits)
for train_index, test_index in skf.split(X_train, Y_train):
    sub_X_train, sub_X_test = X_train.iloc[train_index], X_train.iloc[test_index]
    sub_Y_train, sub_Y_test = Y_train.iloc[train_index], Y_train.iloc[test_index]

    lgb_train = lgb.Dataset(sub_X_train, sub_Y_train)
    lgb_eval = lgb.Dataset(sub_X_test, sub_Y_test, reference=lgb_train)

    # NOTE(review): `parameters` also carries 'num_iterations'; LightGBM may give
    # that entry precedence over num_boost_round -- confirm which limit applies.
    # NOTE(review): recent LightGBM releases replace the early_stopping_rounds /
    # verbose_eval keywords with callbacks -- confirm against the installed version.
    rfc = lgb.train(parameters,
                       lgb_train,
                       valid_sets=lgb_eval,
                       num_boost_round=5000,
                       early_stopping_rounds=100,
                       verbose_eval=False)

    # prediction_sub: predicted probabilities on this fold's validation split
    prediction_sub = rfc.predict(sub_X_test)

    # prediction_test: predicted probabilities on the testing set
    prediction_test = rfc.predict(X_test)

    # Hard labels for the validation split, used only for the F1 score
    prediction_sub = [1 if p>0.5 else 0 for p in prediction_sub]

    score_tmp = f1_score(sub_Y_test, prediction_sub)
    average_score += score_tmp
    print("f1 score is for this fold :", score_tmp)

    # Combination with previous folds: each fold is added exactly once
    test_proba_sum += prediction_test

    del(sub_X_train, sub_X_test, sub_Y_train, sub_Y_test, prediction_sub, prediction_test)

# Final label = 1 when the mean probability over the folds reaches 0.5
prediction = [1 if p >= 0.5 else 0 for p in test_proba_sum / n_splits]
print("final f1 score is", average_score/n_splits)
  



f1 score is for this fold : 0.975270259134


In [None]:
# Build the submission frame: a single "category" column holding the final
# labels, with the row index written out under the header "ID".
df_sub = pd.DataFrame({"category": prediction})
df_sub.to_csv(
    'output.csv',
    float_format='%.6f',
    index_label="ID",
)

print("done.")