In [1]:
import pandas as pd
import numpy as np
import pandas_profiling as pp
import matplotlib.pyplot as plt

import string
import re

from os.path import join as path_join

from user_agents import parse
from sklearn.preprocessing import LabelEncoder

from dateutil.relativedelta import relativedelta

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import log_loss
from scipy.sparse import hstack

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

import warnings
warnings.filterwarnings('ignore')

## Read data

In [2]:
DATA_DIR = 'data/'

train = pd.read_csv(path_join(DATA_DIR, 'train.csv'), parse_dates=['date_created', 'user_date_created'])
test = pd.read_csv(path_join(DATA_DIR,'test.csv'), parse_dates=['date_created', 'user_date_created'])

## Preproccessing

In [3]:
train = train.sort_values(by='date_created', ascending=True)

train = train.loc[train['date_created'] >= pd.to_datetime('2016-01-01')]

### Creating feature - difference between creating_comment_time and registration_time

In [4]:
def time_difference_feature(df):
    diff = pd.to_datetime(df['date_created']) - pd.to_datetime(df['user_date_created'])
    result = diff.apply(lambda x: x.seconds // 60)
    return result

### Creating feature - count of comments 

In [5]:
def count_comments(df):
    '''merge return dataframe with train/test'''
    result = pd.DataFrame(df.groupby('user_id').size().reset_index(name='counts'))
    return result

In [6]:
get_dt_next = lambda ts: ts.diff().shift(1)
train['from_prev_comment'] = train.groupby('user_id').date_created.apply(get_dt_next).dt.total_seconds() // 60

test['from_prev_comment'] = test.groupby('user_id').date_created.apply(get_dt_next).dt.total_seconds() // 60

train['from_prev_comment'].fillna(2500000, inplace=True)
test['from_prev_comment'].fillna(2500000, inplace=True)

In [7]:
train['diff_time_comment_registration'] = time_difference_feature(train)
test['diff_time_comment_registration'] = time_difference_feature(test)

In [8]:
train = pd.merge(train, count_comments(train), how='left', on='user_id')
test = pd.merge(test, count_comments(test), how='left', on='user_id')

In [9]:
train['count_sent'] = train["comment"].apply(lambda x: len(re.findall("\n",str(x)))+1)
train['count_words'] = train["comment"].apply(lambda x: len(str(x).split()))
train['count_unique_words'] = train["comment"].apply(lambda x: len(set(str(x).split())))
train['count_letters'] = train["comment"].apply(lambda x: len(str(x)))
train["count_punctuations"] = train["comment"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
train["count_words_upper"] = train["comment"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
train["count_words_title"] = train["comment"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
train['word_unique_percent'] = train['count_unique_words'] * 100 / train['count_words']
train["weekend"] = ((train["date_created"].dt.dayofweek) // 5 == 1).astype(int)
train['month'] = train['date_created'].apply(lambda x: x.month)

test['count_sent'] = test["comment"].apply(lambda x: len(re.findall("\n",str(x)))+1)
test['count_words'] = test["comment"].apply(lambda x: len(str(x).split()))
test['count_unique_words'] = test["comment"].apply(lambda x: len(set(str(x).split())))
test['count_letters'] = test["comment"].apply(lambda x: len(str(x)))
test["count_punctuations"] = test["comment"].apply(lambda x: len([c for c in str(x) if c in string.punctuation]))
test["count_words_upper"] = test["comment"].apply(lambda x: len([w for w in str(x).split() if w.isupper()]))
test["count_words_title"] = test["comment"].apply(lambda x: len([w for w in str(x).split() if w.istitle()]))
test['word_unique_percent'] = test['count_unique_words'] * 100 / test['count_words']
test["weekend"] = ((test["date_created"].dt.dayofweek) // 5 == 1).astype(int)
test['month'] = test['date_created'].apply(lambda x: x.month)

In [10]:
#count amount of every ip for company
ip_amount_for_company_train = train.groupby(['company_id', 'user_ip']).user_ip.count()
ip_company_train = pd.DataFrame(ip_amount_for_company_train) 
ip_company_train.columns = ['ip_count_for_company']
ip_company_train = ip_company_train.reset_index()
train = pd.merge(train, ip_company_train, how='left', on=['company_id', 'user_ip'])
train.ip_count_for_company.fillna(1, inplace = True)

ip_amount_for_company_test = test.groupby(['company_id', 'user_ip']).user_ip.count()
ip_company_test = pd.DataFrame(ip_amount_for_company_test) 
ip_company_test.columns = ['ip_count_for_company']
ip_company_test = ip_company_test.reset_index()
test = pd.merge(test, ip_company_test, how='left', on=['company_id', 'user_ip'])
test.ip_count_for_company.fillna(1, inplace = True)

In [11]:
#count amount of every ip for product
ip_amount_for_product_train = train.groupby(['product_id', 'user_ip']).user_ip.count()
ip_product_train = pd.DataFrame(ip_amount_for_product_train) 
ip_product_train.columns = ['ip_count_for_product']
ip_product_train = ip_product_train.reset_index()
train = pd.merge(train, ip_product_train, how='left', on=['product_id', 'user_ip'])
train.ip_count_for_product.fillna(1, inplace = True)

ip_amount_for_product_test = test.groupby(['product_id', 'user_ip']).user_ip.count()
ip_product_test = pd.DataFrame(ip_amount_for_product_test) 
ip_product_test.columns = ['ip_count_for_product']
ip_product_test = ip_product_test.reset_index()
test = pd.merge(test, ip_product_test, how='left', on=['product_id', 'user_ip'])
test.ip_count_for_product.fillna(1, inplace = True)

In [12]:
#count amount of every id for company
id_amount_for_company_train = train.groupby(['company_id', 'user_id']).user_id.count()
id_company_train = pd.DataFrame(id_amount_for_company_train) 
id_company_train.columns = ['id_count_for_company']
id_company_train = id_company_train.reset_index()
train = pd.merge(train, id_company_train, how='left', on=['company_id', 'user_id'])
train.id_count_for_company.fillna(1, inplace = True)

id_amount_for_company_test = test.groupby(['company_id', 'user_id']).user_id.count()
id_company_test = pd.DataFrame(id_amount_for_company_test) 
id_company_test.columns = ['id_count_for_company']
id_company_test = id_company_test.reset_index()
test = pd.merge(test, id_company_test, how='left', on=['company_id', 'user_id'])
test.id_count_for_company.fillna(1, inplace = True)

In [13]:
#mean comments ammount within min - max date_created

train_user_min = pd.DataFrame(train.groupby(['user_id']).date_created.min())
train_user_min.columns = ['min_date']
train_user_min = train_user_min.reset_index()

train_user_max = pd.DataFrame(train.groupby(['user_id']).date_created.max())
train_user_max.columns = ['max_date']
train_user_max = train_user_max.reset_index()

train_user_diff_time = pd.merge(train_user_max, train_user_min, how='left', on='user_id')
train_user_diff_time['diff_time'] = (train_user_diff_time.max_date - train_user_diff_time.min_date).dt.total_seconds()//(60*24)
train_user_diff_time = pd.merge(train_user_diff_time, count_comments(train), how='left', on='user_id')
train_user_diff_time['mean_in_time'] = train_user_diff_time.counts/(train_user_diff_time.diff_time+100)
train_user_diff_time.drop(['max_date', 'min_date', 'diff_time', 'counts'], axis = 1, inplace = True)

train = pd.merge(train, train_user_diff_time, how='left', on='user_id')


test_user_min = pd.DataFrame(test.groupby(['user_id']).date_created.min())
test_user_min.columns = ['min_date']
test_user_min = test_user_min.reset_index()

test_user_max = pd.DataFrame(test.groupby(['user_id']).date_created.max())
test_user_max.columns = ['max_date']
test_user_max = test_user_max.reset_index()

test_user_diff_time = pd.merge(test_user_max, test_user_min, how='left', on='user_id')
test_user_diff_time['diff_time'] = (test_user_diff_time.max_date - test_user_diff_time.min_date).dt.total_seconds()//(60*24)
test_user_diff_time = pd.merge(test_user_diff_time, count_comments(test), how='left', on='user_id')
test_user_diff_time['mean_in_time'] = test_user_diff_time.counts/(test_user_diff_time.diff_time+100)
test_user_diff_time.drop(['max_date', 'min_date', 'diff_time', 'counts'], axis = 1, inplace = True)

test = pd.merge(test, test_user_diff_time, how='left', on='user_id')

In [46]:
train['user_agent'].fillna('None', inplace = True)
test['user_agent'].fillna('None', inplace = True)

ldevice = lambda x: x.device.brand
los = lambda x: x.os.family
lbrowser = lambda x: x.browser.family

In [47]:
train_ua = train['user_agent'].apply(parse)
train['device'] = train_ua.apply(ldevice)
train['os'] = train_ua.apply(los)
train['browser'] = train_ua.apply(lbrowser)

test_ua = train['user_agent'].apply(parse)
test['device'] = test_ua.apply(ldevice)
test['os'] = test_ua.apply(los)
test['browser'] = test_ua.apply(lbrowser)

In [48]:
le = LabelEncoder()
columns = ['os', 'device', 'browser']
for column in columns:
    train_ = train[column].astype(str).str.lower()
    test_ = test[column].astype(str).str.lower()
    train_test_union = set(train_).union(set(test_))
    le.fit(list(train_test_union))
    
    train[column] = le.transform(train_)
    test[column] = le.transform(test_)

## TFifVectorizer

In [20]:
stop_words = []
with open(path_join(DATA_DIR, 'stopwords-ru.txt'), 'r') as file:
    stop_words = file.read().split('\n')
    
with open(path_join(DATA_DIR, 'stop-words-russian.txt'), 'r') as file:
    stop_words += file.read().split('\n')
stop_words = set(stop_words)

stop_words.discard('')
stop_words.discard('\ufeffа')
 
stop_words = list(stop_words)

In [17]:
train_text = train['comment']
test_text = test['comment']
all_text = pd.concat([train_text, test_text])

word_vectorizer = TfidfVectorizer(max_features=250000,  
                                  stop_words=stop_words,
                                  analyzer='word',
                                  ngram_range=(1,3),
                                 )

word_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=250000, min_df=1,
        ngram_range=(1, 3), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['никто', 'этим', 'всюду', 'мор', 'вообще', 'нею', 'те', 'будто', 'русский', 'и', 'уже', 'рука', 'сама', 'слишком', 'наверху', 'бывь', 'этих', 'эта', 'вид', 'такое', 'тринадцатый', 'почти', 'ту', 'оно', 'мож', 'про', 'пятнадцатый', 'туда', 'назад', 'нельзя', 'сами', 'вдруг', 'голова', 'ше... 'мимо', 'хотя', 'им', 'дело', 'она', 'даром', 'низко', 'потом', 'утро', 'место', 'что', 'которого'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [18]:
char_vectorizer =  TfidfVectorizer(max_features=2500,  
                                  stop_words=stop_words,
                                  analyzer='char',
                                 )
char_vectorizer.fit(all_text)

TfidfVectorizer(analyzer='char', binary=False, decode_error='strict',
        dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
        lowercase=True, max_df=1.0, max_features=2500, min_df=1,
        ngram_range=(1, 1), norm='l2', preprocessor=None, smooth_idf=True,
        stop_words=['никто', 'этим', 'всюду', 'мор', 'вообще', 'нею', 'те', 'будто', 'русский', 'и', 'уже', 'рука', 'сама', 'слишком', 'наверху', 'бывь', 'этих', 'эта', 'вид', 'такое', 'тринадцатый', 'почти', 'ту', 'оно', 'мож', 'про', 'пятнадцатый', 'туда', 'назад', 'нельзя', 'сами', 'вдруг', 'голова', 'ше... 'мимо', 'хотя', 'им', 'дело', 'она', 'даром', 'низко', 'потом', 'утро', 'место', 'что', 'которого'],
        strip_accents=None, sublinear_tf=False,
        token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
        vocabulary=None)

In [19]:
def get_sparse_matrix(trainIndices, valIndices):
    """ create sparse matr by wordVectorizer """
    train_word_features = word_vectorizer.transform(train.loc[trainIndices, 'comment'])
    val_word_features = word_vectorizer.transform(train.loc[valIndices, 'comment'])
        
    return (train_word_features, val_word_features)

def get_sparse_matrix_char(trainIndices, valIndices):
    """ create sparse matrix by charVectorizer """
    train_word_features = char_vectorizer.transform(train.loc[trainIndices, 'comment'])
    val_word_features = char_vectorizer.transform(train.loc[valIndices, 'comment'])
        
    return (train_word_features, val_word_features)

## Validation

In [21]:
print(train['date_created'].min())
print(train['date_created'].max())

2016-01-01 00:22:55
2017-10-31 23:55:45


In [22]:
def create_validation(df, start_date):
    return (
        df.loc[np.logical_and(df['date_created'] >= pd.to_datetime(start_date) - relativedelta(days=0),
                              df['date_created'] <  pd.to_datetime(start_date) + relativedelta(months=6))].index,
        df.loc[np.logical_and(df['date_created'] >= pd.to_datetime(start_date) + relativedelta(months=6),
                              df['date_created'] <  pd.to_datetime(start_date) + relativedelta(months=10))].index
           )

train_dates = ['2016-01-01', '2016-06-01', '2017-01-01'] # split data to the 3 sub datasets

myCViterator = []
for i in train_dates:
    trainIndices, valIndices = create_validation(train, i)

    myCViterator.append([trainIndices, valIndices])

In [23]:
def get_date(df, index):
    return df.loc[index, 'date_created'].date()

for x, y in myCViterator:
    print('X: {} - {}, Y: {} - {}'.format(get_date(train, min(x)), get_date(train, max(x)),
                                          get_date(train, min(y)), get_date(train, max(y))))

X: 2016-01-01 - 2016-06-30, Y: 2016-07-01 - 2016-10-31
X: 2016-06-01 - 2016-11-30, Y: 2016-12-01 - 2017-03-31
X: 2017-01-01 - 2017-06-30, Y: 2017-07-01 - 2017-10-31


In [110]:
models = []

params = {
    'C': [0.001, 0.01, 0.1, 1, 5]
}

for c in params['C']:
    
    log_model = LogisticRegression(class_weight='balanced', random_state=42, C=c)
    
    scores = []
    
    for trainIndex, valIndex in myCViterator:
    
        matrix = get_sparse_matrix(trainIndex, valIndex)
        
        log_model.fit(matrix[0], train.loc[trainIndex, 'is_fake'])
        
        prediction = log_model.predict_proba(matrix[1])
        
        scores.append(log_loss(train.loc[valIndex, 'is_fake'], prediction))
        
    models.append({'c': c, 'score': np.mean(scores)})

for model in models:
    print(model)

{'c': 0.001, 'score': 0.6314092295368253}
{'c': 0.01, 'score': 0.46912167358469564}
{'c': 0.1, 'score': 0.2640957235860757}
{'c': 1, 'score': 0.1520383191963193}
{'c': 5, 'score': 0.12638545285138106}


## create new feature from the logReg predictions

In [35]:
log_model = LogisticRegression(class_weight='balanced', random_state=42, C=1)
predictions = []

for trainIndex, valIndex in myCViterator:
    matrix = get_sparse_matrix(trainIndex, valIndex)
    
    log_model.fit(matrix[0], train.loc[trainIndex, 'is_fake'])
    
    predictions.append([train.loc[valIndex, 'is_fake'], log_model.predict_proba(matrix[1])[:, 1]])
    
train["log_matrix_pred"] = 0

for i, fold in enumerate(myCViterator):
    trainIndex, valIndex = fold[0], fold[1]
    train.loc[valIndex, 'log_matrix_pred'] = predictions[i][1]
    
train.head()

Unnamed: 0,comment_id,company_id,user_id,product_id,comment,rating,date_created,user_date_created,user_ip,user_agent,...,count_words_title,word_unique_percent,weekend,month,ip_count_for_company,ip_count_for_product,id_count_for_company,mean_in_time,log_matrix_pred_char,log_matrix_pred
0,1067612,1089378,4997365,,"Хороший магазин,быстрая доставка.",5,2016-01-01 00:22:55,2016-01-01 00:22:55,91.192.132.251,Mozilla/5.0 (Linux; Android 4.4.2; LenovoA3300...,...,1,100.0,0,1,1.0,1.0,1,0.01,0.0,0.0
1,1067613,2364143,4997376,68308.0,"Ребята молодцы!!отлично сработали, очень опера...",5,2016-01-01 00:51:05,2016-01-01 00:51:05,79.140.1.116,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,...,1,100.0,0,1,1.0,1.0,1,0.01,0.0,0.0
2,375841,884214,1607869,318970.0,"Кухня вся разломана, поцарапана и вообще не ра...",1,2016-01-01 00:57:20,2013-10-13 13:06:58,194.54.160.10,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,...,3,95.454545,0,1,1.0,1.0,1,0.01,0.0,0.0
3,1060723,2100809,4954901,902536.0,"Швидко вийшли на зв'язок, вирішили усі організ...",5,2016-01-01 01:15:38,2015-12-23 22:24:59,91.243.6.71,Mozilla/5.0 (Linux; Android 4.2.1; Lenovo P780...,...,1,100.0,0,1,1.0,1.0,1,0.000297,0.0,0.0
4,1034089,2177682,4786680,,Купил квадрокоптер. Прислали не в полной компл...,2,2016-01-01 01:31:24,2015-11-28 23:58:27,188.163.84.118,Mozilla/5.0 (Android 4.2.2; Mobile; rv:41.0) G...,...,4,95.0,0,1,1.0,1.0,1,0.000362,0.0,0.0


In [24]:
log_model = LogisticRegression(class_weight='balanced', random_state=42, C=1)
predictions = []

for trainIndex, valIndex in myCViterator:
    matrix = get_sparse_matrix_char(trainIndex, valIndex)
    
    log_model.fit(matrix[0], train.loc[trainIndex, 'is_fake'])
    
    predictions.append([train.loc[valIndex, 'is_fake'], log_model.predict_proba(matrix[1])[:, 1]])
    
train["log_matrix_pred_char"] = 0

for i, fold in enumerate(myCViterator):
    trainIndex, valIndex = fold[0], fold[1]
    train.loc[valIndex, 'log_matrix_pred_char'] = predictions[i][1]
    
train.head()

Unnamed: 0,comment_id,company_id,user_id,product_id,comment,rating,date_created,user_date_created,user_ip,user_agent,...,count_words_upper,count_words_title,word_unique_percent,weekend,month,ip_count_for_company,ip_count_for_product,id_count_for_company,mean_in_time,log_matrix_pred_char
0,1067612,1089378,4997365,,"Хороший магазин,быстрая доставка.",5,2016-01-01 00:22:55,2016-01-01 00:22:55,91.192.132.251,Mozilla/5.0 (Linux; Android 4.4.2; LenovoA3300...,...,0,1,100.0,0,1,1.0,1.0,1,0.01,0.0
1,1067613,2364143,4997376,68308.0,"Ребята молодцы!!отлично сработали, очень опера...",5,2016-01-01 00:51:05,2016-01-01 00:51:05,79.140.1.116,Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKi...,...,0,1,100.0,0,1,1.0,1.0,1,0.01,0.0
2,375841,884214,1607869,318970.0,"Кухня вся разломана, поцарапана и вообще не ра...",1,2016-01-01 00:57:20,2013-10-13 13:06:58,194.54.160.10,Mozilla/5.0 (Windows NT 6.1) AppleWebKit/537.3...,...,0,3,95.454545,0,1,1.0,1.0,1,0.01,0.0
3,1060723,2100809,4954901,902536.0,"Швидко вийшли на зв'язок, вирішили усі організ...",5,2016-01-01 01:15:38,2015-12-23 22:24:59,91.243.6.71,Mozilla/5.0 (Linux; Android 4.2.1; Lenovo P780...,...,1,1,100.0,0,1,1.0,1.0,1,0.000297,0.0
4,1034089,2177682,4786680,,Купил квадрокоптер. Прислали не в полной компл...,2,2016-01-01 01:31:24,2015-11-28 23:58:27,188.163.84.118,Mozilla/5.0 (Android 4.2.2; Mobile; rv:41.0) G...,...,0,4,95.0,0,1,1.0,1.0,1,0.000362,0.0


In [25]:
train.columns

Index(['comment_id', 'company_id', 'user_id', 'product_id', 'comment',
       'rating', 'date_created', 'user_date_created', 'user_ip', 'user_agent',
       'is_fake', 'from_prev_comment', 'diff_time_comment_registration',
       'counts', 'count_sent', 'count_words', 'count_unique_words',
       'count_letters', 'count_punctuations', 'count_words_upper',
       'count_words_title', 'word_unique_percent', 'weekend', 'month',
       'ip_count_for_company', 'ip_count_for_product', 'id_count_for_company',
       'mean_in_time', 'log_matrix_pred_char'],
      dtype='object')

In [54]:
# features for the training our models

needful_features = ['rating', 'from_prev_comment', 'diff_time_comment_registration', 'counts', 
                    'count_sent', 'count_words', 'count_unique_words', 'count_letters', 'count_punctuations',
                    'count_words_upper', 'count_words_title', 'weekend', 'ip_count_for_company', 
                    'ip_count_for_product', 'id_count_for_company', 'month', 
                    'log_matrix_pred',  'gb_pred',
#                     'log_matrix_pred_char',
#                   'os', 'device', 'browser', 'mean_in_time',
]

In [156]:
models = []

params = {
    'C': [1]
}

for c in params['C']:
    
    # log_model = LogisticRegression(class_weight='balanced', random_state=42, C=c)
    gb = GradientBoostingClassifier(random_state=42)
    
    scores = []
    
    for trainIndex, valIndex in myCViterator:
    
        x_train, x_val = train.loc[trainIndex, needful_features], train.loc[valIndex, needful_features]
        y_train, y_val = train.loc[trainIndex, 'is_fake'], train.loc[valIndex, 'is_fake']
        
        gb.fit(x_train, y_train)
        
        prediction = gb.predict_proba(x_val)
        
        scores.append(log_loss(y_val, prediction))
    
    print(scores)
    models.append({'c': c, 'score': np.mean(scores)})

for model in models:
    print(model)

[0.012201427499560017, 0.007834636632392172, 0.13641371371674274]
{'c': 1, 'score': 0.052149925949564975}


In [27]:
# choose the time range for the train

# trainIndex = train.loc[train['date_created'] >=  pd.to_datetime('2017-04-30')].index
trainIndex = train.loc[train['date_created'] >=  pd.to_datetime('2016-01-01')].index

In [36]:
log_model = LogisticRegression(class_weight='balanced', random_state=42, C=1)

matrix = word_vectorizer.transform(train.loc[trainIndex, 'comment'])
test_matrix = word_vectorizer.transform(test['comment'])

log_model.fit(matrix, train.loc[trainIndex, 'is_fake'])

log_predictions = log_model.predict_proba(test_matrix)[:, 1]

test['log_matrix_pred'] = log_predictions

In [37]:
log_model = LogisticRegression(class_weight='balanced', random_state=42, C=1)

matrix = char_vectorizer.transform(train.loc[trainIndex, 'comment'])
test_matrix = char_vectorizer.transform(test['comment'])

log_model.fit(matrix, train.loc[trainIndex, 'is_fake'])

log_predictions = log_model.predict_proba(test_matrix)[:, 1]

test['log_matrix_pred_char'] = log_predictions

In [49]:
# features for the GradientBoostingClassifier
gb_features = ['rating', 'from_prev_comment', 'diff_time_comment_registration', 'counts', 
                'count_sent', 'count_words', 'count_unique_words', 'count_letters', 
                'count_punctuations', 'count_words_upper', 'count_words_title', 'weekend',
                'ip_count_for_company', 'ip_count_for_product', 'id_count_for_company',
                'month', 'log_matrix_pred',
#                 'log_matrix_pred_char',
                ]

gb = GradientBoostingClassifier(random_state=42)

x_train, y_train = train.loc[trainIndex, gb_features], train.loc[trainIndex, 'is_fake']

gb.fit(x_train, y_train)

test['gb_pred'] = gb.predict_proba(test[gb_features])[:, 1]

In [55]:
import xgboost as xgb

model = xgb.XGBClassifier()

x_train, y_train = train.loc[trainIndex, needful_features], train.loc[trainIndex, 'is_fake']

model.fit(x_train, y_train)

prediction = model.predict_proba(test[needful_features])

In [56]:
result_df = pd.DataFrame(test['comment_id'])
result_df['is_fake'] = prediction[:, 1]

In [41]:
checkDF = pd.read_csv(path_join(DATA_DIR, 'best_submition/submission_RIt.csv'))

In [57]:
difference_locations = np.where(resuld_df != checkDF)
changed_from = result_df.values[difference_locations] 
changed_to = checkDF.values[difference_locations]
pd.DataFrame({'from': changed_from, 'to': changed_to})

Unnamed: 0,from,to
0,0.000241,0.000828
1,0.000338,0.000575
2,0.444795,0.008649
3,0.032736,0.009089
4,0.017183,0.011847
5,0.013221,0.006267
6,0.013221,0.006880
7,0.013221,0.006375
8,0.465849,0.008751
9,0.013221,0.005825


In [None]:
result_df = pd.DataFrame(test['comment_id'])
result_df['is_fake'] = prediction[:, 1]

result_df.to_csv(path_join(DATA_DIR, 'submission.csv'), index=False)