In [58]:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.grid_search import RandomizedSearchCV
from sklearn.grid_search import GridSearchCV

In [22]:
def output_metrics(model, train, predictors, target):
    print('Gini (train): ', 2*roc_auc_score(model.predict(train[predictors]), train[target])-1)
    print('AUC (train): ', roc_auc_score(model.predict(train[predictors]), train[target]))
    print('Accuracy (train): ', accuracy_score(model.predict(train[predictors]), train[target]))
    
def output_confusion_matrix(data, model, target, predictors):
    y_true = data[target]
    y_pred = model.predict(data[predictors])
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    print('true positives: ', round((tp*100)/len(data), 2), '% ', tp)
    print('false positives:', round((fp*100)/len(data), 2), '% ', fp)
    print('true negatives: ', round((tn*100)/len(data), 2), '% ', tn)
    print('false negatives: ', round((fn*100)/len(data), 2), '% ', fn)    

In [9]:
train = pd.read_csv('train_ds.csv')
train.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,sexid,clientid
0,1,4131,978,980,6730,TRAMBESOS SANT JOAN DES ES,1385977762,1,13
1,2,5651,978,980,55912,ZARA BARCELONA P GRACIABARCELONA ES,1385991764,1,13
2,3,5812,978,980,135658,CAFE DE L'ACADEMIA BARCELONA ES,1385999148,1,13
3,4,5947,978,980,138294,2D BCN DISSENY EN BIJUTBARCELONA ES,1385999979,1,13
4,5,5651,978,980,426209,EL GANSO BARCELONA ES,1386005648,1,13


In [10]:
test = pd.read_csv('test_ds.csv')
test.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
0,395,5411,980,980,7488,VOPAK 7301 UZHGOROD UA,1393172737,712
1,422,6010,980,980,40000,ODESA REGIONAL DEP. ODESSA UA,1393509649,766
2,429,6010,980,980,119000,ODESA REGIONAL DEP. ODESSA UA,1393583520,745
3,435,6010,980,980,210000,ODESA REGIONAL DEP. ODESSA UA,1393591808,748
4,445,6011,980,980,30000,"GAGARINA, 13 ODESSA UA",1393766754,766


In [16]:
#had to leave out location as Linear Regression takes only data of type float/int
predictors = ['cgsettlementbufferid', 'mcc', 'tranccy', 'ccy', 'amount', 'trandatetime', 
                'clientid']
target = 'sexid'

In [21]:
test.head(2)

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
0,395,5411,980,980,7488,VOPAK 7301 UZHGOROD UA,1393172737,712
1,422,6010,980,980,40000,ODESA REGIONAL DEP. ODESSA UA,1393509649,766


In [19]:
params = {"C": [0.001, 0.01, 0.1, 1, 10, 100, 1000, 10000], "gamma": [10000, 1000, 100, 10, 1 ,0.1, 
                                                                      0.01, 0.001]}

#tune the hyperparameters via a randomized search
grid = RandomizedSearchCV(LogisticRegression(), params)
grid.fit(train[predictors], train[target])
#acc = grid.score(train[predictors], train[target])
#print("grid search accuracy: {:.2f}%".format(acc * 100))
print("randomized search best parameters: {}".format(grid.best_params_))

In [None]:
model = SVC(C=0.0001, cache_size=200, class_weight=None, coef0=0.0,
        decision_function_shape='ovr', degree=3, gamma=100, kernel='rbf',
        max_iter=-1, probability=False, random_state=None, shrinking=True,
        tol=0.001, verbose=False).fit(train[predictors], train[target])

In [None]:
output_metrics(model, train, predictors, target)

In [None]:
output_confusion_matrix(train, model, target, predictors)

In [84]:
pred = model.predict(test_df)
#converting array to dataframe
pred_index = [str(i) for i in range(0, len(pred))]
pred_df = pd.DataFrame(data=pred, index=pred_index, columns=['sexid'])
pred_df.head()

Unnamed: 0,sexid
0,1
1,1
2,1
3,1
4,1


# Task 2

Мы знаем, что Джон Доу прилетел в Украину в начале июля 2015-го. Также известно, что 6го июля Джон ужинал в явочном кафе в Киеве и передал секретные документы другому шпиону, но в толпе наш агент его потерял.

In [None]:
#came to Ukraine in the begining of July 2015.
#Kyiv, 6th of July (some cafe)
#id - ?
#we should search by location and by datetime

In [252]:
test_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
0,395,5411,980,980,7488,VOPAK 7301 UZHGOROD UA,1393172737,712
1,422,6010,980,980,40000,ODESA REGIONAL DEP. ODESSA UA,1393509649,766
2,429,6010,980,980,119000,ODESA REGIONAL DEP. ODESSA UA,1393583520,745
3,435,6010,980,980,210000,ODESA REGIONAL DEP. ODESSA UA,1393591808,748
4,445,6011,980,980,30000,"GAGARINA, 13 ODESSA UA",1393766754,766


In [253]:
#converting data from epoch seconds to timestamp format
test_data['trandatetime'] = test_data['trandatetime'].astype('datetime64[s]')

In [254]:
test_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
0,395,5411,980,980,7488,VOPAK 7301 UZHGOROD UA,2014-02-23 16:25:37,712
1,422,6010,980,980,40000,ODESA REGIONAL DEP. ODESSA UA,2014-02-27 14:00:49,766
2,429,6010,980,980,119000,ODESA REGIONAL DEP. ODESSA UA,2014-02-28 10:32:00,745
3,435,6010,980,980,210000,ODESA REGIONAL DEP. ODESSA UA,2014-02-28 12:50:08,748
4,445,6011,980,980,30000,"GAGARINA, 13 ODESSA UA",2014-03-02 13:25:54,766


In [255]:
#removing the rows which don't match the condition of year - year==2015
test_data.drop(test_data[test_data['trandatetime'].dt.year != 2015].index, inplace=True)

#removing the rows which don't match the condition of month - month == 07
test_data.drop(test_data[test_data['trandatetime'].dt.month != 7].index, inplace=True) 

#removing the rows which don't match the condition of day - day == 06
test_data.drop(test_data[test_data['trandatetime'].dt.day != 6].index, inplace=True)

In [256]:
test_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
37701,137481,6011,980,980,45000,4824884/CHERVONOZORYANYKYIV UA,2015-07-06 09:13:21,1190
37714,137437,6011,980,980,50000,KOROLEVA 76/1 ODESSA UA,2015-07-06 09:59:17,1582
37737,137480,6011,980,980,150000,4824884/CHERVONOZORYANYKYIV UA,2015-07-06 09:11:20,1190
37862,137843,6012,980,980,10000,MFPay2youAC VISA DIRECT UA,2015-07-06 12:56:44,4048
37863,137845,6012,980,980,10000,MFPay2youAC VISA DIRECT UA,2015-07-06 08:38:34,4048


In [257]:
#John Dou is in Kyiv but the location can be misspelled
#Kyiv, Kiev, Kiyev

test_data.drop(test_data[test_data['location'].str.contains("KYIV|KIEV|KIYEV|Kiiev") == False].index,
               inplace=True)

In [258]:
test_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
37701,137481,6011,980,980,45000,4824884/CHERVONOZORYANYKYIV UA,2015-07-06 09:13:21,1190
37737,137480,6011,980,980,150000,4824884/CHERVONOZORYANYKYIV UA,2015-07-06 09:11:20,1190
38009,138663,6010,980,980,590000,KYIV RU KYIV UA,2015-07-06 07:51:20,2387
38013,138689,6010,980,980,150000,KYIV RU KYIV UA,2015-07-06 15:52:42,6169
38048,138783,6011,980,980,100000,DN00/V. Vasilevskoy strKIYEV UA,2015-07-06 11:09:41,3652


In [259]:
#John Dou was having supper, it means it was evening already
#evening starts at 17:00
test_data.drop(test_data[test_data['trandatetime'].dt.hour < 17].index, inplace=True)

In [260]:
test_data.head() 

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
38105,138934,5912,980,980,6154,APTEKA KYIVSKII MAIDAN LUTSK UA,2015-07-06 17:39:21,942
38106,138938,4814,980,980,20400,PORTMONE.MOBILE KYIV UA,2015-07-06 17:54:29,2163
38252,139225,5499,980,980,3310,MAGAZUNPRODUKTU0004262 KIYEV UAUA,2015-07-06 19:18:09,9254
38254,139230,5499,980,980,14030,PR816 KIYEV UAUA,2015-07-06 19:40:12,2163
38255,139231,5411,980,980,22223,MAGAZIN3000264 KIYEV UAUA,2015-07-06 19:41:25,713


In [261]:
#John Dou was having supper in cafe not in AZS or pharmacy or shop
#and he wasn't bying cosmetics in Eva either
test_data.drop(test_data[test_data['location'].str.contains("MOBILE|PORTMONE|AZS|APTEKA|MAGAZIN|MAGAZUN|EVA|KYSHENYA|PR") == True].index,
               inplace=True)

In [262]:
test_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
38271,139486,5812,980,980,20798,DOMINOS.UA KYIV UA,2015-07-06 20:11:52,4449
38288,139678,5812,980,980,1320,ZDOROVENKI BULY KYIV UAUA,2015-07-06 19:16:04,9315
38320,139668,5499,980,980,7215,FUDMEREZHA VICHYAKIVSK KYIV UAUA,2015-07-06 18:28:38,7040
38321,139670,5814,980,980,3300,McDonald's 004 KYIV UAUA,2015-07-06 18:54:04,9315


In [266]:
print("John Dou is...")
print(test_data[["clientid"]])

John Dou is...
       clientid
38271      4449
38288      9315
38320      7040
38321      9315


In [272]:
more_data = pd.read_csv('test_ds.csv')
more_data.drop(more_data[np.logical_not(more_data['clientid'].isin([4449, 9315, 7040]))].index,
               inplace=True)

In [274]:
#converting data 
more_data['trandatetime'] = more_data['trandatetime'].astype('datetime64[s]')
more_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
19012,74027,6010,980,980,40000,KIEVSKOE RU KYIV UA,2015-03-02 13:26:37,4449
19985,76942,6010,980,980,964000,KYIV RU KYIV UA,2015-03-10 15:48:24,4449
21075,80805,6010,980,980,69000,KIEVSKOE RU KYIV UA,2015-03-18 11:42:12,4449
21852,83105,6010,980,980,617500,KIEVSKOE RU KYIV UA,2015-03-24 11:32:51,4449
23052,87088,6010,980,980,90000,KIEVSKOE RU KYIV UA,2015-04-03 12:00:54,4449


In [None]:
#then 4449 can't be John Dou as he has records of in March and April

In [275]:
#John Dou came to Ukraine in the beginning of July
more_data.drop(more_data[more_data['trandatetime'].dt.month != 7].index, inplace=True) 

In [278]:
more_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
37364,136284,5499,980,980,4690,FORAPRODUCTY0000002282 KIYEV UAUA,2015-07-01 18:42:18,7040
37392,136173,5912,980,980,1034,APTEKA5000000000002963 KIYEV UAUA,2015-07-01 07:45:21,7040
37610,137241,5331,980,980,1147,EVA-322 KIEV UA,2015-07-04 14:15:53,9315
37688,137194,5912,980,980,10980,ELEGANT-GRUPP APTEK5 KIEV UA,2015-07-03 17:19:56,7040
37871,138073,5499,980,980,12455,FUDMEREZHA VICHYAKIVSK KYIV UAUA,2015-07-03 20:44:04,7040


In [279]:
test_data.head()

Unnamed: 0,cgsettlementbufferid,mcc,tranccy,ccy,amount,location,trandatetime,clientid
38271,139486,5812,980,980,20798,DOMINOS.UA KYIV UA,2015-07-06 20:11:52,4449
38288,139678,5812,980,980,1320,ZDOROVENKI BULY KYIV UAUA,2015-07-06 19:16:04,9315
38320,139668,5499,980,980,7215,FUDMEREZHA VICHYAKIVSK KYIV UAUA,2015-07-06 18:28:38,7040
38321,139670,5814,980,980,3300,McDonald's 004 KYIV UAUA,2015-07-06 18:54:04,9315


#  John Dou is 7040.
Considering that he went only to one cafe.
But if not then 9315 can be John Dou too.

In [7]:
#let's find location of John Dou
loc_data = pd.read_csv('test_ds.csv')
loc_data.drop(loc_data[np.logical_not(loc_data['clientid'].isin([7040]))].index,
               inplace=True)
#converting data 
loc_data['trandatetime'] = loc_data['trandatetime'].astype('datetime64[s]')
loc_data.drop(loc_data[loc_data['location'].str.contains("KYIV|KIEV|KIYEV|Kiiev") == True].index,
               inplace=True)
#loc_data.head()
print(loc_data['location'])

24818    ODESA REGIONAL DEP.    ODESSA         UA
25184    AUTOLUX ODESA 2        ODESA        UAUA
25214    McDonald's 019         ODESA        UAUA
25382    'TAVRIA V'ILYICH'      ILYICHEVSK     UA
25393    ABRIKOS590000000002836 ODESSA       UAUA
25623    ACCESSORIZE            ODESA        UAUA
25823    PALASSHAYO             ODESSA       UAUA
25926    WWW.UKRPAYS.COM2       KHMELNITSKYI UAUA
26037    ODESA REGIONAL DEP.    ODESSA         UA
26079    SHOP "EVA-451"         IILICHEVSK     UA
26240    PS679                  ODESSA       UAUA
26246    APTEKAULJANA           ILICHEVSK    UAUA
26363    Autolux tickets        Kyiv         UAUA
26829    KOMPOT PANTELEIMONIVSK ODESA        UAUA
27173    'TAVRIA V'ILYICH'      ILYICHEVSK     UA
27425    CITRUS                 ODESSA       UAUA
27751    ABRIKOSN               ODESSA       UAUA
27753    DOBROEPIVO             ODESSA       UAUA
27758    CITRUS                 ODESSA       UAUA
29731    WWW.UKRPAYS.COM2       KHMELNITSKYI UAUA


# John Dou is probably from Odessa