In [21]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from utils.metric_utils import HardMetric
from sklearn.metrics import f1_score

%matplotlib inline

In [22]:
# Run mode. True: hold out part of the labelled data and evaluate on it;
# False: fit on all labelled rows and write a results CSV.
VALIDATION = False
# Only consulted when VALIDATION is True: work on a random subsample of the labelled data.
SAMPLE = False
# Fraction of labelled rows kept when SAMPLE is on.
SAMPLE_FRAC = 0.3
# Project helper wrapping f1_score; later cells call its get_thresh() and metric().
# NOTE(review): presumably it scores thresholded ("hard") labels - confirm in utils.metric_utils.
final_metric = HardMetric(f1_score)

In [23]:
# Load the labelled and unlabelled splits, both indexed by the `id` column.
train = pd.read_csv('data/train.csv', index_col='id')
test = pd.read_csv('data/test.csv', index_col='id')

# Stack both splits so that feature extraction sees every row at once.
# `test` has no `target` column, so the frames are not column-aligned; passing
# sort=False opts into pandas' future behaviour explicitly (columns keep their
# original order) and silences the FutureWarning this cell used to emit.
# Later cells select columns by name, so column order is irrelevant.
data = pd.concat([train, test], sort=False).drop(columns='target')

# Submission template, indexed by `id`.
sample_test = pd.read_csv('data/sample_submission.csv', index_col='id')

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  This is separate from the ipykernel package so we can avoid doing imports until


In [24]:
from sklearn.model_selection import train_test_split

# Fixed seed so that sampling, shuffling and the hold-out split are identical on
# every "Restart & Run All"; without it the CV folds, the tuned threshold and the
# final predictions all change from run to run.
SPLIT_SEED = 42

if VALIDATION:
    # Validation mode: carve a hold-out set out of the labelled rows only.
    # `data` is rebound here, so the feature-extraction cells below only see
    # these rows (optionally a random subsample of them).
    data = train.sample(frac=SAMPLE_FRAC, random_state=SPLIT_SEED) if SAMPLE else train
    train, test = train_test_split(
        data, train_size=0.7, shuffle=True, random_state=SPLIT_SEED
    )
    # Keep the hold-out labels aside and strip them from the features frame.
    y_test = test['target']
    test = test.drop(columns='target')
else:
    # Submission mode: keep every labelled row, just shuffle the row order.
    train = train.sample(frac=1., random_state=SPLIT_SEED)

In [25]:
# Sanity check: (rows, columns) of the training and test frames after the split/shuffle above.
train.shape, test.shape

((7613, 4), (3263, 3))

# Feature Extraction

## Keyword

In [26]:
from feature_extraction.keyword import process_keyword

# Turn the raw `keyword` column into a feature matrix.
# NOTE(review): the preview's column names (keyword_OH_*, keyword_isna) suggest
# one-hot indicators plus a missing-value flag - confirm in process_keyword.
keyword = data.loc[:, 'keyword']
X_keyword = process_keyword(keyword)

# Preview ten random rows of the result.
X_keyword.sample(n=10)

Unnamed: 0_level_0,keyword_OH_ablaze,keyword_OH_accident,keyword_OH_aftershock,keyword_OH_airplane%20accident,keyword_OH_ambulance,keyword_OH_annihilated,keyword_OH_annihilation,keyword_OH_apocalypse,keyword_OH_armageddon,keyword_OH_army,...,keyword_OH_whirlwind,keyword_OH_wild%20fires,keyword_OH_wildfire,keyword_OH_windstorm,keyword_OH_wounded,keyword_OH_wounds,keyword_OH_wreck,keyword_OH_wreckage,keyword_OH_wrecked,keyword_isna
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
5735,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
5330,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7929,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7332,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7453,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1003,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2103,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
604,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7913,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9405,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Text

In [27]:
from feature_extraction.text import process_text

# Turn the raw `text` column into a feature matrix.
# NOTE(review): the preview shows one float column per token (text_<token>);
# the exact weighting scheme lives in process_text - confirm there.
text = data.loc[:, 'text']
X_text = process_text(text)

# Preview ten random rows of the result.
X_text.sample(n=10)

Unnamed: 0_level_0,text_our,text_deeds,text_are,text_the,text_reason,text_of,text_this,text_earthquake,text_may,text_allah,...,text_reserve,text_hattrick,text_ebolaoutbreak,text_ala,text_issuicide,text_rajman,text_hasaka,text_risen,text_fasteners,text_xrwn
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6400,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7618,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3933,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
825,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10310,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3789,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7650,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7202,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9611,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5766,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Location

In [28]:
from feature_extraction.location import process_location

# Turn the raw `location` column into a feature matrix.
# NOTE(review): the preview shows one float column per token (location_<token>),
# including a `location_nan` column - confirm how missing values are handled
# in process_location.
location = data.loc[:, 'location']
X_location = process_location(location)

# Preview ten random rows of the result.
X_location.sample(n=10)

Unnamed: 0_level_0,location_nan,location_birmingham,location_est,location_september,location_bristol,location_estonia,location_africa,location_philadelphia,location_pa,location_panama,...,location_inwood,location_would,location_rather,location_philippians,location_deep,location_libland,location_canadaontario,location_reiss,location_acey,location_islanddåçtorontoåè
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9861,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
9837,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
10431,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
6475,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8779,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5592,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4966,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7417,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
7635,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4019,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


## Combine

In [29]:
# Assemble one wide feature matrix: index-aligned left joins of the three
# per-column feature blocks, in the order keyword -> text -> location.
X_all = X_keyword.join(X_text)
X_all = X_all.join(X_location)

# Slice the feature rows for each split by its `id` index.
X_train, X_test = (X_all.loc[split.index] for split in (train, test))
y_train = train['target']

# Modeling

In [30]:
from sklearn.linear_model import LogisticRegression
# Baseline classifier with scikit-learn's default hyper-parameters; this is the
# estimator fitted in the "Fitting" section and inspected via `coef_` at the end.
clf = LogisticRegression()

In [31]:
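# Alternative model (disabled). Note: it exposes `feature_importances_`, not `coef_`,
# so the "Feature Importances" cell must be adapted if this is enabled.
# from lightgbm import LGBMClassifier
# clf = LGBMClassifier(n_estimators=500)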

In [32]:
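# Alternative model (disabled). Note: it exposes `feature_importances_`, not `coef_`,
# so the "Feature Importances" cell must be adapted if this is enabled.
# from sklearn.ensemble import RandomForestClassifier
# clf = RandomForestClassifier(n_estimators=200)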

## Fitting

In [33]:
# Fit on the training matrix (scikit-learn's fit() returns the estimator itself,
# so the call can be chained), then keep column 1 of predict_proba, i.e. the
# probability of the second class listed in clf.classes_.
y_pred_score = clf.fit(X_train, y_train).predict_proba(X_test)[:, 1]

In [34]:
# Provisional hard labels at the conventional 0.5 cut-off; they are replaced once
# the tuned threshold is available in the "Threshold choosing" section.
y_pred = np.greater(y_pred_score, 0.5).astype(int)

## Threshold choosing

In [35]:
from sklearn.model_selection import cross_val_predict

# Cross-validated class probabilities for the training rows: each row is scored
# by a model that did not see it during fitting.
oof_proba = cross_val_predict(clf, X_train, y_train, method='predict_proba')
y_train_pred_score = oof_proba[:, 1]

# Let the metric helper pick the decision threshold from those scores.
th = final_metric.get_thresh(y_train, y_train_pred_score)

In [36]:
# Final hard labels: 1 where the predicted probability exceeds the tuned threshold `th`.
y_pred = np.greater(y_pred_score, th).astype(int)

In [37]:
# Display the threshold returned by final_metric.get_thresh.
th

0.406

# (Validation) Evaluation

In [38]:
if VALIDATION:
    from sklearn.metrics import (
        ConfusionMatrixDisplay,
        accuracy_score,
        confusion_matrix,
        f1_score,
        plot_confusion_matrix,
        plot_precision_recall_curve,
        plot_roc_curve,
        roc_curve,
    )
    import matplotlib.pyplot as plt

    # Headline score on the hold-out set, using the labels produced with the tuned threshold.
    print(f'total score:\t {final_metric.metric(y_test, y_pred)}')

    # ROC and precision-recall curves are computed from the classifier's scores,
    # so they do not depend on the chosen threshold.
    plot_roc_curve(clf, X_test, y_test)
    plot_precision_recall_curve(clf, X_test, y_test)

    # The confusion matrix must describe the same predictions as the score above.
    # plot_confusion_matrix(clf, ...) would call clf.predict(), i.e. a 0.5 cut-off,
    # and silently ignore the tuned threshold, so build it from `y_pred` instead.
    cm = confusion_matrix(y_test, y_pred, labels=clf.classes_)
    ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=clf.classes_).plot()

# (Pred) Save Results

In [39]:
if not VALIDATION:
    import datetime
    import os

    # Timestamp like 2020_01_31__12_34_56. strftime is used instead of slicing
    # str(datetime.now())[:-7]: str() omits the fractional part when microsecond
    # is 0, in which case the slice would cut into the time itself.
    current_time = datetime.datetime.now().strftime('%Y_%m_%d__%H_%M_%S')

    # Make sure the output directory exists so that to_csv cannot fail on a fresh checkout.
    os.makedirs('results', exist_ok=True)

    # One `target` column indexed by the test ids.
    submission = pd.Series(data=y_pred, index=test.index, name='target').to_frame()
    submission.to_csv(f'results/results_{current_time}.csv')

In [40]:
# (number of rows predicted as class 1, total number of predictions)
(y_pred.sum(), len(y_pred))

(1440, 3263)

# Feature Importances

In [41]:
# Pair every coefficient in the first row of clf.coef_ with its feature name.
feature_importances = pd.Series(clf.coef_[0], index=X_all.columns)

# Ascending order: most negative weights first, most positive last.
feature_importances.sort_values(ascending=True)

text_prognosis          -3.369584
text_trader             -2.312578
text_reps               -2.125998
text_epileptic          -2.092256
keyword_OH_aftershock   -1.868602
                           ...   
keyword_OH_derailment    2.221541
text_munclejim           2.296881
text_trade               2.365533
text_entering            3.303098
text_blowing             3.668583
Length: 24969, dtype: float64