In [None]:
!pip install parfit
# !pip install --ignore-installed orange3
# !easy_install -U setuptools

In [2]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import normalize
from sklearn.metrics import f1_score, auc, accuracy_score, confusion_matrix, precision_score, balanced_accuracy_score, recall_score, roc_auc_score, roc_curve
from sklearn import tree, metrics
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

import matplotlib.pyplot as plt

from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import *
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import MinMaxScaler, StandardScaler, RobustScaler, normalize
import seaborn as sns
from sklearn.decomposition import PCA

from sklearn.linear_model import SGDClassifier

from sklearn.model_selection import ParameterGrid
# import parfit.parfit as pf
# import os
# print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Load dataset

In [3]:
PATH_PREFIX = '../input/'
TWITTER_RELATIVE_500 = PATH_PREFIX + "Twitter-Relative-Sigma-500.data"

def load_dataset(path):
	data = pd.read_csv(path)
	return data[data.columns[:-1]], data[data.columns[-1]]

data_X, data_Y = load_dataset(TWITTER_RELATIVE_500)
data_X = data_X.astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
# X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

# Ensemble

In [5]:
train_set = pd.DataFrame(np.column_stack((X_train, Y_train)))
data = train_set
label = data.columns[-1]

buzz = data[label].value_counts()[1]
instances = data.describe().iloc[0][0]
# 38.86933701657458
negatives = data.loc[data[label]==0]
positives = data.loc[data[label]==1]
print(len((positives)))
print(len(negatives))
print(len(negatives)/len(positives))
print(len(negatives)/38)

splits = round(len(negatives)/(round(len(negatives)/len(positives))))
N = int(len(negatives)/splits)
frames = [ negatives.iloc[i*splits:(i+1)*splits].copy() for i in range(N+1) ]

frames[-2] = pd.concat([frames[-2],frames[-1]], axis=0)
frames = frames[:-1]
def print_scores(pred_Y):
  print(f1_score(Y_test, pred_Y))
  print(roc_auc_score(Y_test, pred_Y, average="weighted"))
  tn, fp, fn, tp = confusion_matrix(Y_test, pred_Y).ravel()
  print(balanced_accuracy_score(Y_test, pred_Y))
  print(tp/(tp+fn))
  print(precision_score(Y_test, pred_Y))
  print(tn/(fp+tn))

2940
109625
37.28741496598639
2884.8684210526317


In [6]:
def get_ensemble_predictions(model):
  models = [model for i in range(len(frames))]
  for idx,frame in enumerate(frames):
    dataset = pd.concat([positives, frame], axis=0)
    models[idx] = models[idx].fit(dataset[dataset.columns[:-1]].values,
                                  dataset[dataset.columns[-1]].values)
  predictions = []
  for model in models:
    pred = model.predict(X_test)
    predictions.append(pred)
  # average predictions
  predictions = np.matrix(predictions)
  predictions = np.sum(predictions, axis=0)
  predictions = np.divide(predictions,len(models))
  pred_Y = predictions.round()
  pred_Y = np.array(pred_Y)[0]
  print_scores(pred_Y)

In [9]:
get_ensemble_predictions(LinearSVC(C=1.0, class_weight='balanced', dual=True, fit_intercept=True,
     intercept_scaling=1, loss='hinge', max_iter=5000, multi_class='ovr',
     penalty='l2', random_state=None, tol=1e-05, verbose=0)) #standard scaler
# 0.381294964028777
# 0.6867508364499394
# 0.6867508364499394
# 0.3897058823529412
# 0.3732394366197183
# 0.9837957905469376



0.381294964028777
0.6867508364499394
0.6867508364499394
0.3897058823529412
0.3732394366197183
0.9837957905469376


In [10]:
get_ensemble_predictions(KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=1)) # standard scaler
# 0.3510605594835537
# 0.883402573395537
# 0.883402573395537
# 0.8397058823529412
# 0.22191993781577923
# 0.9270992644381327

0.3510605594835537
0.883402573395537
0.883402573395537
0.8397058823529412
0.22191993781577923
0.9270992644381327


In [11]:
get_ensemble_predictions(SGDClassifier(alpha=0.0001, average=False, class_weight=None,
      early_stopping=True, epsilon=0.1, eta0=0.001, fit_intercept=True,
      l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
      max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
      penalty='none', power_t=0.5, random_state=None, shuffle=True,
      tol=0.0001, validation_fraction=0.2, verbose=0, warm_start=False)) # standard scaler
# 0.05501840689348275
# 0.5747032262763091
# 0.5747032262763091
# 1.0
# 0.02828736636299347
# 0.14940645255261817

0.05501840689348275
0.5747032262763091
0.5747032262763091
1.0
0.02828736636299347
0.14940645255261817


In [12]:
get_ensemble_predictions(KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=6)) # standard scaler + norm
# 0.35069337442218795
# 0.8820412270217242
# 0.8820412270217242
# 0.836764705882353
# 0.22183235867446394
# 0.9273177481610954

0.35069337442218795
0.8820412270217242
0.8820412270217242
0.836764705882353
0.22183235867446394
0.9273177481610954


In [13]:
get_ensemble_predictions(KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=1)) # standard scaler + norm
# 0.3510605594835537
# 0.883402573395537
# 0.883402573395537
# 0.8397058823529412
# 0.22191993781577923
# 0.9270992644381327

0.3510605594835537
0.883402573395537
0.883402573395537
0.8397058823529412
0.22191993781577923
0.9270992644381327


In [14]:
from sklearn.linear_model import LogisticRegression
get_ensemble_predictions(LogisticRegression(C=100))
# 0.4086799276672694
# 0.8877665715619875
# 0.8877665715619873
# 0.8308823529411765
# 0.2709832134292566
# 0.9446507901827981



0.4086799276672694
0.8877665715619875
0.8877665715619873
0.8308823529411765
0.2709832134292566
0.9446507901827981


In [17]:
from sklearn.tree import DecisionTreeClassifier
get_ensemble_predictions(DecisionTreeClassifier(max_depth=3))
# 0.3081555834378921
# 0.9024756133609222
# 0.9024756133609222
# 0.9029411764705882
# 0.18577912254160364
# 0.9020100502512562

0.3081555834378921
0.9024756133609222
0.9024756133609222
0.9029411764705882
0.18577912254160364
0.9020100502512562


## Preprocessing

In [None]:
def scale_data(train, test):
#   scaler = MinMaxScaler((0, 37505))
  scaler = StandardScaler() #standard
  scaler.fit(train)
  return scaler.transform(train), scaler.transform(test)

In [None]:
def feature_selection(train, test, comp):
    pca = PCA(comp)
    pca = pca.fit(train)
    return pca.transform(train), pca.transform(test)

##### Plots pca

In [None]:
X_train, X_test = scale_robust_data(X_train, X_test)
pca = PCA(0.95)
pca_fitted = pca.fit(X_train)
sns.heatmap(np.log(pca_fitted.inverse_transform(np.eye(pca_fitted.components_.shape[0]))), cmap="hot", cbar=True, cbar_kws={"orientation": "horizontal"})

In [None]:
X_train, X_test = scale_robust_data(X_train, X_test)
pca_fitted = PCA()
pca_fitted = pca_fitted.fit(X_train)
# X_train, X_test = pca_fitted.transform(X_train, X_test)
sns.heatmap(np.log(pca_fitted.inverse_transform(np.eye(X_train.shape[1]))), cmap="hot", cbar=True, cbar_kws={"orientation": "horizontal"})

## Models

In [None]:
stratified_kfold = KFold(10, False, 1)


In [None]:
def test_results(model_builder_fct, scaler, feature_selection, comp, norm, data_X, data_Y, model_name):
#   ipdb.set_trace(context=10)
  results = pd.DataFrame(index=range(10))
  col_name = f"{model_name}_stratified_kfold_%s"
  predictions, Y = [], []
  f1s, w_aucs, bacc, tpr, tnr, prec = [], [], [], [], [], []
  for train_idx, test_idx in stratified_kfold.split(data_X):
    print("Fold %s" % str(len(f1s)))
    train_X = data_X.iloc[train_idx]
    test_X = data_X.iloc[test_idx]
    train_Y, test_Y = data_Y.iloc[train_idx], data_Y.iloc[test_idx]

#     train_X, test_X = scaler(train_X, test_X)
    if norm:
      train_X, test_X = normalize(train_X, norm=norm), normalize(test_X, norm=norm)
    
    train_X, test_X = scaler(train_X, test_X)

    if feature_selection:
      train_X, test_X = feature_selection(train_X, test_X, comp)
    model = model_builder_fct(train_X, train_Y)
    print("Model fit")
    pred = model.predict(test_X)
    predictions.append(pred)
    Y.append(test_Y)
#     pred_Y = np.asarray(np.clip(pred,0,1)).round()
    pred_Y = model.predict(test_X)
#     print(sum(pred_Y))

#     pred_Y = np.vectorize(lambda x: 0 if x < 0.5 else 1)(pred_Y)
    f1s.append(f1_score(test_Y, pred_Y))
    w_aucs.append(roc_auc_score(test_Y, pred_Y, average="weighted"))
    tn, fp, fn, tp = confusion_matrix(test_Y, pred_Y).ravel()
    bacc.append(balanced_accuracy_score(test_Y, pred_Y))
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(fp+tn))
#     assert tpr == recall_score(test_Y, pred_Y)
    prec.append(precision_score(test_Y, pred_Y))
    
  f1_df = pd.DataFrame({col_name % ("F1"): f1s})
  auc_df = pd.DataFrame({col_name % ("AUC"): w_aucs})
  bacc_df = pd.DataFrame({col_name % ("BACC"): bacc})
  tpr_df = pd.DataFrame({col_name % ("TPR(recall)"): tpr})
  tnr_df = pd.DataFrame({col_name % ("TNR"): tnr})
  prec = pd.DataFrame({col_name % ("Precision"): prec})
  results = pd.concat([results, f1_df, auc_df, bacc_df, tpr_df, tnr_df, prec], axis=1)
  return results, (Y, predictions)

In [None]:
def scale_data_standard(train, test):
#   scaler = MinMaxScaler((min_data, max_data))
  scaler = StandardScaler()
  scaler.fit(train)
  return scaler.transform(train), scaler.transform(test)

### SGD


In [None]:
def build_sgd(train, test):
  model = SGDClassifier(alpha=0.0001, average=False, class_weight=None,
      early_stopping=True, epsilon=0.1, eta0=0.001, fit_intercept=True,
      l1_ratio=0.15, learning_rate='optimal', loss='modified_huber',
      max_iter=None, n_iter=None, n_iter_no_change=5, n_jobs=None,
      penalty='none', power_t=0.5, random_state=None, shuffle=True,
      tol=0.0001, validation_fraction=0.2, verbose=0, warm_start=False)# 0.48384879725085916
  return model.fit(train, test)

In [None]:
sgd_results_scaled, (y, pred) = test_results(build_sgd, scale_data_standard, None, 'l1', data_X, data_Y, "SGDNormStandardSC")

In [None]:
sgd_results_scaled.describe().iloc[1]

In [None]:
sgd_results_scaled, (y, pred) = test_results(build_sgd, scale_data_standard, None, 'l1', data_X, data_Y, "SGDStandardSC")

In [None]:
sgd_results_scaled.describe().iloc[1]

In [None]:
sgd_results_scaled, (y, pred) = test_results(build_sgd, scale_data_standard, None, 'l2', data_X, data_Y, "SGDNormStandardSC")

In [None]:
sgd_results_scaled.describe().iloc[1]

In [None]:
sgd_results_scaled, (y, pred) = test_results(build_sgd, scale_data_standard, None, 'l2', data_X, data_Y, "SGDStandardSC")

In [None]:
sgd_results_scaled.describe().iloc[1]

In [None]:
sgd_results_scaled, (y, pred) = test_results(build_sgd, scale_data_standard, None, None, data_X, data_Y, "SGDStandardSC")

In [None]:
sgd_results_scaled.describe().iloc[1]

#### Kneighbors

In [None]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
print("Split")
X_train, X_test = scale_data(X_train, X_test)
print("Scaled")
X_train, X_test = normalize(X_train), normalize(X_test)
# X_train, X_test = feature_selection(X_train, X_test)
print("normalized")
model = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=6)
print("fitting model")
model.fit(X_train, Y_train)
f1= f1_score(Y_test, model.predict(X_test))
print(f1)

In [None]:
m = model.predict(X_test)

In [None]:
roc_auc_score(Y_test, model.predict(X_test))

In [None]:
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
print("Split")

X_train, X_test = scale_data(X_train, X_test)
print("Scaled")

X_train, X_test = normalize(X_train, norm='l1'), normalize(X_test, norm='l1')sni
print("normalized")
model = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=1)
model.fit(X_train, Y_train)
print("fitting model")
f1= f1_score(Y_test, model.predict(X_test))
print(f1)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_data(X_train, X_test)
X_train, X_test = feature_selection(X_train, X_test)
grid = {
    'n_neighbors': [3],
    'weights': ['distance'],
    'algorithm': ['kd_tree'],
    'p': [5,6,7,8,9]
}
res = {}
name = "%s_%s_%s_%s"
for n in grid['n_neighbors']:
    for w in grid['weights']:
        for algo in grid['algorithm']:
            for p in grid['p']:
                model = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=algo, p=p)
                model.fit(X_train, Y_train)
                f1= f1_score(Y_test, model.predict(X_test))
                model_name = name % (str(n), w, algo, str(p))
                print(model_name,f1)
                res[model_name] = f1
print(res)

In [None]:
model = KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='auto', p=1, metric='manhattan')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_data(X_train, X_test)
# X_train, X_test = feature_selection(X_train, X_test)

X_train, X_test = normalize(X_train, norm='l1'), normalize(X_test, norm='l1')
print("normalized")
grid = {
    'n_neighbors': [3],
    'weights': ['distance'],
    'algorithm': ['kd_tree'],
    'p': [1,2,3,4],
    'metric': ['euclidean', 'manhattan', 'chebyshev', 'minkowski']
}
res = {}
name = "%s_%s_%s_%s_%s"
for n in grid['n_neighbors']:
    for w in grid['weights']:
        for algo in grid['algorithm']:
            for p in grid['p']:
                for m in grid['metric']:
                    model_name = name % (str(n), w, algo, str(p), m)
#                     print(model_name)
                    model = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=algo, p=p, metric=m)
                    model.fit(X_train, Y_train)
                    f1= f1_score(Y_test, model.predict(X_test))
                    print(model_name,f1)
                    res[model_name] = f1
print(res)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_data(X_train, X_test)
X_train, X_test = feature_selection(X_train, X_test)
grid = {
    'n_neighbors': [1,3,5],
    'weights': ['distance'],
    'algorithm': ['kd_tree'],
    'p': [1,2,3,4,5,6]
}
res = {}
name = "%s_%s_%s_%s"
for n in grid['n_neighbors']:
    for w in grid['weights']:
        for algo in grid['algorithm']:
            for p in grid['p']:
                model = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=algo, p=p)
                model.fit(X_train, Y_train)
                f1= f1_score(Y_test, model.predict(X_test))
                model_name = name % (str(n), w, algo, str(p))
                print(model_name,f1)
                res[model_name] = f1
print(res)

### SVM + SGD 

In [None]:
from sklearn.linear_model import SGDClassifier
X_train, X_test = scale_robust_data(X_train, X_test)
X_train, X_test = pca_transform(X_train, X_test)
grid = {
    'l1_ratio': [0, 0.1, 0.01, 0.2, 0.15],
    'tol':[1e-3, 1e-4, 1e-5],
    'penalty': ['none', 'l1', 'l2', 'elasticnet'],
    'class_weight': [None, 'balanced'],
    'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
             'squared_loss', 'huber', 'epsilon_insensitive', 'squared_epsilon_insensitive'],
    'learning_rate': ['constant', 'optimal', 'invscaling', 'adaptive'],
    'early_stopping': [True],
    'validation_fraction': [0.2],
    'eta0': [0.1, 0.01, 0.001]
}
paramGrid = ParameterGrid(grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(SGDClassifier, paramGrid,
           X_train, Y_train, X_test, Y_test, 
           metric = f1_score,
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel, bestScore)

## Logistic regression

In [None]:
clf = LogisticRegressionCV(Cs=10,cv=5, random_state=0, multi_class='multinomial')

In [None]:
from sklearn.neighbors import KNeighborsClassifier
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_data(X_train, X_test)
X_train, X_test = feature_selection(X_train, X_test)
grid = {
    'n_neighbors': [1,3,5,7,9,11,13],
    'weights': ['uniform', 'distance'],
    'algorithm': ['auto', 'ball_tree', 'kd_tree', 'brute'],
    'p': [1,2]
}
res = {}
name = "%s_%s_%s_%s"
for n in grid['n_neighbors']:
    for w in grid['weights']:
        for algo in grid['algorithm']:
            for p in grid['p']:
                model = KNeighborsClassifier(n_neighbors=n, weights=w, algorithm=algo, p=p)
                model.fit(X_train, Y_train)
                f1= f1_score(Y_test, model.predict(X_test))
                model_name = name % (str(n), w, algo, str(p))
                print(model_name,f1)
                res[model_name] = f1
print(res)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_robust_data(X_train, X_test)
X_train, X_test = pca_transform(X_train, X_test)
grid = {
    'cv': [10],
    'Cs':[10, 12, 14, 16],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'tol':[1e-3, 1e-4, 1e-5],
    'penalty': ['l2'],
    'class_weight': [None, 'balanced'],
}
paramGrid = ParameterGrid(grid)
bestModel2, bestScore2, allModels2, allScores2 = pf.bestFit(LogisticRegressionCV, paramGrid,
           X_train, Y_train, X_test, Y_test, 
           metric = f1_score,
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel2, bestScore2)

In [None]:
from sklearn.linear_model import LogisticRegressionCV
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_data(X_train, X_test)
X_train, X_test = feature_selection(X_train, X_test)
grid = {
    'cv': [10],
    'Cs':[10, 12, 14, 16],
    'solver': ['liblinear', 'saga'],
    'scoring': ['f1'],
    'penalty': ['l1'],
    'class_weight': [None, 'balanced'],
    'tol':[1e-3, 1e-4, 1e-5],
}
paramGrid = ParameterGrid(grid)
bestModel3, bestScore3, allModels3, allScores3 = pf.bestFit(LogisticRegressionCV, paramGrid,
           X_train, Y_train, X_test, Y_test, 
           metric = metrics.SCORERS['f1'],
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel3, bestScore3)

In [None]:
metrics.SCORERS['f1']

#### Log regression

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid
import parfit.parfit as pf
grid = {
    'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 10, 100],
    'penalty': ['l1'],
    'solver': ['saga', 'warn'],
    'n_jobs': [-1],
    'class_weight' :['balanced', None],
    'tol': [1e-4, 1e-5, 1e-6, 1e-3, 1e-2, 1e-1, 1]
}
X, X_tst = scale_data(X_train, X_test)
paramGrid = ParameterGrid(grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(LogisticRegression, paramGrid,
           X, Y_train, X_tst, Y_test, 
           metric = f1_score,
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel, bestScore)

In [None]:
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import ParameterGrid
import parfit.parfit as pf
grid = {
    'C': [1e-7, 1e-6, 1e-5, 1e-4, 1e-3, 1e-2, 1e-1, 1e0, 10, 100],
    'penalty': ['l2'],
    'solver': ['sag', 'newton-cg', 'lbfgs', 'warn'],
    'n_jobs': [-1],
    'class_weight' :['balanced', None]
}
X, X_tst = scale_data(X_train, X_test)
X, X_tst = normalize(X), normalize(X_tst)

paramGrid = ParameterGrid(grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(LogisticRegression, paramGrid,
           X, Y_train, X_tst, Y_test, 
           metric = f1_score,
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel, bestScore)

In [None]:
LogisticRegression(C=0.1, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='warn', tol=0.0001,
          verbose=0, warm_start=False) 0.5320574162679426
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='warn', n_jobs=-1,
          penalty='l2', random_state=None, solver='sag', tol=0.0001,
          verbose=0, warm_start=False) 0.5024248302618817

### Nearest centroid

In [None]:
from sklearn.neighbors import NearestCentroid
X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
X_train, X_test = scale_data(X_train, X_test)
X_train, X_test = feature_selection(X_train, X_test, 37)
grid = {
    'metric': ['euclidean', 'l2', 'l1', 'manhattan', 'cityblock', 'braycurtis', 'canberra', 'chebyshev', 'correlation', 'cosine', 'dice', 'hamming', 'jaccard', 'kulsinski', 'mahalanobis', 'matching', 'minkowski', 'rogerstanimoto', 'russellrao', 'seuclidean', 'sokalmichener', 'sokalsneath', 'sqeuclidean', 'yule']
}
res = {}
name = "%s_%s"
for m in grid['metric']:
    model = NearestCentroid(metric=m)
    model.fit(X_train, Y_train)
    f1= f1_score(Y_test, model.predict(X_test))
#     model_name = name % (m, str(sh))
    print(model_name,f1)
    res[m] = f1
import operator
sorted_x = sorted(res.items(), key=operator.itemgetter(1), reverse=True)
print(sorted_x)

In [None]:
def test_results(model_builder_fct, scaler, feature_selection, comp, norm, data_X, data_Y, model_name):
#   ipdb.set_trace(context=10)
  results = pd.DataFrame(index=range(10))
  col_name = f"{model_name}_stratified_kfold_%s"
  predictions, Y = [], []
  f1s, w_aucs, bacc, tpr, tnr, prec = [], [], [], [], [], []
  for train_idx, test_idx in stratified_kfold.split(data_X):
    print("Fold %s" % str(len(f1s)))
    train_X = data_X.iloc[train_idx]
    test_X = data_X.iloc[test_idx]
    train_Y, test_Y = data_Y.iloc[train_idx], data_Y.iloc[test_idx]
    
    if norm:
      train_X, test_X = normalize(train_X, norm=norm), normalize(test_X, norm=norm)
    if scaler:
      train_X, test_X = scaler(train_X, test_X)
    if feature_selection:
      train_X, test_X = feature_selection(train_X, test_X, comp)
    model = model_builder_fct(train_X, train_Y)
    print("Model fit")
    pred = model.predict(test_X)
    predictions.append(pred)
    Y.append(test_Y)
#     pred_Y = np.asarray(np.clip(pred,0,1)).round()
    pred_Y = model.predict(test_X)
#     print(sum(pred_Y))

#     pred_Y = np.vectorize(lambda x: 0 if x < 0.5 else 1)(pred_Y)
    f1s.append(f1_score(test_Y, pred_Y))
    w_aucs.append(roc_auc_score(test_Y, pred_Y, average="weighted"))
    tn, fp, fn, tp = confusion_matrix(test_Y, pred_Y).ravel()
    bacc.append(balanced_accuracy_score(test_Y, pred_Y))
    tpr.append(tp/(tp+fn))
    tnr.append(tn/(fp+tn))
#     assert tpr == recall_score(test_Y, pred_Y)
    prec.append(precision_score(test_Y, pred_Y))
    
  f1_df = pd.DataFrame({col_name % ("F1"): f1s})
  auc_df = pd.DataFrame({col_name % ("AUC"): w_aucs})
  bacc_df = pd.DataFrame({col_name % ("BACC"): bacc})
  tpr_df = pd.DataFrame({col_name % ("TPR(recall)"): tpr})
  tnr_df = pd.DataFrame({col_name % ("TNR"): tnr})
  prec = pd.DataFrame({col_name % ("Precision"): prec})
  results = pd.concat([results, f1_df, auc_df, bacc_df, tpr_df, tnr_df, prec], axis=1)
  return results, (Y, predictions)

In [None]:
def build_nc_1(train,test):
    model = NearestCentroid(metric='manhattan')
    return model.fit(X_train, Y_train)

### Ensemble

In [None]:
PATH_PREFIX = '../input/'
TWITTER_RELATIVE_500 = PATH_PREFIX + "Twitter-Relative-Sigma-500.data"

def load_dataset(path):
	data = pd.read_csv(path)
	return data[data.columns[:-1]], data[data.columns[-1]]

data_X, data_Y = load_dataset(TWITTER_RELATIVE_500)
data_X = data_X.astype(float)

X_train, X_test, Y_train, Y_test = train_test_split(data_X, data_Y, test_size=0.2, random_state=1)
# X_train, X_val, Y_train, Y_val = train_test_split(X_train, Y_train, test_size=0.2, random_state=1)

In [None]:
stratified_kfold = KFold(10, False, 1)

In [None]:
def scale_data_standard(train, test):
#   scaler = MinMaxScaler((0, 37505))
  scaler = StandardScaler()
#   scaler = RobustScaler()

  scaler.fit(train)
  return scaler.transform(train), scaler.transform(test)

In [None]:
def feature_selection(train, test, comp):
#     pca = PCA(n_components=comp)
    pca = PCA(n_components=comp) #-> 17 components
    pca = pca.fit(train)
    print((pca.components_).shape[0])

    return pca.transform(train), pca.transform(test)

In [None]:
def randforest(train, test):
    model=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
            max_depth=None, max_features=0.2, max_leaf_nodes=None,
            min_impurity_decrease=0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=77, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
    return model.fit(train,test)

In [None]:
results, (y, pred) = test_results(randforest, scale_data_standard, None,None,"l2", pd.DataFrame(normalize(data_X)), data_Y, "NC") # minmax scaler + pca

In [None]:
results.describe().iloc[1] # normalized

In [None]:
results.describe().iloc[1] #


In [None]:
from sklearn.ensemble import RandomForestClassifier
# X_train, X_test = scale_robust_data(X_train, X_test)
# X_train, X_test = pca_transform(X_train, X_test)
grid = {
    'n_estimators': [2,3,10,20,77],
    'criterion': ["entropy"],
    'max_features': [0.2, 0.5, 0.7, "sqrt", 'log2', 77],
    'min_impurity_decrease': [0, 0.1,0.01, 1],
    'class_weight': [None, 'balanced', 'balanced_subsample']
}
paramGrid = ParameterGrid(grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(RandomForestClassifier, paramGrid,
           X_train, Y_train, X_test, Y_test, 
           metric = f1_score,
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel, bestScore)

### Bagging

In [None]:
def build_bag(train, test):
    model = BaggingClassifier(base_estimator=KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=1),
#                               DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
#             max_depth=None, max_features=None, max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
#             splitter='best'),
         bootstrap=True, bootstrap_features=False, max_features=1.0,
         max_samples=0.3, n_estimators=5, n_jobs=None, oob_score=False,
         random_state=None, verbose=0, warm_start=False)
    return model.fit(train, test)

In [None]:
data_X.values

In [None]:
bag_results, (y, pred) = test_results(build_bag, scale_data_standard, None, None,"l2", data_X, data_Y, "SGDStandardSC")

In [None]:
def plot_roc_curve(true_Y, scores, title = "ROC curve"):
  plt.figure()
  for i in range(np.shape(true_Y)[0]):
    Y = true_Y[i]
    pred = scores[i]
    fpr, tpr, _ = roc_curve(Y, pred)
    roc_auc = auc(fpr, tpr)
    plt.plot(fpr, tpr,
               lw=2, label='ROC curve (weighted_area = %0.2f)' % roc_auc)
  plt.plot([0, 1], [0, 1], color='navy', lw=2, linestyle='--')
  plt.xlim([0.0, 1.0])
  plt.ylim([0.0, 1.05])
  plt.xlabel('False Positive Rate')
  plt.ylabel('True Positive Rate')
  plt.title(title)
  plt.legend()
  plt.show()

In [None]:
plot_roc_curve(y, pred) #bag knn + norm+ scal

In [None]:
plot_roc_curve(y, pred) #bag knn

In [None]:
bag_results.describe().iloc[1] # bag knn + norm +scal

In [None]:
bag_results # bag knn + norm +scal

In [None]:
bag_results.describe().iloc[1] # bag knn


In [None]:
bag_results.filter(regex="F1")# bag knn

In [None]:
plot_roc_curve(y, pred)

In [None]:
bag_results.describe().iloc[1] # random forest


In [None]:
bag_results# random forest


In [None]:
bag_results.filter(regex="F1")

In [None]:

X_train, X_test = scale_data_standard(X_train, X_test)
X_train, X_test = feature_selection(X_train, X_test, 37)
grid = {
    'base_estimator': [
#         RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
#             max_depth=None, max_features=0.2, max_leaf_nodes=None,
#             min_impurity_decrease=0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=77, n_jobs=None,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False)],
                       KNeighborsClassifier(n_neighbors=3, weights='distance', algorithm='kd_tree', p=1)],
#                        tree.DecisionTreeClassifier(criterion="entropy", class_weight="balanced")],
    'n_estimators': [5, 10, 20, 77, 100, 120],
    'max_samples': [0.01, 0.1, 0.2, 0.3],
    'max_features': [0.2, 0.5, 0.7, 1.0],
    'bootstrap_features': [True, False],
}
paramGrid = ParameterGrid(grid)
bestModel, bestScore, allModels, allScores = pf.bestFit(BaggingClassifier, paramGrid,
           X_train, Y_train, X_test, Y_test, 
           metric = f1_score,
           greater_is_better=True,
           scoreLabel = "F1")
print(bestModel, bestScore)


# BaggingClassifier(base_estimator=DecisionTreeClassifier(class_weight='balanced', criterion='entropy',
#             max_depth=None, max_features=None, max_leaf_nodes=None,
#             min_impurity_decrease=0.0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, presort=False, random_state=None,
#             splitter='best'),
#          bootstrap=True, bootstrap_features=False, max_features=1.0,
#          max_samples=0.3, n_estimators=5, n_jobs=None, oob_score=False,
#          random_state=None, verbose=0, warm_start=False) 0.48144624167459565
# BaggingClassifier(base_estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='entropy',
#             max_depth=None, max_features=0.2, max_leaf_nodes=None,
#             min_impurity_decrease=0, min_impurity_split=None,
#             min_samples_leaf=1, min_samples_split=2,
#             min_weight_fraction_leaf=0.0, n_estimators=77, n_jobs=None,
#             oob_score=False, random_state=None, verbose=0,
#             warm_start=False),
#          bootstrap=True, bootstrap_features=False, max_features=1.0,
#          max_samples=0.3, n_estimators=5, n_jobs=None, oob_score=False,
#          random_state=None, verbose=0, warm_start=False) 0.5565565565565566

### RULE

In [None]:
PATH_PREFIX = '../input/'
TWITTER_RELATIVE_500 = PATH_PREFIX + "Twitter-Relative-Sigma-500.data"

def load_data(path):
	data = pd.read_csv(path)
	return pd.DataFrame(data)

data = load_data(TWITTER_RELATIVE_500)

In [None]:

def series2table(series, variable):
    if series.dtype is np.dtype("int") or series.dtype is np.dtype("float"):
        series = series.values[:, np.newaxis]
        return Orange.data.Table(series)
    else:
        series = series.astype('category').cat.codes.reshape((-1,1))
        return Orange.data.Table(series)

def df2table(tdomain, df):
#     tdomain = df2domain(df)
    ttables = [series2table(df.iloc[:,i], tdomain[i]) for i in range(len(df.columns))]
    ttables = np.array(ttables).reshape((len(df.columns),-1)).transpose()
    return Orange.data.Table(tdomain , ttables)

In [None]:
import Orange
domain = Orange.data.Domain([Orange.data.DiscreteVariable(name) for name in data.columns])
tbl = df2table(domain, data)

# print(tbl.class_var)
# tbl = Orange.data.Table.from_file(PATH_PREFIX + "data/Twitter-Relative-Sigma-500_rule.txt")
learner = Orange.classification.CN2UnorderedLearner()

# consider up to 10 solution streams at one time
learner.rule_finder.search_algorithm.beam_width = 10

# continuous value space is constrained to reduce computation time
learner.rule_finder.search_strategy.constrain_continuous = True

# found rules must cover at least 15 examples
learner.rule_finder.general_validator.min_covered_examples = 15

# found rules may combine at most 2 selectors (conditions)
learner.rule_finder.general_validator.max_rule_length = 2

classifier = learner(tbl)

# Cross validating results
res = Orange.evaluation.testing.CrossValidation(tbl, [learner], k=5)