# In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
import tensorflow as tf

## Loading the data (missing values are replaced with 0)
train = pd.read_csv('train.csv')
train = train.fillna(0)

test = pd.read_csv('test.csv')
test = test.fillna(0)

## Keeping the ids for the submission file before the non-feature columns are dropped
test_id = test['id']
test = test.drop(columns = ['id', 'keyword', 'location', 'text', 'clean_tweet'])

## Defining input and target variables 
X = train.drop(columns = ['id', 'keyword', 'location', 'text', 'clean_tweet', 'target'])
Y = train['target']

## Splitting the data (80% for fitting, 20% held out for validation)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

## Scaling the data 
## The scaler is fitted on the training split ONLY. The validation split and the
## test set are then transformed with those same training min/max values, so all
## three datasets live on the scale the network is trained on. Re-fitting the
## scaler on each dataset would map them to different scales.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test = scaler.transform(test)

## Defining model 
## The input width is taken from the data instead of being hard-coded, so the
## network keeps working if the number of feature columns changes.
model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, input_dim = X_train.shape[1], activation = 'relu'),
#         tf.keras.layers.Dense(10, activation = 'relu'),
        tf.keras.layers.Dense(2, activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

## Fitting model 
## The two-unit softmax output with categorical cross-entropy needs one-hot targets
Y_train_onehot = tf.keras.utils.to_categorical(Y_train, num_classes = 2)
Y_test_onehot = tf.keras.utils.to_categorical(Y_test, num_classes = 2)

model.fit(
    X_train,
    Y_train_onehot,
    epochs = 50,
    batch_size = 32,
    validation_data = (X_test, Y_test_onehot),
)

## Predicting on train and test
## Column 1 of the softmax output is the predicted likelihood of target = 1
nn_train_pred = model.predict(X_train)[:, 1]
nn_test_pred = model.predict(test)[:, 1]


def precision_recall_cutoff(Y_test, Y_pred):
    """Return the cutoff whose (precision, recall) point is closest to (1, 1).

    Y_test: true binary labels.
    Y_pred: predicted scores/likelihoods for the positive class.

    The precision-recall curve is computed with ``precision_recall_curve``;
    for every threshold the Euclidean distance between its
    (precision, recall) pair and the perfect model (1, 1) is measured, and
    the threshold with the smallest distance is returned.
    """
    precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred)

    ## precision and recall are sliced with [:-1] so that they line up
    ## one-to-one with the thresholds
    curve = pd.DataFrame({
        'cutoff': thresholds,
        'precision': precision[:-1],
        'recall': recall[:-1],
    })

    ## How far each point falls short of precision = 1 and recall = 1
    precision_gap = 1 - curve['precision']
    recall_gap = 1 - curve['recall']
    curve['Distance_to_perfect_model'] = np.sqrt(precision_gap**2 + recall_gap**2)

    ## Ranking the cutoffs from closest to farthest; the best one lands on row 0
    ranked = curve.sort_values(by = 'Distance_to_perfect_model').reset_index(drop = True)

    return ranked.loc[0, 'cutoff']

## Choosing the cutoff from the likelihoods predicted on the training split
opt_cutoff = precision_recall_cutoff(Y_train, nn_train_pred)
print('The optimal cutoff is', opt_cutoff)

## Changing likelihoods to labels: below the cutoff -> 0, otherwise -> 1
below_cutoff = nn_test_pred < opt_cutoff
nn_test_label = np.where(below_cutoff, 0, 1)

## Creating data-frame for submission (one row per test id with its predicted label)
data_out = pd.DataFrame({'id': test_id}).assign(target = nn_test_label)
## Writing the submission file is currently disabled; uncomment to export
# data_out.to_csv('network_submission_3.csv', index = False)