In [8]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import MinMaxScaler
from sklearn.metrics import precision_recall_curve
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.multiclass import OneVsRestClassifier
import tensorflow as tf

## Loading the data (missing values are replaced by 0)
train = pd.read_csv('train.csv')
train = train.fillna(0)

test = pd.read_csv('test.csv')
test = test.fillna(0)

## Keeping the ids for the submission file before the id column is dropped
test_id = test['id']
test = test.drop(columns = ['id', 'keyword', 'location', 'text', 'clean_tweet'])

## Defining input and target variables 
X = train.drop(columns = ['id', 'keyword', 'location', 'text', 'clean_tweet', 'target'])
Y = train['target']

## Splitting the data 
## NOTE(review): no random seed is set here or for the network, so the split
## and the fitted weights change from run to run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2)

## Scaling the data 
## The scaler is fitted on the training split only. The validation split and
## the test set are transformed with those same fitted min/max values, so all
## three are on the scale the network is trained on. (Re-fitting the scaler on
## each set would give every set its own, different scale.)
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)
test = scaler.transform(test)

## Defining model 
## One hidden layer of 10 relu units and a 2-unit softmax output (one unit per
## class). The input width is read from the data instead of being hard-coded.
model = tf.keras.models.Sequential([
        tf.keras.layers.Dense(10, input_dim = X_train.shape[1], activation = 'relu'),
        tf.keras.layers.Dense(2, activation = 'softmax')
])

model.compile(optimizer = 'adam', loss = 'categorical_crossentropy', metrics = ['accuracy'])

## Fitting model 
## categorical_crossentropy needs one-hot targets, hence to_categorical
Y_train_onehot = tf.keras.utils.to_categorical(Y_train, num_classes = 2)
Y_test_onehot = tf.keras.utils.to_categorical(Y_test, num_classes = 2)
model.fit(X_train, Y_train_onehot, epochs = 50, batch_size = 32, validation_data = (X_test, Y_test_onehot))

## Predicting on train and test
## Column 1 of the softmax output is the score for target = 1
nn_train_pred = model.predict(X_train)[:, 1]
nn_test_pred = model.predict(test)[:, 1]


def precision_recall_cutoff(Y_test, Y_pred):
    """Return the cutoff whose (precision, recall) point lies closest to (1, 1).

    Y_test: true labels, passed unchanged to precision_recall_curve.
    Y_pred: predicted scores, passed unchanged to precision_recall_curve.

    NOTE(review): a function with this same name is defined again in a later
    cell of this notebook.
    """
    precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred)

    ## The final precision/recall entry is dropped so both arrays line up with thresholds
    precision_gap = 1 - precision[:-1]
    recall_gap = 1 - recall[:-1]

    ## Euclidean distance of every candidate cutoff to the perfect model
    candidates = pd.DataFrame({'cutoff': thresholds})
    candidates['Distance_to_perfect_model'] = np.sqrt(precision_gap**2 + recall_gap**2)

    ## The closest candidate comes first after sorting
    ranked = candidates.sort_values(by = 'Distance_to_perfect_model')
    return ranked['cutoff'].iloc[0]

## Picking the cutoff from the predictions on the training split
opt_cutoff = precision_recall_cutoff(Y_train, nn_train_pred)
print('The optimal cutoff is', opt_cutoff)

## Changing likelihoods to labels: below the cutoff -> 0, everything else -> 1
below_cutoff = nn_test_pred < opt_cutoff
nn_test_label = np.where(below_cutoff, 0, 1)

## Creating data-frame for submission: one row per test id with its label
submission_columns = {'id': test_id, 'target': nn_test_label}
data_out = pd.DataFrame(submission_columns)
# data_out.to_csv('network_submission_3.csv', index = False)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50
The optimal cutoff is 0.34716642


In [13]:
## Writing the submission file without the row index
data_out.to_csv(path_or_buf = 'network_submission_3.csv', index = False)

In [7]:
## Share of each predicted label among the submission rows
data_out['target'].value_counts() / len(data_out)

1    0.517009
0    0.482991
Name: target, dtype: float64

In [9]:
## Share of each predicted label among the submission rows
data_out['target'].value_counts() / len(data_out)

1    0.581673
0    0.418327
Name: target, dtype: float64

In [11]:
## Share of each class in the training target
Y.value_counts() / len(Y)

0    0.57034
1    0.42966
Name: target, dtype: float64

In [2]:
## Dimensions of the training split of the input variables
X_train.shape

(6090, 65)

In [17]:
## NOTE(review): `nn_pred` is not assigned in any cell visible here -- confirm where it is defined
nn_pred

array([[0.7194805 , 0.28051952],
       [0.6584695 , 0.3415305 ],
       [0.6979774 , 0.3020226 ],
       ...,
       [0.35780114, 0.64219886],
       [0.62765753, 0.37234244],
       [0.5311666 , 0.46883336]], dtype=float32)

In [None]:
def precision_recall_cutoff(Y_test, Y_pred):
    """Return the cutoff whose (precision, recall) point lies closest to (1, 1).

    Y_test: true labels, passed unchanged to precision_recall_curve.
    Y_pred: predicted scores, passed unchanged to precision_recall_curve.

    NOTE(review): this repeats a definition from an earlier cell of this
    notebook; running this cell shadows the earlier one.
    """
    precision, recall, thresholds = precision_recall_curve(Y_test, Y_pred)

    ## The final precision/recall entry is dropped so both arrays line up with thresholds
    precision_gap = 1 - precision[:-1]
    recall_gap = 1 - recall[:-1]

    ## Euclidean distance of every candidate cutoff to the perfect model
    candidates = pd.DataFrame({'cutoff': thresholds})
    candidates['Distance_to_perfect_model'] = np.sqrt(precision_gap**2 + recall_gap**2)

    ## The closest candidate comes first after sorting
    ranked = candidates.sort_values(by = 'Distance_to_perfect_model')
    return ranked['cutoff'].iloc[0]


## NOTE(review): `logit_md` is not defined in any cell visible here -- presumably
## a fitted classifier that exposes predict_proba; confirm where it is created.

## Predicting on train (second column of predict_proba) and picking the cutoff
logit_pred = logit_md.predict_proba(X)[:, 1]
opt_cutoff = precision_recall_cutoff(Y, logit_pred)
print('The optimal cutoff is', opt_cutoff)

## Predicting on test 
## NOTE(review): this expects `test` to still be the data-frame with its 'id',
## 'keyword' and 'text' columns; cell In [8] drops those columns and rebinds
## `test` to the scaler's output -- confirm the intended cell order.
logit_test_pred = logit_md.predict_proba(test.drop(columns = ['id', 'keyword', 'text']))[:, 1]

## Changing likelihoods to labels
logit_test_label = np.where(logit_test_pred < opt_cutoff, 0, 1)

## Creating data-frame for submission
data_out = pd.DataFrame({'id': test['id'], 'target': logit_test_label})
data_out.to_csv('Logistic_submission_1.csv', index = False)


In [9]:
## Target values of the held-out split produced by train_test_split
Y_test

7092    0
145     1
6006    1
6937    0
2419    1
       ..
2112    0
4728    0
5594    1
5142    0
6986    0
Name: target, Length: 1523, dtype: int64

In [4]:
## First rows of the test data
## NOTE(review): cell In [8] rebinds `test` to the scaler's output, so this relies on
## `test` still being a data-frame -- confirm the intended cell order
test.head()

Unnamed: 0,id,keyword,location,text,clean_tweet,subjectivity,negative,neutral,positive,char_count,word_count,sent_count,capital_char_count,capital_word_count,stopword_count,unique_word_count,avg_wordlength,avg_sentlength,unique_vs_words,stopwords_vs_words
0,0,0,0,Just happened a terrible car crash,happened terrible car crash,1.0,1,0,0,34,6,1,1,0,1,6,5.666667,6.0,1.0,0.166667
1,2,0,0,"Heard about #earthquake is different cities, s...",heard different cities stay safe everyone,0.55,0,0,1,64,9,1,1,0,2,9,7.111111,9.0,1.0,0.222222
2,3,0,0,"there is a forest fire at spot pond, geese are...",forest fire spot pond geese fleeing across str...,0.0,0,1,0,96,19,1,1,1,10,19,5.052632,19.0,1.0,0.526316
3,9,0,0,Apocalypse lighting. #Spokane #wildfires,apocalypse lighting,0.0,0,1,0,40,4,2,2,0,0,4,10.0,2.0,1.0,0.0
4,11,0,0,Typhoon Soudelor kills 28 in China and Taiwan,typhoon soudelor kills 28 china taiwan,0.0,0,1,0,45,8,1,4,0,2,8,5.625,8.0,1.0,0.25
