In [98]:
! pip install tldextract

You should consider upgrading via the '/opt/conda/bin/python3.7 -m pip install --upgrade pip' command.[0m


In [99]:
import pandas as pd
import numpy as np
from tldextract import extract
import re,sys,os
import warnings
warnings.filterwarnings('ignore')

In [100]:
#nltk libraries
import nltk
from nltk.stem import WordNetLemmatizer,PorterStemmer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
nltk.download('stopwords')
stop=set(stopwords.words('english'))
from nltk.stem.snowball import SnowballStemmer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn import metrics
import matplotlib.pyplot as plt

[nltk_data] Downloading package stopwords to /usr/share/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [101]:
#import libraries
import keras
import tensorflow as tf
from keras.models import Sequential
from keras.layers import LSTM, Dense
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.optimizers import RMSprop
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from keras.callbacks import EarlyStopping
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.preprocessing.sequence import pad_sequences

# Data Preprocessing

In [102]:
train_df = pd.read_csv('../input/zsdataset/train.csv',encoding= 'ISO-8859-1')
test_df = pd.read_csv('../input/zsdataset/test.csv',encoding= 'ISO-8859-1')

#extracts the url from the link, we will extract hostname only
def extract_url(x):
    tsd, td, tsu = extract(x) # prints abc, hostname, com
    return td

def classes_def(x):
    if x ==  "FACEBOOK":
        return "Facebook"
    elif x == 'FORUMS':
        return 'Forums'
    elif x == 'BLOG':
        return 'Blog'
    elif x == 'YOUTUBE':
        return 'Youtube'
    else:
        return 'Facebook'

def clean_text(x):
    normalizedsentense = x.lower()
    text = re.sub(r"[^a-z']+", ' ', normalizedsentense)
    return text


stemmer = SnowballStemmer('english')
def stemming(sentence):
    word_list = nltk.word_tokenize(sentence)
    stemmed_output = ' '.join([stemmer.stem(w) for w in word_list])
    return stemmed_output

train_df['Source']=train_df['Source'].apply(lambda x:classes_def(x))
test_df['Source']=test_df['Source'].apply(lambda x:classes_def(x))
train_df["Host"].fillna(train_df["Link"], inplace=True)
test_df["Host"].fillna(test_df["Link"], inplace=True)
train_df['Host']=train_df['Host'].apply(lambda x:extract_url(x))
test_df['Host']=test_df['Host'].apply(lambda x:extract_url(x))
train_df.loc[train_df.Host == '' , 'Host'] = 'youtube'

train_df = train_df.drop(['Link','time(GMT)','Title'],axis= 1)
test_df = test_df.drop(['Link','time(GMT)','Title'],axis= 1)

train_df['Date(ET)'] = pd.to_datetime(train_df['Date(ET)'],errors='coerce').dt.date
test_df['Date(ET)'] = pd.to_datetime(test_df['Date(ET)'],errors='coerce').dt.date

train_df['Time(ET)'] = pd.to_datetime(train_df['Time(ET)'],errors='coerce').dt.time
test_df['Time(ET)'] = pd.to_datetime(test_df['Time(ET)'],errors='coerce').dt.time

test_df.drop(columns = ['Index','Unnamed: 9'],inplace = True)

train_df = train_df[train_df['TRANS_CONV_TEXT'].notna()]

train_df['TRANS_CONV_TEXT']=train_df['TRANS_CONV_TEXT'].apply(lambda x:clean_text(x))
test_df['TRANS_CONV_TEXT']=test_df['TRANS_CONV_TEXT'].apply(lambda x:clean_text(x))

train_df['TRANS_CONV_TEXT']=train_df['TRANS_CONV_TEXT'].apply(lambda x: stemming(x))
test_df['TRANS_CONV_TEXT']=test_df['TRANS_CONV_TEXT'].apply(lambda x: stemming(x))

Tokenization

In [103]:
VOCAB_SIZE=4500
MAXLEN=2000
tokenizer=Tokenizer(VOCAB_SIZE,oov_token='<oov>', filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n')  # filtering special characters
train_data = train_df[['TRANS_CONV_TEXT','Patient_Tag']]
test_data = test_df[['TRANS_CONV_TEXT']]
tokenizer.fit_on_texts(train_data.TRANS_CONV_TEXT)
tokenizer.fit_on_texts(test_data.TRANS_CONV_TEXT)

In [104]:
def df_to_padded_sequences(df,tokenizer):
    sequences=tokenizer.texts_to_sequences(df.TRANS_CONV_TEXT)                                              #text to sequence of integers
    padded_sequences=pad_sequences(sequences,maxlen=MAXLEN, padding='post', truncating='post')  #padding
    return padded_sequences

X_train=df_to_padded_sequences(train_df,tokenizer)
df_test=df_to_padded_sequences(test_df,tokenizer)

In [105]:
df_test

array([[1206, 2526,    2, ...,    0,    0,    0],
       [ 336, 3523,  561, ...,    0,    0,    0],
       [ 746,  488, 1531, ...,    0,    0,    0],
       ...,
       [ 943,    2,  100, ...,    0,    0,    0],
       [ 229,  552,    8, ...,    0,    0,    0],
       [1021,    8,  166, ...,    0,    0,    0]], dtype=int32)

In [106]:
y_train = train_df.Patient_Tag

In [107]:
X_train ,X_test, y_train, y_test = train_test_split(X_train, y_train, test_size = 0.2, random_state = 42)

In [108]:
print('Training features shape: ',X_train.shape)
print('Validation features shape: ',X_test.shape)

print('Training labels shape: ', y_train.shape)
print('Validation labels shape: ', y_test.shape)

Training features shape:  (924, 2000)
Validation features shape:  (232, 2000)
Training labels shape:  (924,)
Validation labels shape:  (232,)


In [112]:
model1 = keras.Sequential([
    keras.layers.Embedding(VOCAB_SIZE, 32,input_length=MAXLEN),
    keras.layers.SpatialDropout1D(0.2),
    keras.layers.Bidirectional(keras.layers.LSTM(16)),
    keras.layers.Dense(1, activation="sigmoid")
])

In [113]:
model1.summary()

Model: "sequential_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_5 (Embedding)      (None, 2000, 32)          144000    
_________________________________________________________________
spatial_dropout1d_5 (Spatial (None, 2000, 32)          0         
_________________________________________________________________
bidirectional_5 (Bidirection (None, 32)                6272      
_________________________________________________________________
dense_5 (Dense)              (None, 1)                 33        
Total params: 150,305
Trainable params: 150,305
Non-trainable params: 0
_________________________________________________________________


In [122]:
EPOCHS=30

early_stopping = keras.callbacks.EarlyStopping(
    monitor='val_acc', 
    verbose=1,
    patience=5,
    mode='max',
    restore_best_weights=True)

model1.compile(loss="binary_crossentropy",optimizer=keras.optimizers.RMSprop(1e-4), metrics=['accuracy'])

history1 = model1.fit(X_train, y_train, validation_data=(X_test, y_test), batch_size=32, epochs=EPOCHS, callbacks = [early_stopping])

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


In [125]:
results_val=model1.predict(X_test)
results_val = [int(i>0.5) for i in results_val]
print(f'roc_auc = {metrics.roc_auc_score(y_test,results_val)}')
print(classification_report(y_test,results_val))
print(f"Accuracy Score = {accuracy_score(y_test,results_val)}")

roc_auc = 0.7624117213028355
              precision    recall  f1-score   support

           0       0.88      0.98      0.93       179
           1       0.88      0.55      0.67        53

    accuracy                           0.88       232
   macro avg       0.88      0.76      0.80       232
weighted avg       0.88      0.88      0.87       232

Accuracy Score = 0.8793103448275862


The results are not as good as it can be. But we can get a generics idea of LSTM 

In [118]:
results_test=model1.predict(df_test)

[[0.02114462]
 [0.0543978 ]
 [0.0215206 ]
 [0.86225826]
 [0.01820859]
 [0.0129878 ]
 [0.0292798 ]
 [0.07236271]
 [0.02666892]
 [0.01426792]
 [0.0210984 ]
 [0.01608971]
 [0.9046669 ]
 [0.01750873]
 [0.8435663 ]
 [0.89162296]
 [0.01856   ]
 [0.01767087]
 [0.01872816]
 [0.01635309]
 [0.0786112 ]
 [0.0215809 ]
 [0.02177105]
 [0.75974834]
 [0.01912771]
 [0.01866411]
 [0.01724737]
 [0.02229598]
 [0.07647318]
 [0.02203828]
 [0.017695  ]
 [0.01179892]
 [0.01801868]
 [0.02027011]
 [0.01609847]
 [0.45788285]
 [0.8299889 ]
 [0.02101289]
 [0.01532494]
 [0.01162338]
 [0.02415241]
 [0.04548585]
 [0.02307042]
 [0.0457787 ]
 [0.8651436 ]
 [0.03285392]
 [0.01854251]
 [0.8220957 ]
 [0.01812096]
 [0.01721468]
 [0.02035712]
 [0.01188848]
 [0.03021155]
 [0.01682877]
 [0.18996653]
 [0.7391718 ]
 [0.02320146]
 [0.01392793]
 [0.02206663]
 [0.01779619]
 [0.01663175]
 [0.02217299]
 [0.05568039]
 [0.04332101]
 [0.02057016]
 [0.01965763]
 [0.01441625]
 [0.01868075]
 [0.04995175]
 [0.02114366]
 [0.01792423]
 [0.88

In [119]:
results_test = [int(i>0.5) for i in results_test]
len(results_test)

571

In [120]:
submission_nn = pd.read_csv('../input/zsdataset/test.csv',encoding = 'ISO-8859-1')
submission = pd.DataFrame()
submission['Index'] = submission_nn['Index']
submission['Patient_Tag'] = results_test
submission.to_csv('submission_nn.csv',index = False)