In [25]:
import re
import string
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score,classification_report,f1_score
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')


In [26]:
# Read the train and test CSV files from the competition input directory.
train_df, test_df = (
    pd.read_csv('../input/nlp-getting-started/train.csv'),
    pd.read_csv('../input/nlp-getting-started/test.csv'),
)


In [27]:
# Replace NaN in `keyword` and `location` with each frame's own most frequent value.
# The result is assigned back to the column rather than calling
# fillna(inplace=True) on `frame[col]`: that form mutates a column selection,
# which pandas warns about and which leaves the frame untouched under Copy-on-Write.
train_df['keyword'] = train_df['keyword'].fillna(train_df['keyword'].mode()[0])
test_df['keyword'] = test_df['keyword'].fillna(test_df['keyword'].mode()[0])

train_df['location'] = train_df['location'].fillna(train_df['location'].mode()[0])
test_df['location'] = test_df['location'].fillna(test_df['location'].mode()[0])

In [28]:
# Stack train and test into one frame, tagging every row with an `ind` column
# ('train' / 'test') so the two parts can be separated again later.
df = pd.concat(
    [frame.assign(ind=tag) for tag, frame in (('train', train_df), ('test', test_df))]
)


In [29]:
def remove_html(text):
    """Return `text` with every `<...>` span removed."""
    # Non-greedy match: each '<' pairs with the nearest following '>'.
    return re.sub(r'<.*?>', '', text)


In [30]:
# Strip HTML tags from every entry of the text column.
df['text'] = df['text'].apply(remove_html)



In [31]:
def remove_url(text):
    """Return `text` with `http://`, `https://` and `www.` links removed."""
    # A link runs from its prefix up to the next whitespace character.
    return re.sub(r'https?://\S+|www\.\S+', '', text)

In [32]:
# Strip URLs from every entry of the text column.
df['text'] = df['text'].apply(remove_url)


In [33]:
def remove_emoji(text):
    """Return `text` with emoji and pictographic symbols removed.

    Each run of characters from the code-point ranges listed below is deleted.
    The ranges are deliberately narrow: a single span from U+24C2 to U+1F251
    would also swallow CJK, Hangul, kana and most other non-Latin scripts,
    deleting ordinary words rather than just emoji.
    """
    emoji_pattern = re.compile(
        '['
        '\U0001F600-\U0001F64F'  # emoticons
        '\U0001F300-\U0001F5FF'  # symbols & pictographs
        '\U0001F680-\U0001F6FF'  # transport & map symbols
        '\U0001F1E0-\U0001F1FF'  # flags (regional indicators)
        '\U00002702-\U000027B0'  # dingbats
        '\U00002600-\U000026FF'  # miscellaneous symbols
        '\U000024C2'             # circled latin capital letter M
        '\U0001F200-\U0001F251'  # enclosed ideographic supplement
        ']+',
        flags=re.UNICODE,
    )
    return emoji_pattern.sub('', text)

In [34]:
# Strip emoji from every entry of the text column.
df['text'] = df['text'].apply(remove_emoji)


In [35]:
def remove_punct(text):
    """Return `text` with every character of `string.punctuation` deleted."""
    # Mapping a code point to None tells str.translate to drop that character.
    drop_table = dict.fromkeys(map(ord, string.punctuation))
    return text.translate(drop_table)


In [36]:
# Strip punctuation from every entry of the text column.
df['text'] = df['text'].apply(remove_punct)


In [37]:
# Split the combined frame back into its train and test rows using the `ind`
# indicator column, then discard that column from both parts.
train = df.loc[df['ind'] == 'train'].drop(columns='ind')
test = df.loc[df['ind'] == 'test'].drop(columns=['ind'])

In [38]:
def splits(x, y):
    """Split `x` and `y` with `train_test_split`, holding out 20% with seed 1.

    Returns whatever `train_test_split` returns for two inputs; the caller
    unpacks it as x_train, x_test, y_train, y_test.
    """
    parts = train_test_split(x, y, test_size=0.2, random_state=1)
    return parts

In [39]:
def scores(mdl, X=None, y=None, threshold=0.5):
    """Print ROC AUC, a classification report and F1 for `mdl` on a held-out set.

    Parameters
    ----------
    mdl : object with a `predict(X)` method
        `predict` may return hard 0/1 labels or scores/probabilities.
    X, y : array-like, optional
        Evaluation features and labels. When omitted, the notebook-level
        `X_test` / `y_test` are used, as before.
    threshold : float, default 0.5
        Cut-off for turning `predict` output into 0/1 labels. Hard 0/1 labels
        pass through unchanged.
    """
    X = X_test if X is None else X
    y = y_test if y is None else y

    # Flatten so that (n, 1) model outputs are handled like 1-D predictions.
    raw = np.asarray(mdl.predict(X)).ravel()
    # classification_report / f1_score need discrete labels; feeding them
    # continuous scores raises, so threshold first.
    labels = (raw > threshold).astype(int)

    print(roc_auc_score(y, raw))
    print(classification_report(y, labels, digits=5))
    print(f1_score(y, labels))

In [40]:
cv = CountVectorizer()  # CountVectorizer with default settings, used to vectorise the text column


In [41]:
# Fit the vectoriser on the training text, then encode the test text with that same fitted vectoriser.
cv_train=cv.fit_transform(train['text'])     
cv_test=cv.transform(test['text'])

In [42]:
# Display the vectorised training text as a dense matrix.
# NOTE(review): this densifies the whole matrix only to show its repr; `cv_train.shape` would be cheaper — confirm before changing.
cv_train.todense()             


matrix([[0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        ...,
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0],
        [0, 0, 0, ..., 0, 0, 0]])

In [43]:
# One-hot encode `keyword`. The left join reindexes the test dummies onto the
# training dummy columns, so both frames end up with the same columns.
gd, gdt = pd.get_dummies(train_df['keyword']).align(
    pd.get_dummies(test_df['keyword']), join='left', axis=1
)

# Same encoding and alignment for `location`.
gd2, gdt2 = pd.get_dummies(train_df['location']).align(
    pd.get_dummies(test_df['location']), join='left', axis=1
)

In [44]:
# Convert the sparse count matrices to dense DataFrames whose columns are the
# vectoriser's feature names.
training, testing = (
    pd.DataFrame(counts.toarray(), columns=cv.get_feature_names_out())
    for counts in (cv_train, cv_test)
)

In [45]:
# Join the vectorised text with the keyword and location dummy columns, side by side.
training = pd.concat((training, gd, gd2), axis='columns')
testing = pd.concat((testing, gdt, gdt2), axis='columns')

In [46]:
# Put zeros in the cells left empty by aligning the dummy columns.
training = training.fillna(0)
testing = testing.fillna(0)

In [47]:
# Hold out part of the labelled data as a validation set.
X_train, X_test, y_train, y_test = splits(x=training, y=train['target'])


In [48]:
from tensorflow import keras
from tensorflow.keras import layers


In [49]:
# Number of feature columns in the training matrix.
# NOTE(review): `input_dim` is not referenced again in the visible cells; `input_shape` later holds the same value.
input_dim= X_train.shape[1]


In [50]:
# Show (rows, columns) of the training matrix.
X_train.shape


(6090, 21360)

In [51]:
# Number of feature columns; passed to the first Dense layer as its input size.
input_shape=X_train.shape[1]


In [52]:
# Two-layer network: a 5-unit ReLU hidden layer followed by a single sigmoid output unit.
model = keras.Sequential()
model.add(layers.Dense(units=5, activation='relu', input_shape=[input_shape]))
model.add(layers.Dense(units=1, activation='sigmoid'))

In [53]:
# Convert the splits and the test features to NumPy arrays.
X_train, y_train, X_test, y_test, testing = (
    np.asarray(obj) for obj in (X_train, y_train, X_test, y_test, testing)
)

In [54]:
# Confirm that the conversion to a NumPy array took effect.
type(X_train)


numpy.ndarray

In [55]:
# Configure training: Adam optimiser, binary cross-entropy loss, binary accuracy as the reported metric.
model.compile(
    loss='binary_crossentropy',
    optimizer='adam',
    metrics=['binary_accuracy'],
)


In [56]:
# Cast features and labels to float64 before fitting.
X_train, y_train, X_test, y_test = (
    arr.astype('float64') for arr in (X_train, y_train, X_test, y_test)
)

# Train for 20 epochs in batches of 500, evaluating on the held-out split after each epoch.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=500,
    epochs=20,
    validation_data=(X_test, y_test),
)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [57]:
# Cast the test features to float64, then run them through the trained model.
testing = testing.astype('float64')
p = model.predict(x=testing)



In [58]:
# Round the sigmoid outputs to the nearest integer to get 0/1 labels.
pred = p.round().astype(int)


In [59]:
# Flatten the predictions to 1-D. `-1` lets NumPy infer the length, so this
# does not break when the number of test rows differs from a hard-coded count.
pred = pred.reshape(-1)


In [60]:
# Pair each test id with its predicted label and write the submission file.
submission = pd.DataFrame({'id': test['id']}).assign(target=pred)
submission.to_csv('submission.csv', index=False)

In [61]:
# Display the submission frame as a visual check.
submission


Unnamed: 0,id,target
0,0,1
1,2,1
2,3,1
3,9,0
4,11,1
...,...,...
3258,10861,0
3259,10865,1
3260,10868,1
3261,10874,1
