# Imports

In [4]:
import pandas as pd 
import numpy as np 
import tensorflow 
import keras 
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer 
from tqdm.auto import tqdm
import re
import nltk 
from nltk.stem import WordNetLemmatizer
nltk.download('wordnet')
from nltk.corpus import wordnet
import spacy
from tensorflow.keras.preprocessing.text import Tokenizer

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Honda\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Read_Data

In [5]:
Data=pd.read_csv('dataset.csv')
Data_prep_2=Data.copy()

In [6]:
Data.drop(columns='id',inplace=True)

In [7]:
Data.head()

Unnamed: 0,label,tweet
0,0,@user when a father is dysfunctional and is s...
1,0,@user @user thanks for #lyft credit i can't us...
2,0,bihday your majesty
3,0,#model i love u take with u all the time in ...
4,0,factsguide: society now #motivation


# EDA

Check_Counts

In [8]:
Data.label.value_counts()

0    29720
1     2242
Name: label, dtype: int64

Check_nulls

In [9]:
Data.isna().any()

label    False
tweet    False
dtype: bool

Check_duplicates

In [10]:
Data.duplicated().any()

True

In [11]:
Data.drop_duplicates(inplace=True)

# Preprocessing

As baseline processing we will do so :

1-check for mails to remove

2-check for websites to remove
 
3-remove non chars

4-Normalization

5-remove stop words

6-Lemmitization

In [12]:
Data['tweet'].apply(lambda x: re.findall('\S+@\S+',x)).any()

True

In [13]:
Data['tweet']=Data['tweet'].apply(lambda x: re.sub('\S+@\S+',' ',x))
Data['tweet']

0         @user when a father is dysfunctional and is s...
1        @user @user thanks for #lyft credit i can't us...
2                                      bihday your majesty
3        #model   i love u take with u all the time in ...
4                   factsguide: society now    #motivation
                               ...                        
31956    off fishing tomorrow @user carnt wait first ti...
31957    ate @user isz that youuu?ðððððð...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31961                     thank you @user for you follow  
Name: tweet, Length: 29530, dtype: object

In [14]:
Data['tweet'].apply(lambda x: re.findall('http\S+',x)).any()

False

In [15]:
Data['tweet']=Data['tweet'].apply(lambda x:re.sub('[^A-Za-z0-9]',' ',x))
Data['tweet']

0          user when a father is dysfunctional and is s...
1         user  user thanks for  lyft credit i can t us...
2                                      bihday your majesty
3         model   i love u take with u all the time in ...
4                   factsguide  society now     motivation
                               ...                        
31956    off fishing tomorrow  user carnt wait first ti...
31957    ate  user isz that youuu                      ...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31961                     thank you  user for you follow  
Name: tweet, Length: 29530, dtype: object

In [16]:
Data['tweet']=Data['tweet'].apply(lambda x: x.lower())

In [17]:
Data['tweet']

0          user when a father is dysfunctional and is s...
1         user  user thanks for  lyft credit i can t us...
2                                      bihday your majesty
3         model   i love u take with u all the time in ...
4                   factsguide  society now     motivation
                               ...                        
31956    off fishing tomorrow  user carnt wait first ti...
31957    ate  user isz that youuu                      ...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31961                     thank you  user for you follow  
Name: tweet, Length: 29530, dtype: object

In [18]:
Data['tweet']=Data['tweet'].apply(lambda x: re.sub('\d+','',x))
Data['tweet']

0          user when a father is dysfunctional and is s...
1         user  user thanks for  lyft credit i can t us...
2                                      bihday your majesty
3         model   i love u take with u all the time in ...
4                   factsguide  society now     motivation
                               ...                        
31956    off fishing tomorrow  user carnt wait first ti...
31957    ate  user isz that youuu                      ...
31958      to see nina turner on the airwaves trying to...
31959    listening to sad songs on a monday morning otw...
31961                     thank you  user for you follow  
Name: tweet, Length: 29530, dtype: object

In [19]:
from nltk.corpus import stopwords
nltk.download('stopwords')
stop_words=stopwords.words('english')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Honda\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [20]:
Data['tweet']=Data['tweet'].apply(lambda x : [word for word in x.split()  if word not in stop_words])
Data['tweet']

0        [user, father, dysfunctional, selfish, drags, ...
1        [user, user, thanks, lyft, credit, use, cause,...
2                                        [bihday, majesty]
3                      [model, love, u, take, u, time, ur]
4                        [factsguide, society, motivation]
                               ...                        
31956    [fishing, tomorrow, user, carnt, wait, first, ...
31957                              [ate, user, isz, youuu]
31958    [see, nina, turner, airwaves, trying, wrap, ma...
31959    [listening, sad, songs, monday, morning, otw, ...
31961                                [thank, user, follow]
Name: tweet, Length: 29530, dtype: object

In [21]:
pos_tags=[wordnet.VERB,wordnet.ADJ,wordnet.ADV,wordnet.NOUN]
lemmitaizer=WordNetLemmatizer()
for pos in pos_tags:
    Data['tweet']=Data['tweet'].apply(lambda x: [lemmitaizer.lemmatize(word,pos=pos) for word in x])
    
Data['tweet']

0        [user, father, dysfunctional, selfish, drag, k...
1        [user, user, thank, lyft, credit, use, cause, ...
2                                        [bihday, majesty]
3                      [model, love, u, take, u, time, ur]
4                        [factsguide, society, motivation]
                               ...                        
31956    [fish, tomorrow, user, carnt, wait, first, tim...
31957                              [eat, user, isz, youuu]
31958    [see, nina, turner, airwave, try, wrap, mantle...
31959    [listen, sad, song, monday, morning, otw, work...
31961                                [thank, user, follow]
Name: tweet, Length: 29530, dtype: object

In [22]:
Data['tweet']=Data['tweet'].apply(lambda x: ' '.join(x))

In [23]:
Data['tweet']

0        user father dysfunctional selfish drag kid dys...
1        user user thank lyft credit use cause offer wh...
2                                           bihday majesty
3                              model love u take u time ur
4                            factsguide society motivation
                               ...                        
31956        fish tomorrow user carnt wait first time year
31957                                   eat user isz youuu
31958    see nina turner airwave try wrap mantle genuin...
31959          listen sad song monday morning otw work sad
31961                                    thank user follow
Name: tweet, Length: 29530, dtype: object

# Feature extraction

1-count vectorizer

2-tfidf

3-tokenizer(binary,count,freq)

4-pretrained(glove)

In [24]:
from sklearn.model_selection import train_test_split
target=Data['label']
x_train, x_test ,y_train ,y_test = train_test_split(Data['tweet'],target,test_size=0.1,stratify=target)

In [25]:
x_train=x_train.tolist()

In [26]:
count_vect=CountVectorizer(max_features=10000,ngram_range=(1,1))
count_vect.fit(x_train)

CountVectorizer(max_features=10000)

In [27]:
count_vect.vocabulary_['york']

9924

In [28]:
len(count_vect.get_feature_names())



10000

In [29]:
X_train_count=count_vect.transform(x_train).todense()
X_test_count=count_vect.transform(x_test).todense()

In [30]:
pd.DataFrame(X_train_count,index=x_train,columns=count_vect.get_feature_names())



Unnamed: 0,aa,aap,aaron,ab,abandon,abasel,abba,abc,abe,ability,...,zoo,zoological,zootopia,zoro,zosh,zucchini,zuma,zurich,zydeco,zzzzzzzz
user user monroe amp nick best friend amp rosalee amp addie kelly amp babymonrosale grimm family nadalind,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
user real sweet successful,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
tomorrow introduce world ebony long journey follow dream somethingnew,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
user hope day special af bihday goodvibes,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
em excite en hide,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
turn cheek aka religious freedom could mean responder choose respond right,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
user gr loss condolence jo family yorkshire batley birstall jocoxmp sky brendancox brighton heave,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
user amp sick fuck look like pedophile friend user,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
user user talk greenwood tulsa ok city tulsa amp state ok yet acknowledge massacre history,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [31]:
print(X_train_count.shape)
print(X_test_count.shape)

(26577, 10000)
(2953, 10000)


In [32]:
from keras import layers,models
model_1=models.Sequential()
model_1.add(layers.Dense(128,activation='relu',input_shape=(10000,)))
model_1.add(layers.Dense(64,activation='relu'))
model_1.add(layers.Dense(32,activation='relu'))
model_1.add(layers.Dense(1,activation='sigmoid'))

In [33]:
from tensorflow.keras.optimizers import RMSprop
model_1.compile(optimizer= RMSprop(lr=0.0001),
              loss= keras.losses.binary_crossentropy,
              metrics= [keras.metrics.binary_accuracy,keras.metrics.Precision(),keras.metrics.Recall()])

  super(RMSprop, self).__init__(name, **kwargs)


In [34]:
model_1.fit(X_train_count,y_train,epochs=10,validation_data=(X_test_count,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x21e5da51c70>

# Tokenizer

In [27]:
tokenizer=Tokenizer(num_words=10000,oov_token='<OOV>')
tokenizer.fit_on_texts(x_train)

In [28]:
tokenizer.word_index

{'<OOV>': 1,
 'user': 2,
 'day': 3,
 'love': 4,
 'get': 5,
 'happy': 6,
 'amp': 7,
 'go': 8,
 'make': 9,
 'life': 10,
 'today': 11,
 'u': 12,
 'like': 13,
 'good': 14,
 'new': 15,
 'father': 16,
 'see': 17,
 'time': 18,
 'smile': 19,
 'people': 20,
 'bihday': 21,
 'one': 22,
 'friend': 23,
 'feel': 24,
 'come': 25,
 'look': 26,
 'work': 27,
 'want': 28,
 'wait': 29,
 'girl': 30,
 'thank': 31,
 'weekend': 32,
 'fun': 33,
 'week': 34,
 'think': 35,
 'need': 36,
 'family': 37,
 'summer': 38,
 'say': 39,
 'great': 40,
 'live': 41,
 'know': 42,
 'year': 43,
 'friday': 44,
 'thankful': 45,
 'positive': 46,
 'beautiful': 47,
 'first': 48,
 'morning': 49,
 'back': 50,
 'world': 51,
 'take': 52,
 'thing': 53,
 'watch': 54,
 'way': 55,
 'dad': 56,
 'tomorrow': 57,
 'best': 58,
 'home': 59,
 'even': 60,
 'really': 61,
 'orlando': 62,
 'sad': 63,
 'sunday': 64,
 'never': 65,
 'music': 66,
 'night': 67,
 'fathersday': 68,
 'blog': 69,
 'cute': 70,
 'right': 71,
 'trump': 72,
 'leave': 73,
 'find': 

In [29]:
x_train_tokenized=tokenizer.texts_to_matrix(x_train,mode='binary')
x_test_tokenized=tokenizer.texts_to_matrix(x_test,mode='binary')

In [30]:
pd.DataFrame(x_train_tokenized,index=x_train)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,9990,9991,9992,9993,9994,9995,9996,9997,9998,9999
come westhaven potchfest see brother user play pm today promise great,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
rasebud interior instacool smile style like follow tbt fun giditraffic user,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
bday entourage love instagood itsmybihday u cute follow photooftheday,0.0,1.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
nigeria try help son weneedhelp youareaparenttoo,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
cheeky write session user,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
n healthy take much surprise others surprise oneself great feat kristen haley twtl,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
saturday night live yay high five cecily strong bobby moynihan mc donalds,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user congratulation big guy put moan long,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
user b coz write sho review fav podcast user user publish user,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [31]:
from keras import layers,models
model_2=models.Sequential()
model_2.add(layers.Dense(128,activation='relu',input_shape=(10000,)))
model_2.add(layers.Dense(64,activation='relu'))
model_2.add(layers.Dense(32,activation='relu'))
model_2.add(layers.Dense(1,activation='sigmoid'))

In [46]:
from tensorflow.keras.optimizers import RMSprop,Adam
model_2.compile(optimizer= RMSprop(lr=0.0005),
              loss= keras.losses.binary_crossentropy,
              metrics= [keras.metrics.binary_accuracy,keras.metrics.Precision(),keras.metrics.Recall()])

  super(RMSprop, self).__init__(name, **kwargs)


In [47]:
model_2.fit(x_train_tokenized,y_train,epochs=10,validation_data=(x_test_tokenized,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11c9d5c8f40>

# Glove

In [39]:
nlp = spacy.load('en_core_web_md')

In [49]:
x_train_v=np.zeros((len(x_train),300))
x_test_v=np.zeros((len(x_test),300))

In [50]:
for i, doc in enumerate(nlp.pipe(x_train)):
    x_train_v[i, :] = doc.vector

for i, doc in enumerate(nlp.pipe(x_test)):
    x_test_v[i, :] = doc.vector

In [53]:
from keras import layers,models
model_3=models.Sequential()
model_3.add(layers.Dense(128,activation='relu',input_shape=(300,)))
model_3.add(layers.Dense(64,activation='relu'))
model_3.add(layers.Dense(32,activation='relu'))
model_3.add(layers.Dense(1,activation='sigmoid'))

In [54]:
from tensorflow.keras.optimizers import RMSprop,Adam
model_3.compile(optimizer= RMSprop(lr=0.0005),
              loss= keras.losses.binary_crossentropy,
              metrics= [keras.metrics.binary_accuracy,keras.metrics.Precision(),keras.metrics.Recall()])

In [56]:
model_3.fit(x_train_v,y_train,epochs=10,validation_data=(x_test_v,y_test))

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x11c7c151940>

# CNN_N_grams

In [50]:
def get_longest_sentence(data):
    max_len=0
    for text in data:
        text_len=len(text.split())
        max_len=max(text_len,max_len)
        
    return max_len

In [51]:
Data_cnn=Data.iloc[0:10000,:]

In [52]:
Data_cnn

Unnamed: 0,label,tweet
0,0,user father dysfunctional selfish drag kid dys...
1,0,user user thank lyft credit use cause offer wh...
2,0,bihday majesty
3,0,model love u take u time ur
4,0,factsguide society motivation
...,...,...
10566,0,hear ad radio blink summer tour promote dj pla...
10567,0,thankyoulordfohegiftoflife week everyone
10568,1,techjunkiejh alt right site plan fake black pe...
10569,0,tge dad father day


In [53]:
longest_input=get_longest_sentence(Data_cnn['tweet'])

In [54]:
longest_input

21

In [56]:
data_emb = np.zeros((len(Data['tweet']), longest_input, 300))
for i, text in enumerate(tqdm(nlp.pipe(Data['tweet']), total=len(Data['tweet']))):
    for j, token in enumerate(text):
        data_emb[i, j] = token.vector

HBox(children=(FloatProgress(value=0.0, max=29530.0), HTML(value='')))




IndexError: index 21 is out of bounds for axis 1 with size 21

#shape of data embedded (no of instances , max length of each sentence, 300 vct)

In [67]:
import tensorflow as tf

# define the network
inputs = tf.keras.layers.Input((longest_input, 300))
reshaped = tf.keras.layers.Reshape((longest_input, 300, 1))(inputs)


filters = [2, 3, 4]

# define the conv net
conv_1 = tf.keras.layers.Conv2D(100, (filters[0], 300), activation='relu')(reshaped)
conv_2 = tf.keras.layers.Conv2D(100, (filters[1], 300), activation='relu')(reshaped)
conv_3 = tf.keras.layers.Conv2D(100, (filters[2], 300), activation='relu')(reshaped)

# define max-pooling
pool_1 = tf.keras.layers.MaxPooling2D((longest_input - filters[0] + 1, 1), strides=(1,1))(conv_1)
pool_2 = tf.keras.layers.MaxPooling2D((longest_input - filters[1] + 1, 1), strides=(1,1))(conv_2)
pool_3 = tf.keras.layers.MaxPooling2D((longest_input - filters[2] + 1, 1), strides=(1,1))(conv_3)

# concatenate the convs
merged_tensor = tf.keras.layers.concatenate([pool_1, pool_2, pool_3], axis=1)

# now flatten them and add a dense layer
flatten = tf.keras.layers.Flatten()(merged_tensor)

# add a dense layer
clf = tf.keras.layers.Dense(100, activation='relu')(flatten)

# add final output
clf = tf.keras.layers.Dense(1, activation='sigmoid')(clf)

In [69]:
model = tf.keras.models.Model(inputs, clf)
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['acc'])
model.summary()

Model: "model"
__________________________________________________________________________________________________
 Layer (type)                   Output Shape         Param #     Connected to                     
 input_2 (InputLayer)           [(None, 37, 300)]    0           []                               
                                                                                                  
 reshape_1 (Reshape)            (None, 37, 300, 1)   0           ['input_2[0][0]']                
                                                                                                  
 conv2d_3 (Conv2D)              (None, 36, 1, 100)   60100       ['reshape_1[0][0]']              
                                                                                                  
 conv2d_4 (Conv2D)              (None, 35, 1, 100)   90100       ['reshape_1[0][0]']              
                                                                                              

In [78]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(data_emb, Data['label'], test_size=.2)

MemoryError: Unable to allocate 1.95 GiB for an array with shape (23624, 37, 300) and data type float64

In [None]:
model.fit(x_train, y_train, epochs=10,validation_data=(x_test,y_test))