In [1]:
import numpy as np
import pandas as pd
import re

In [2]:
data=pd.read_csv("covid19_tweets.csv")
data.head(1)

Unnamed: 0,user_name,user_location,user_description,user_created,user_followers,user_friends,user_favourites,user_verified,date,text,hashtags,source,is_retweet
0,ᏉᎥ☻լꂅϮ,astroworld,wednesday addams as a disney princess keepin i...,2017-05-26 05:46:42,624,950,18775,False,2020-07-25 12:27:21,If I smelled the scent of hand sanitizers toda...,,Twitter for iPhone,False


In [3]:
data.drop(['user_name','user_location','user_description','user_created','user_followers','user_favourites','user_friends','user_verified','date','hashtags','source','is_retweet'],axis=1,inplace=True)

In [4]:
data.head(1)

Unnamed: 0,text
0,If I smelled the scent of hand sanitizers toda...


In [5]:
data['text']

0         If I smelled the scent of hand sanitizers toda...
1         Hey @Yankees @YankeesPR and @MLB - wouldn't it...
2         @diane3443 @wdunlap @realDonaldTrump Trump nev...
3         @brookbanktv The one gift #COVID19 has give me...
4         25 July : Media Bulletin on Novel #CoronaVirus...
                                ...                        
179103    Thanks @IamOhmai for nominating me for the @WH...
179104    2020! The year of insanity! Lol! #COVID19 http...
179105    @CTVNews A powerful painting by Juan Lucena. I...
179106    More than 1,200 students test positive for #CO...
179107    I stop when I see a Stop\n\n@SABCNews\n@Izinda...
Name: text, Length: 179108, dtype: object

In [6]:
len(data['text'])

179108

In [7]:
data.drop(data.tail(160000).index,inplace=True)    #becouse the data was too big 

In [8]:
tweets=data['text']

In [9]:
len(tweets)

19108

## data preprocessing 

In [10]:
lower_tweets=[]
for t in tweets :
    t=t.lower()
    lower_tweets.append(t)


In [11]:
def clean_text(text): 
    text=re.sub('(@)\w*',"",text)
    text=re.sub("https?:\/\/\S+","",text)
    text=re.sub("(\.|\,|-)+","",text)
    text=re.sub("(\n)\w*","",text)
    return text 


In [12]:
cleaned_tweets=[]
for t in lower_tweets :
    clean_t=clean_text(t)
    cleaned_tweets.append(clean_t)

In [13]:
cleaned_tweets[:3]

['if i smelled the scent of hand sanitizers today on someone in the past i would think they were so intoxicated that… ',
 "hey   and   wouldn't it have made more sense to have the players pay their respects to the a… ",
 '   trump never once claimed #covid19 was a hoax we all claim that this effort to… ']

In [14]:
splitted=[]
for t in cleaned_tweets:
        text=t.split() #sequncing
        splitted.append(text)

In [15]:
splitted[2]

['trump',
 'never',
 'once',
 'claimed',
 '#covid19',
 'was',
 'a',
 'hoax',
 'we',
 'all',
 'claim',
 'that',
 'this',
 'effort',
 'to…']

In [16]:
vocab=[]
for t in splitted:
    for word in t:
        if word not in vocab:
            vocab.append(word)

In [17]:
vocab[:10]

['if',
 'i',
 'smelled',
 'the',
 'scent',
 'of',
 'hand',
 'sanitizers',
 'today',
 'on']

In [18]:
vocab_size=len(vocab)  #40949
tweets_size=len(cleaned_tweets) #19108

## text Prepration

In [19]:
from tensorflow.keras.preprocessing.text import one_hot 

In [20]:
encoded_tweets=[one_hot(tweet ,vocab_size,filters='') for tweet in cleaned_tweets]

In [21]:
encoded_tweets[:3]

[[21047,
  39255,
  4131,
  13661,
  9259,
  19948,
  7651,
  28029,
  9298,
  26841,
  17579,
  22758,
  13661,
  29218,
  39255,
  19896,
  33489,
  35210,
  36081,
  37092,
  25664,
  3770],
 [27375,
  24570,
  37998,
  13463,
  22972,
  7021,
  39235,
  26190,
  37620,
  22972,
  13661,
  12132,
  16335,
  3801,
  31831,
  37620,
  13661,
  38675],
 [34929,
  1655,
  33676,
  32498,
  18350,
  19247,
  5310,
  32938,
  18386,
  26281,
  20160,
  31071,
  23362,
  15451,
  31484]]

In [22]:
largest=len(encoded_tweets[0])
smallest=len(encoded_tweets[0])
for t in encoded_tweets:
    x=len(t)
    if x>largest:
        largest=x
    if x<smallest:
        smallest=x

In [23]:
largest

46

In [24]:
smallest

1

In [39]:
from tensorflow.keras.preprocessing.sequence import pad_sequences
padded_tweets=pad_sequences(encoded_tweets,maxlen=46,padding='post')

In [40]:
padded_tweets[:3]

array([[21047, 39255,  4131, 13661,  9259, 19948,  7651, 28029,  9298,
        26841, 17579, 22758, 13661, 29218, 39255, 19896, 33489, 35210,
        36081, 37092, 25664,  3770,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [27375, 24570, 37998, 13463, 22972,  7021, 39235, 26190, 37620,
        22972, 13661, 12132, 16335,  3801, 31831, 37620, 13661, 38675,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0],
       [34929,  1655, 33676, 32498, 18350, 19247,  5310, 32938, 18386,
        26281, 20160, 31071, 23362, 15451, 31484,     0,     0,     0,
            0,     0,     0,     0,     0,     0,     0,     0,     0,
            0,     0,     0,     0,     0,   

In [41]:
seq_length =4
X = []
Y = []
lenght=0
for tweet in padded_tweets:
    for i in range(0,46-seq_length,1):
        input_ = tweet[i:i + seq_length]
        output = tweet[i + seq_length]
        X.append(input_)
        Y.append(output)

In [43]:
X[:6]

[array([21047, 39255,  4131, 13661]),
 array([39255,  4131, 13661,  9259]),
 array([ 4131, 13661,  9259, 19948]),
 array([13661,  9259, 19948,  7651]),
 array([ 9259, 19948,  7651, 28029]),
 array([19948,  7651, 28029,  9298])]

In [44]:
Y[:6]

[9259, 19948, 7651, 28029, 9298, 26841]

In [45]:
len(X)

802536

In [46]:
len(Y)

802536

In [47]:
train_X,train_y=X[:700000],Y[:700000]
test_X,test_y=X[700000:],Y[700000:]
len(train_X),len(test_X)

(700000, 102536)

In [48]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding,Dense,Dropout,LSTM

In [51]:
model=Sequential()
model.add(Embedding(input_dim=len(train_X),output_dim=8,input_length=20))
model.add(LSTM(200,activation='relu'))
model.add(Dense(20,activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(1,activation='sigmoid'))
model.summary()

Model: "sequential_3"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 20, 8)             5600000   
                                                                 
 lstm_2 (LSTM)               (None, 200)               167200    
                                                                 
 dense_4 (Dense)             (None, 20)                4020      
                                                                 
 dropout_2 (Dropout)         (None, 20)                0         
                                                                 
 dense_5 (Dense)             (None, 1)                 21        
                                                                 
Total params: 5,771,241
Trainable params: 5,771,241
Non-trainable params: 0
_________________________________________________________________


In [52]:
model.compile(loss='binary_crossentropy', optimizer='rmsprop')

In [53]:
model.fit(train_X, train_y, epochs=20, batch_size=128)

ValueError: Failed to find data adapter that can handle input: (<class 'list'> containing values of types {"<class 'numpy.ndarray'>"}), (<class 'list'> containing values of types {"<class 'numpy.int32'>"})