In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from textblob import TextBlob
from wordcloud import WordCloud
from sklearn.feature_extraction.text import CountVectorizer
import gensim
import gensim.corpora as corpora
from gensim.utils import simple_preprocess
from gensim.models import CoherenceModel
import pyLDAvis
import pyLDAvis.gensim_models
import tensorflow 
from sklearn import decomposition
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.layers import LSTM
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Bidirectional
from tensorflow.keras.layers import Dropout

In [None]:
import tensorflow as tf
print(tf.__version__)

In [None]:
# Load the raw tweet table from a CSV file in the working directory.
dataset = pd.read_csv('Covid.csv')
# Second column of the table as a NumPy array.
# NOTE(review): X is rebound to the TF-IDF matrix further down before any
# visible cell reads this value — confirm this assignment is still needed.
X = dataset.iloc[:, 1].values

In [None]:
dataset.head()

In [None]:
import re
import nltk

from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
# Build the cleaned corpus: one lower-cased, stop-word-free, stemmed string
# per tweet, in the same order as dataset['tweet_text'].
#
# The stemmer and the stop-word set are created once, before the loop.
# Rebuilding the set from stopwords.words('english') for every single token
# repeats identical work for each word of each tweet.
ps = PorterStemmer()
english_stopwords = set(stopwords.words('english'))

corpus = []
for i in range(0, len( dataset['tweet_text'])):
    review = dataset['tweet_text'][i]
    # Strip hashtags, @mentions and '&'-prefixed tokens.
    review = re.sub(r'#[a-zA-Z0-9]+'," ", review)
    review = re.sub(r'@[a-zA-Z0-9]+', ' ', review)
    review = re.sub(r'&[a-zA-Z0-9]+', ' ', review)
    # Strip the retweet marker and links.
    review = re.sub(r'RT[\s]+', ' ', review)
    review = re.sub(r"https?:\/\/\S+|www\S+|", "", review)
    # Strip backslash-prefixed alphanumeric runs.
    review = re.sub(r'\\[a-zA-Z0-9]+', ' ', review)
    # Drops an alphanumeric run at the very start of the text.
    # NOTE(review): presumably this removes the leading `b` of a bytes repr
    # (b'...'); on plain text it removes the first word — confirm against the data.
    review = re.sub(r'^[a-zA-Z0-9]+'," ", review)
    # Everything that is not a letter or digit becomes a single space.
    review = re.sub(r'[^a-zA-Z0-9]+'," ", review)
    review = review.lower()
    review = review.split()
    review = [ps.stem(word) for word in review if word not in english_stopwords]
    review = ' '.join(review)
    corpus.append(review)

In [None]:
corpus[21]

In [None]:
corpus

In [None]:
# Word cloud over the whole cleaned corpus.
# The joined text lives in `all_text` rather than `all`: binding the name
# `all` would shadow the built-in all() for every later cell in the kernel.
all_text = ' '.join(corpus)
wordCloud = WordCloud(width = 1000,height = 500, random_state=0,background_color='White',max_font_size=119).generate(all_text)

plt.imshow(wordCloud,interpolation = "bilinear")
plt.axis('off')
plt.show()

In [None]:
def subjectivity(review):
    """Return the TextBlob subjectivity score of ``review``."""
    sentiment = TextBlob(review).sentiment
    return sentiment.subjectivity


def polarity(review):
    """Return the TextBlob polarity score of ``review``."""
    sentiment = TextBlob(review).sentiment
    return sentiment.polarity


# Score every cleaned tweet; both arrays are aligned with `corpus` by position.
n_tweets = len(dataset['tweet_text'])
Polarity = np.array([polarity(corpus[k]) for k in range(n_tweets)])
Subjectivity = np.array([subjectivity(corpus[k]) for k in range(n_tweets)])

In [None]:
len(Polarity)


In [None]:
Subjectivity

In [None]:
# Polarity vs. subjectivity for every tweet.
# One vectorised scatter call draws all points. Looping up to len(...) - 1
# leaves out the last tweet and issues a separate draw call per point.
plt.figure(figsize=(10,10))
plt.scatter(Polarity, Subjectivity, color='Blue')

plt.title("Sentiment Analysis")
plt.xlabel("Polarity")
plt.ylabel("Subjectivity")
plt.show()


In [None]:
# Binary sentiment label per tweet: 1 when the polarity is strictly positive,
# 0 otherwise (neutral and negative tweets share the 0 label).
# np.where produces exactly one label per polarity value, so the result always
# has one entry per row of `dataset`. Two independent `if` tests only guarantee
# that when one of them happens to fire for every value.
Sentiment = np.where(Polarity > 0, 1, 0)

In [None]:
print(Sentiment)

# Attach the derived labels to the tweet table as a new 'Sentiments' column.
dataset['Sentiments']=Sentiment

dataset.head()

In [None]:


# Class balance of the derived sentiment labels.
# Title and axis labels are applied to the Axes returned by pandas *after* the
# bars are drawn, so pandas' own axis labelling cannot replace them.
ax = dataset['Sentiments'].value_counts().plot(kind="bar")
ax.set_title("Sentiment Analysis")
ax.set_xlabel("Sentiment")
ax.set_ylabel("Counts")
plt.show()

In [None]:
def get_top_n_words(corpus, n=None):
    """Return the ``n`` most frequent tokens of ``corpus`` as ``(word, count)`` pairs.

    Counts come from a default ``CountVectorizer`` fitted on ``corpus``. The
    pairs are ordered by descending count; with ``n=None`` every vocabulary
    entry is returned.
    """
    counter = CountVectorizer()
    doc_term = counter.fit(corpus).transform(corpus)
    # Column sums = total occurrences of each vocabulary entry.
    totals = doc_term.sum(axis=0)
    ranked = [(term, totals[0, col]) for term, col in counter.vocabulary_.items()]
    ranked.sort(key=lambda pair: pair[1], reverse=True)
    return ranked[:n]

In [None]:
get_top_n_words(corpus, 10)

In [None]:
def gen_words(texts):
    """Tokenise every document in ``texts`` with gensim's ``simple_preprocess``.

    ``deacc=True`` is passed through to gensim. Returns a list holding one
    token list per input document, in input order.
    """
    return [gensim.utils.simple_preprocess(doc, deacc=True) for doc in texts]


data_words = gen_words(corpus)

print (data_words[0][:20])

In [None]:
# Dictionary built from the tokenised documents.
id2word = corpora.Dictionary(data_words)

# Bag-of-words representation of each document, in document order.
corpus_topic = [id2word.doc2bow(tokens) for tokens in data_words]

print (corpus_topic[0][:20])

# Dictionary entry stored under id 0.
word = id2word[0]
print (word)

In [None]:
# Fit a 5-topic LDA model on the bag-of-words corpus, using the dictionary
# built above to map ids back to tokens. random_state is fixed so repeated
# runs start from the same seed.
lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus_topic,
                                           id2word=id2word,
                                           num_topics=5,
                                           random_state=100,
                                           update_every=1,
                                           chunksize=100,
                                           passes=10,
                                           alpha="auto")

In [None]:
# Interactive view of the fitted LDA topics inside the notebook
# (R=10 is passed to pyLDAvis together with the "mmds" layout option).
pyLDAvis.enable_notebook()
vis = pyLDAvis.gensim_models.prepare(lda_model, corpus_topic, id2word, mds="mmds", R=10)
vis

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [None]:
# TF-IDF features for the cleaned tweets.
# stop_words must be the *string* 'english' to select scikit-learn's built-in
# English stop-word list. A set such as {'english'} is taken as a custom list
# that contains only the single word "english".
vectorizer = TfidfVectorizer(stop_words='english')
# Converted to a dense array: rows are tweets, columns are vocabulary terms.
X = vectorizer.fit_transform(corpus).toarray()

In [None]:
X[0]

In [None]:
# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
vectorizer.get_feature_names_out() if hasattr(vectorizer, "get_feature_names_out") else vectorizer.get_feature_names()

In [None]:
# Non-negative matrix factorisation of the TF-IDF matrix into 5 components.
clf = decomposition.NMF(n_components=5 , random_state=42)

# W1: per-document weights (one row per tweet, one column per component).
W1 = clf.fit_transform(X)
# H1: per-component weights over the vocabulary (one row per component).
H1 = clf.components_

In [None]:
H1

In [None]:
W1

In [None]:
# Number of top-weighted terms listed for each NMF topic.
num_words = 15

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when the installed version provides it and fall back otherwise.
if hasattr(vectorizer, "get_feature_names_out"):
    vocab = np.array(vectorizer.get_feature_names_out())
else:
    vocab = np.array(vectorizer.get_feature_names())


def top_words(t):
    """Return the ``num_words`` highest-weighted vocabulary terms of topic row ``t``.

    Terms are ordered from the largest weight downwards.
    """
    # argsort is ascending, so walk the last num_words positions backwards.
    return [vocab[i] for i in np.argsort(t)[:-num_words-1:-1]]


topic_words = [top_words(t) for t in H1]
topics = [' '.join(t) for t in topic_words]
topics

In [None]:
# Document-topic weights as a labelled table (weights rounded to 2 decimals).
n_topics = clf.n_components
col = ["Topic{}".format(k) for k in range(n_topics)]
doc = ["Doc{}".format(k) for k in range(len(corpus))]
df = pd.DataFrame(np.round(W1, 2), columns=col, index=doc)
# Dominant topic of each document: position of the largest rounded weight.
significant_topic = df.values.argmax(axis=1)
df['Reliable_Topic'] = significant_topic
df


In [None]:
corpus[2]

In [None]:
Y = Sentiment
print(Y)

In [None]:
from sklearn.model_selection import train_test_split
# 80/20 split of the TF-IDF features (X) and sentiment labels (Y); the seed is
# fixed so the same rows land in each part on every run.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.20, random_state = 0)

In [None]:
len(X_train)

In [None]:
from sklearn.svm import SVC
# Linear-kernel support vector classifier trained on the TF-IDF training split.
classifierSVC = SVC(kernel = 'linear', random_state = 0)
classifierSVC.fit(X_train, y_train)

In [None]:
y_pred = classifierSVC.predict(X_test)

In [None]:
y_pred

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_pred)

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

In [None]:
from sklearn.naive_bayes import GaussianNB
# Gaussian naive Bayes trained on the TF-IDF training split.
classifiernb = GaussianNB()
classifiernb.fit(X_train, y_train)

In [None]:
y_nb = classifiernb.predict(X_test)

In [None]:
y_nb

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_nb)

In [None]:
from sklearn.neighbors import KNeighborsClassifier
# 3-nearest-neighbour classifier on the TF-IDF training split
# (Minkowski metric with p=2).
classifierknn = KNeighborsClassifier(n_neighbors=3 ,metric= 'minkowski' , p=2 )
classifierknn.fit(X_train , y_train)

In [None]:
y_knn = classifierknn.predict(X_test)

In [None]:
y_knn

In [None]:
accuracy_score(y_test,y_knn)

In [None]:
from sklearn.ensemble import RandomForestClassifier
# Random forest of 10 entropy-criterion trees trained on the TF-IDF training split.
classifierRF = RandomForestClassifier(n_estimators = 10, criterion = 'entropy', random_state = 0)
classifierRF.fit(X_train, y_train)

In [None]:
y_RF=classifierRF.predict(X_test)
y_RF

In [None]:
accuracy_score(y_RF,y_test)

In [None]:
from sklearn.tree import DecisionTreeClassifier
# Single entropy-criterion decision tree trained on the TF-IDF training split,
# followed by its predictions for the TF-IDF test split.
classifierDT = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifierDT.fit(X_train, y_train)
y_DT=classifierDT.predict(X_test)
y_DT


In [None]:
accuracy_score(y_DT,y_test)

In [None]:
y_test

In [None]:
from tensorflow.keras.preprocessing.text import one_hot

In [None]:
# Size of the integer index space passed to Keras' one_hot encoder.
voc_size=10000
# Encode each cleaned tweet as a list of integers, one per token.
# NOTE(review): one_hot is hash-based; the indices may not be stable across
# kernel restarts — confirm before persisting them or a trained model.
onehot_repr=[one_hot(words,voc_size)for words in corpus] 
print(onehot_repr)

In [None]:
from tensorflow.keras.layers import Embedding
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential

In [None]:
# Fixed sequence length fed to the network.
sent_length=25
# Bring every encoded tweet to sent_length entries; padding='pre' puts the
# padding in front of the tokens.
embedded_docs=pad_sequences(onehot_repr,padding='pre',maxlen=sent_length)
embedded_docs

In [None]:
# Stand-alone model holding a single 10-dimensional Embedding layer.
# NOTE(review): no visible cell fits or calls this model before `model` is
# rebound to a Word2Vec instance further down — confirm it is still needed.
model=Sequential()
model.add(Embedding(voc_size,10,input_length=sent_length))
model.compile('adam','mse')

In [None]:
X_WE = np.array(embedded_docs)


In [None]:
X_WE[0]

In [None]:
# Turn each cleaned tweet into a list of tokens for Word2Vec.
# Entries that are already token lists are passed through unchanged, so
# re-running this cell does not fail with
# "'list' object has no attribute 'split'".
corpus = [doc.split() if isinstance(doc, str) else doc for doc in corpus]
    

In [None]:
corpus[1]

In [None]:
import nltk
nltk.download('punkt')

In [None]:
from gensim.models import Word2Vec
# Train word vectors on the tokenised tweets (min_count=2 is gensim's
# frequency threshold for keeping a token).
# NOTE(review): this rebinds `model`, which an earlier cell used for a Keras
# Sequential model.
model = Word2Vec(corpus, min_count = 2)


In [None]:
vector = model.wv['covid']
vector

In [None]:
class Sequencer():
    """Turn whitespace-separated text into a fixed-length, flattened embedding vector.

    Parameters
    ----------
    all_words : list of str
        Every token of the corpus, repetitions included; used for the
        frequency counts.
    max_words : int or None
        How many of the most frequent distinct tokens are kept in ``vocab``
        (``None`` keeps all of them).
    seq_len : int
        Number of token slots per text. Shorter texts are zero-padded at the
        end, longer texts are cut after ``seq_len`` tokens.
    embedding_matrix : mapping
        Object supporting ``embedding_matrix[token]`` that returns a 1-D
        vector and raises ``KeyError`` for unknown tokens (for example gensim
        ``KeyedVectors`` or a plain dict).

    Attributes
    ----------
    word_cnts : dict
        Token -> number of occurrences in ``all_words``.
    vocab : list of str
        The ``max_words`` most frequent tokens, most frequent first.
    embed_dim : int
        Width of one embedding vector, used for the zero padding.
    """

    def __init__(self,
                 all_words,
                 max_words,
                 seq_len,
                 embedding_matrix
                ):

        self.seq_len = seq_len
        self.embed_matrix = embedding_matrix
        # gensim KeyedVectors expose the vector width as `vector_size`; fall
        # back to 100 (the width the padding used to hard-code) for objects
        # that do not.
        self.embed_dim = getattr(embedding_matrix, "vector_size", 100)

        # One pass over the tokens instead of re-scanning the whole token
        # list once per distinct word.
        self.word_cnts = {}
        for word in all_words:
            self.word_cnts[word] = self.word_cnts.get(word, 0) + 1

        # Most frequent first. sorted() is stable, so tokens with equal counts
        # keep their first-seen order. An empty token list yields an empty vocab.
        ranked = sorted(self.word_cnts, key=self.word_cnts.get, reverse=True)
        self.vocab = ranked[:max_words]

    def textToVector(self,text):
        """Return ``text`` as a flat array of ``seq_len * embed_dim`` values.

        The first ``seq_len`` tokens of ``text`` are looked up in the
        embedding; tokens without an embedding are skipped. The remaining
        slots are filled with zero vectors.
        """
        tokens = text.split()
        vec = []
        # Slice to seq_len directly: every token of a short text is used and
        # long texts are cut at exactly seq_len tokens.
        for tok in tokens[:self.seq_len]:
            try:
                vec.append(self.embed_matrix[tok])
            except KeyError:
                # Token has no embedding (e.g. below the min_count cut-off).
                continue

        last_pieces = self.seq_len - len(vec)
        for _ in range(last_pieces):
            vec.append(np.zeros(self.embed_dim))

        return np.asarray(vec).flatten()

In [None]:
# Build the sequencer from every token of the tokenised corpus (flattened),
# keeping the 1200 most frequent words, 15 token slots per tweet, and the
# trained Word2Vec vectors as the embedding lookup.
sequencer = Sequencer(all_words = [token for seq in corpus for token in seq],
              max_words = 1200,
              seq_len = 15,
              embedding_matrix = model.wv
             )

In [None]:
# One flattened embedding vector per tweet. Each token list is re-joined with
# spaces because textToVector splits its input on whitespace itself.
x_vecs = np.asarray([sequencer.textToVector(" ".join(seq)) for seq in corpus])
print(x_vecs.shape)

In [None]:
from sklearn.decomposition import PCA
# Fit a 90-component PCA on the flattened embedding vectors and report how
# much of the variance those components retain in total.
pca_model = PCA(n_components=90)
pca_model.fit(x_vecs)
print("Sum of variance ratios: ",sum(pca_model.explained_variance_ratio_))

In [None]:
# Project the embedding vectors onto the fitted PCA components.
x_comps = pca_model.transform(x_vecs)
# Not the last expression of the cell, so the notebook does not display it.
x_comps.shape
x_comps[0]

In [None]:
# 80/20 split of the PCA-reduced Word2Vec features.
# NOTE: this rebinds y_train / y_test, which were first produced by the TF-IDF
# split (random_state=0). From here on they pair with x_train_w2v /
# x_test_w2v, not with X_train / X_test.
x_train_w2v,x_test_w2v,y_train,y_test = train_test_split(x_comps,Y,test_size=0.2,random_state=42)
x_test_w2v

In [None]:
# Linear-kernel SVC trained and evaluated on the Word2Vec/PCA features.
classifierSVC_w2v = SVC(kernel = 'linear', random_state = 0)
classifierSVC_w2v.fit(x_train_w2v, y_train)
y_SVC_w2v = classifierSVC_w2v.predict(x_test_w2v)
y_SVC_w2v

In [None]:
accuracy_score(y_SVC_w2v,y_test)

In [None]:
# Gaussian naive Bayes on the Word2Vec/PCA features.
classifiernb_w2v = GaussianNB()
classifiernb_w2v.fit(x_train_w2v, y_train)
# Predict with the naive Bayes model itself rather than classifierSVC_w2v, so
# the accuracy reported below belongs to this classifier.
y_nb_w2v = classifiernb_w2v.predict(x_test_w2v)
accuracy_score(y_nb_w2v,y_test)

In [None]:
# Random forest of 12 entropy-criterion trees on the Word2Vec/PCA features.
classifierRF_w2v = RandomForestClassifier(n_estimators =12, criterion = 'entropy', random_state = 0)
classifierRF_w2v.fit(x_train_w2v, y_train)
y_rf_w2v = classifierRF_w2v.predict(x_test_w2v)
accuracy_score(y_rf_w2v,y_test)

In [None]:
# Decision tree on the Word2Vec/PCA features.
# Fit and predict on x_train_w2v / x_test_w2v: those are the rows that y_train
# and y_test belong to at this point. X_train / X_test come from a different
# split (other seed) of the TF-IDF matrix, so pairing them with these labels
# would train and score on mismatched rows.
classifierDT_w2v = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifierDT_w2v.fit(x_train_w2v, y_train)
y_DT_w2v=classifierDT_w2v.predict(x_test_w2v)
accuracy_score(y_DT_w2v,y_test)

In [None]:

# Binary sentiment classifier: embedding -> bidirectional LSTM -> dropout ->
# single sigmoid unit.
embedding_vector_features=40
model=Sequential()
model.add(Embedding(voc_size,embedding_vector_features))
model.add(Bidirectional(LSTM(4)))
model.add(Dropout(0.25))
model.add(Dense(1,activation='sigmoid'))
model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])
# summary() prints the table itself and returns None; wrapping it in print()
# would add a stray "None" line to the output.
model.summary()

In [None]:
# Network inputs (padded integer sequences) and targets (sentiment labels)
# as NumPy arrays.
X_final=np.array(embedded_docs)
y_final=np.array(Y)

In [None]:
y_final.shape

In [None]:
# 80/20 split for the LSTM, kept under its own variable names so it does not
# touch the splits used by the scikit-learn models.
X_train_LSTM, X_test_LSTM, y_train_lstm, y_test_lstm = train_test_split(X_final, y_final, test_size=0.2, random_state=42)

In [None]:
X_train_LSTM

In [None]:
# Train for 70 epochs in batches of 10.
# NOTE(review): the test split is also passed as validation data, and the same
# split is used for the accuracy report below — confirm that is intended.
model.fit(X_train_LSTM,y_train_lstm,validation_data=(X_test_LSTM,y_test_lstm),epochs=70,batch_size=10)

In [None]:
# Sequential.predict_classes() was removed in TensorFlow 2.6. Thresholding the
# sigmoid output at 0.5 gives the same class labels for this single-unit
# output and also works on older releases.
y_pred_LSTM = (model.predict(X_test_LSTM) > 0.5).astype("int32")

In [None]:
y_pred_LSTM


In [None]:
y_test_lstm

In [None]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test_lstm,y_pred_LSTM)

In [None]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test_lstm,y_pred_LSTM)