# **Importing Libraries**

In [None]:
import numpy as np
import pandas as pd
import re
import nltk 
# Fetch the NLTK resources used by this notebook. These calls run every time
# the cell executes and need network access on a fresh environment.
# NOTE(review): presumably these are the corpora/models TextBlob needs for the
# noun-phrase extraction and sentiment scoring further down — confirm.
nltk.download('brown')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger') 
from textblob import TextBlob
import matplotlib.pyplot as plt
import plotly.graph_objs as go
from sklearn.feature_extraction.text import TfidfVectorizer 
from sklearn.metrics.pairwise import cosine_similarity
from textblob import TextBlob
import plotly.express as px

# **Loading Datasets**

In [None]:
def remove_URL(sample):
    """Return `sample` with every URL-like token stripped out.

    A token is removed when it starts with ``http`` and runs up to the next
    whitespace character; the surrounding whitespace is left untouched.

    Parameters
    ----------
    sample : str
        Text to clean.

    Returns
    -------
    str
        The text with all matching tokens replaced by the empty string.
    """
    url_pattern = re.compile(r"http\S+")
    cleaned = url_pattern.sub("", sample)
    return cleaned

In [None]:
# Raw CSV exports hosted in the project's GitHub repository.
DATASETS_URL = "https://raw.githubusercontent.com/DeepanNarayanaMoorthy/What-Does-BLM-Say/master/datasets/"

# `error_bad_lines=False` was deprecated in pandas 1.3 and removed in 2.0, so
# it raises TypeError on current pandas. `on_bad_lines="warn"` is the
# replacement with the same effect: malformed rows are skipped and reported
# instead of aborting the load.
blm_tweets = pd.read_csv(DATASETS_URL + "blm_tweets.csv", on_bad_lines="warn")
blm_news = pd.read_csv(DATASETS_URL + "blm_news.csv", on_bad_lines="warn")

In [None]:
# Peek at the first five tweet rows.
blm_tweets.head(5)

Unnamed: 0.1,Unnamed: 0,Date,Name,Text,Hashtags
0,0,2020-07-31 23:52:04+00:00,Willimperial,😂 “The Liar Tweets Tonight” #resist #resistanc...,#resist #resistance #DumpTrump2020 #DumpTrump ...
1,1,2020-07-31 23:47:44+00:00,robbystarbuck,Going to go tell BLM protestors I’m Republican...,
2,2,2020-07-31 23:34:25+00:00,DemocracyInn,Wow The @WallOfDocs has joined the Portland pr...,#BlackLivesMatter #BlackLivesMatter #ImpotusAm...
3,3,2020-07-31 23:33:39+00:00,MDconnected,"If our flag is racists, why did they cover Joh...",#BLM
4,4,2020-07-31 23:30:20+00:00,GamingMegaverse,Whether the league does it or not I will. #Kne...,#Kneel4Hockey #BLM


In [None]:
# Strip URLs from both corpora. Values are coerced with str() in BOTH frames:
# a missing Text cell is read as a float NaN, and passing that straight to
# re.sub raises TypeError (previously only the tweets were protected).
blm_news["Text"] = [remove_URL(str(text)) for text in blm_news["Text"]]
blm_tweets["Text"] = [remove_URL(str(text)) for text in blm_tweets["Text"]]

# keep=False discards every copy of a repeated news text (not just the extras),
# then the index is renumbered from 0. Chained instead of inplace=True so the
# cell reads as a single transformation.
blm_news = (
    blm_news
    .drop_duplicates(subset="Text", keep=False)
    .reset_index(drop=True)
)
blm_news[:5]

Unnamed: 0.1,Unnamed: 0,Date,Name,Text,Hashtags
0,0,2020-06-10 17:42:40+00:00,BBCNews,Reni Eddo-Lodge breaks book chart record amid ...,
1,1,2020-07-25 13:59:02+00:00,TheEconomist,They are planning BLM style protests in Westmi...,
2,2,2020-06-20 11:28:01+00:00,TheEconomist,Many young activists ignored BLM UK’s advice n...,
3,3,2020-05-25 16:00:13+00:00,TheEconomist,“The general public has become a bit exhausted...,
4,9,2020-06-11 09:29:11+00:00,Reuters,Motor racing: McLaren's Norris loses followers...,


# **Identifying Most Prominent Words**

In [None]:
# Gather every noun phrase TextBlob finds across all tweets into one flat
# array, then tally how often each distinct phrase occurs.
nouns = np.array([
    phrase
    for tweet_text in blm_tweets["Text"]
    for phrase in TextBlob(tweet_text).noun_phrases
])
nouns_values, nouns_counts = np.unique(nouns, return_counts=True)

In [None]:
# Frequency-of-frequencies: the first array lists the distinct occurrence
# counts, the second how many noun phrases have each of those counts.
np.unique(nouns_counts, return_counts=True)

(array([   1,    2,    3,    4,    5,    6,    7,    8,    9,   10,   11,
          12,   13,   14,   15,   16,   17,   18,   19,   20,   21,   22,
          23,   24,   25,   26,   27,   28,   29,   30,   33,   35,   36,
          37,   38,   40,   41,   45,   46,   47,   50,   51,   53,   55,
          56,   58,   59,   63,   64,   66,   74,   75,   79,   80,   81,
          87,   90,  103,  105,  112,  136,  140,  231,  353,  441,  476,
         669,  923, 1052, 1056, 1236]),
 array([11337,  1249,   412,   198,   104,    88,    55,    42,    46,
           24,    20,    18,    13,    16,    12,    13,    11,    19,
            3,     6,     6,     3,     7,     3,     3,     4,     3,
            1,     1,     2,     1,     2,     3,     1,     2,     1,
            1,     1,     1,     2,     1,     1,     2,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     1,     1,     1,     1,     1,     1,     1,     1,
            1,     

## **Removing Least Occurring Nouns**

In [None]:
# Noun phrases seen this many times or fewer are dropped as noise.
MAX_RARE_COUNT = 10

# A single boolean mask replaces ten successive np.delete passes and is
# applied to both parallel arrays so values and counts stay aligned.
# Re-running the cell is a no-op because the survivors all exceed the cut-off.
frequent_mask = nouns_counts > MAX_RARE_COUNT
nouns_counts = nouns_counts[frequent_mask]
nouns_values = nouns_values[frequent_mask]

# Same frequency-of-frequencies summary as before, now without the rare tail.
np.unique(nouns_counts, return_counts=True)

(array([  11,   12,   13,   14,   15,   16,   17,   18,   19,   20,   21,
          22,   23,   24,   25,   26,   27,   28,   29,   30,   33,   35,
          36,   37,   38,   40,   41,   45,   46,   47,   50,   51,   53,
          55,   56,   58,   59,   63,   64,   66,   74,   75,   79,   80,
          81,   87,   90,  103,  105,  112,  136,  140,  231,  353,  441,
         476,  669,  923, 1052, 1056, 1236]),
 array([20, 18, 13, 16, 12, 13, 11, 19,  3,  6,  6,  3,  7,  3,  3,  4,  3,
         1,  1,  2,  1,  2,  3,  1,  2,  1,  1,  1,  1,  2,  1,  1,  2,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,  1,
         1,  1,  1,  1,  1,  1,  1,  1,  1,  1]))

In [None]:
# Sanity check: the two parallel arrays must have the same length.
print(len(nouns_values), len(nouns_counts), sep="\n")
# Indices that order the phrases by ascending count.
count_sort_ind = nouns_counts.argsort()

211
211


## **Plotting Frequencies of Nouns**

In [None]:
# Reorder phrases and counts with count_sort_ind and convert to plain Python
# lists for plotting.
x = nouns_values[count_sort_ind].tolist()
y = nouns_counts[count_sort_ind].tolist()

# The colour scale's upper bound is derived from the data rather than the
# previously hard-coded 1236, so it stays correct if the dataset or the
# rare-noun cut-off changes. default=0 keeps an empty result from raising.
fig = go.Figure(go.Bar(
    x=x,
    y=y,
    name='Nouns',
    marker=dict(
        cmax=max(y, default=0),
        cmin=0,
        color=y,
        colorbar=dict(title="Scale"),
        colorscale="Viridis",
    ),
))
fig.update_layout(title_text='Nouns Distribution : Total words = '+str(len(nouns_values))+' : Please Zoom to See All Words' )
# Also save a standalone interactive copy next to the notebook.
fig.write_html("nouns_distribution.html")
fig

# **Finding Similar Tweets**

In [None]:
# Shorthand for the tweet text column, used by the similarity and sentiment
# sections below.
blm_tweets_text=blm_tweets['Text']

In [None]:
# Vectorise the tweets with TF-IDF using the vectorizer's default settings;
# each row of the resulting matrix corresponds to one tweet, in order.
tfidf_vectorizer = TfidfVectorizer() 
tfidf_matrix = tfidf_vectorizer.fit_transform(blm_tweets_text)
# Display (number of tweets, vocabulary size).
tfidf_matrix.shape

(4779, 14732)

In [None]:
# Pairwise cosine similarity between every pair of tweets. With the second
# argument omitted, scikit-learn compares the matrix against itself.
c_sim = cosine_similarity(tfidf_matrix)

In [None]:
# Check which container type the similarity matrix came back as.
type(c_sim)

numpy.ndarray

## **Top 25 Tweets in Similarity**

In [None]:
# Row sums of the similarity matrix: a tweet scores high when it is similar
# to many other tweets.
c_sim_sum = np.sum(c_sim, axis=1)
# argsort is ascending, so the last 25 positions are the highest scorers.
top_25_idx = np.argsort(c_sim_sum)[-25:]
blm_tweets_text[top_25_idx]

2063    Jews United for Justice affirms unequivocally ...
4494    Oh my. "While my time here has now come to an ...
4559    Anyone else tired of the "tHE pROTeST iSN't aB...
3946    I can't breathe. The final words of George Flo...
4568    The worst part of this is that white lives DO ...
556     This is the passion that rocked the nation whe...
3670    Demonstrators march past the US Embassy in Lon...
2453    Today, in Hillside, we marched for justice. Fo...
4171    Black man killed in the street, in broad dayli...
2496    On behalf of the entire USA I would like to th...
4757    George Floyd was not the first and he won’t be...
4627    This is wrong and ignorant, it’s about the deh...
2989    This is the face of what you're supporting. Th...
360     The #WallofVets mission is not to tell people ...
1870    Peaceful protests in Los Angeles and across Am...
1673    Achraf Hakimi: I spoke with Sancho before the ...
2564    I support the BLM movement. But I can't believ...
2516    Now is

### **Similarity Heat Map**

In [None]:
# Number of highest-scoring tweets shown on each axis of the heat map.
TOP_N_SIMILAR = 25

# Compute the top-N positions once and reuse them for z, x and y. Previously
# z was a 25x25 block while the axis labels were taken from the top 100
# tweets, so the labels did not correspond to the plotted cells.
top_similar_idx = np.argsort(c_sim_sum)[-TOP_N_SIMILAR:]
# Rows of c_sim are positional, so the matching tweets are looked up by
# position (.iloc) rather than by index label.
top_similar_text = blm_tweets_text.iloc[top_similar_idx]

fig = go.Figure(data=go.Heatmap(
    z=c_sim[np.ix_(top_similar_idx, top_similar_idx)],
    x=top_similar_text,
    y=top_similar_text))
fig.write_html("sentences_scatter.html")
print('Plot Saved as sentences_scatter.html')

Plot Saved as sentences_scatter.html


# **Elementary Sentiment Analysis**

In [None]:
# Score each tweet's sentiment polarity with TextBlob, in tweet order.
polarities = []
for tweet_text in blm_tweets_text:
    polarities.append(TextBlob(tweet_text).sentiment.polarity)

# Summary statistics of the polarity scores.
pd.DataFrame({"pol": polarities}).describe()

Unnamed: 0,pol
count,4779.0
mean,0.042171
std,0.259704
min,-1.0
25%,-0.051852
50%,0.0
75%,0.143095
max,1.0


In [None]:
# Histogram of the per-tweet polarity scores, shown inline and also saved as
# a standalone HTML file.
df = pd.DataFrame({"pol": polarities})
tweet_total = len(blm_tweets_text)
fig = px.histogram(df, x="pol")
fig.update_layout(
    title_text=f'Sentiment Analysis Histogram : Number of Tweets = {tweet_total}'
)
fig.write_html("sentiment_histogram.html")
fig.show()