# Applying sentiment analysis to your social network

## Step 1: Extract Data

### Twitter

In [None]:
import tweepy
from tweepy import OAuthHandler
import xlsxwriter

#import twitterCredentials

In [None]:
# Local module holding the four Twitter API credentials. The import near the
# top of the notebook is commented out, so it is brought in here; without it
# this cell stops with a NameError on the first line.
import twitterCredentials

api_key = twitterCredentials.api_key
api_secret = twitterCredentials.api_secret
access_token = twitterCredentials.access_token
access_secret = twitterCredentials.access_secret

# Build the authenticated client used by the download cell.
auth = OAuthHandler(api_key, api_secret)
auth.set_access_token(access_token, access_secret)
# wait_on_rate_limit=True makes tweepy pause when the rate limit is hit
# rather than raising.
api = tweepy.API(auth, wait_on_rate_limit=True)

In [None]:
# Workbook that will receive the downloaded tweets. The file name is all
# lower case so that it matches the 'test1.xlsx' path the loading cells read
# back; on a case-sensitive filesystem 'Test1.xlsx' would not be found.
book = xlsxwriter.Workbook('test1.xlsx')
sheet = book.add_worksheet('Data')

header = ["Date", "Day", "Tweet", "Retweets", "Likes"]
# The header occupies the first worksheet row, one title per column.
for col, title in enumerate(header):
    sheet.write(0, col, title)

# Data rows start right below the header.
row = 1
col = 0

In [None]:
name = "cnnespan"

# A single user_timeline request is capped well below 500 tweets, so asking
# for count=500 in one call silently returns fewer. Cursor pages through the
# timeline until 500 tweets were collected or the timeline runs out.
timeline = tweepy.Cursor(api.user_timeline, screen_name=name, count=200).items(500)

for tweet in timeline:
    print(tweet.text)
    created = tweet.created_at.date()
    # One worksheet row per tweet, in the same column order as the header.
    data = [
        created.strftime("%b %d %Y "),
        created.strftime("%b %d"),
        tweet.text,
        tweet.retweet_count,
        tweet.favorite_count,
    ]
    for col, value in enumerate(data):
        sheet.write(row, col, value)
    row += 1

# Nothing reaches the disk until the workbook is closed.
book.close()

### Facebook

In [None]:
from facebook_scraper import get_posts
from textblob import TextBlob

In [None]:
# Second workbook, for the Facebook posts: one sheet with a single "Post"
# column.
book = xlsxwriter.Workbook('test2.xlsx')
sheet = book.add_worksheet('djangogirlsData')

row, col = 0, 0
sheet.write(row, col, "Post")  # header cell

# The scraping loop starts writing on the row below the header.
row = 1

In [None]:
for post in get_posts('djangogirlsbogota', pages=4):
    text = post['text']
    # Posts can come back without text (presumably image-only posts), and the
    # language detector rejects very short strings. Skip both instead of
    # aborting the whole scrape with an exception.
    if not text or len(text.strip()) < 3:
        continue

    # NOTE(review): TextBlob.detect_language() relies on an online service and
    # is deprecated in recent TextBlob releases; confirm the installed
    # version still provides it.
    if TextBlob(text).detect_language() == 'es':
        print(text)
        sheet.write(row, 0, text)
        row += 1

# Nothing reaches the disk until the workbook is closed.
book.close()

## Step 2: Load Data

In [None]:
import pandas as pd

import nltk
from nltk.probability import FreqDist
from nltk.tokenize import word_tokenize

%matplotlib inline

In [None]:
# Handing pandas the path lets it open and close the file itself; the bare
# open() handle used before was never closed. read_excel already returns a
# DataFrame, so no extra wrapping is needed.
datos_excel = pd.read_excel('test1.xlsx')
datos_excel.head()

In [None]:
# Join the tweets with a space so that the last word of one tweet does not
# fuse with the first word of the next one, which would create bogus tokens.
# Empty spreadsheet cells are read as NaN, so they are dropped before joining.
Post = " ".join(datos_excel['Tweet'].dropna().astype(str))

# Plot the 20 most frequent raw tokens, before any cleaning.
freq = FreqDist(word_tokenize(Post))
freq.plot(20)

## Step 3: Clean Data

In [None]:
import emoji
import re

from nltk.corpus import stopwords
import string

In [None]:
# The stop-word corpus files are named in lower case ("spanish"); the
# capitalised spelling only resolves on case-insensitive filesystems.
stopwordList = stopwords.words("spanish")
stopwordList

In [None]:
# Every ASCII punctuation character as its own one-character list entry.
punctuationList = [simbolo for simbolo in string.punctuation]
punctuationList

In [None]:
# clean_tweets_regex() removes the accents from the tweets before they are
# tokenized, so an accented stop word such as "más" reaches the filter as
# "mas" and would never match. Add the accent-free spelling of every stop
# word alongside the original one.
_sin_tildes = str.maketrans("áäâàéêèíîìóôòúûùü", "aaaaeeeiiiooouuuu")
_stopwords_sin_tildes = [palabra.translate(_sin_tildes) for palabra in stopwordList]

stopwordsList = (
    stopwordList
    + [palabra for palabra in _stopwords_sin_tildes if palabra not in stopwordList]
    + punctuationList
    # Typographic symbols and tokenizer quote marks that are not part of
    # string.punctuation.
    + ['“','”','¡','¿',"''",'``','...','→','⬥']
)
stopwordsList

In [None]:
Post

In [None]:
def _es_emoji(caracter):
    """Return True when the single character *caracter* is an emoji.

    Uses ``emoji.is_emoji`` when the installed package offers it and otherwise
    falls back to the ``emoji.UNICODE_EMOJI`` table, accepting both a flat
    mapping and one nested under a language key.
    """
    if hasattr(emoji, "is_emoji"):
        return emoji.is_emoji(caracter)
    tabla = emoji.UNICODE_EMOJI
    # NOTE(review): some releases nest the table per language ('en', 'es', ...);
    # with that layout a plain `caracter in tabla` is always False.
    return caracter in tabla.get('en', tabla)


def clean_tweets_emoji(text):
    """Drop every whitespace-separated word of *text* that contains an emoji.

    Args:
        text: Text to clean (one tweet or many joined together).

    Returns:
        The remaining words joined by single spaces. As a side effect of the
        split/join, every run of whitespace (line breaks included) collapses
        to one space.
    """
    palabras_sin_emoji = [
        palabra
        for palabra in text.split()
        if not any(_es_emoji(caracter) for caracter in palabra)
    ]
    return ' '.join(palabras_sin_emoji)

In [None]:
tweets_clean_emoji = clean_tweets_emoji(Post)
tweets_clean_emoji

In [None]:
def clean_tweets_regex(text):
    """Strip Twitter noise from *text* and fold accented vowels to ASCII.

    Applied in this order: @mentions, #hashtags, http(s) links, www links,
    words cut off by a trailing ellipsis character, stray ellipses, line
    breaks, accents on lower-case vowels, and the retweet marker "RT ".

    Args:
        text: Raw text (one tweet or many joined together).

    Returns:
        The cleaned string. Whitespace left behind by removed pieces is not
        collapsed.
    """
    # '-' sits at the end of each class so it is unambiguously a literal.
    tweet = re.sub(r'@[a-zA-Z0-9_.-]+', '', text)
    # \w keeps accented letters inside the match, so "#Bogotá" is removed
    # whole instead of leaving a dangling "á".
    tweet = re.sub(r'#[\w.-]+', '', tweet)
    tweet = re.sub(r'https?://[a-zA-Z0-9_./-]+', '', tweet)
    # The dot is escaped: an unescaped one would match any "www" + character.
    tweet = re.sub(r'www\.[a-zA-Z0-9_./-]+', '', tweet)
    # Words truncated by Twitter end in the single character "…".
    tweet = re.sub(r'[\w./-]+…', '', tweet)
    tweet = re.sub(r' +…', '', tweet)
    # Replace line breaks with a space so the words around them stay apart.
    tweet = re.sub(r'[\n\r]', ' ', tweet)
    # Fold accents with a translation table. The earlier character classes
    # contained '|', which also turned every literal pipe into a vowel.
    tweet = tweet.translate(
        str.maketrans("áäâàéêèíîìóôòúûùü", "aaaaeeeiiiooouuuu")
    )
    # \b keeps words that merely end in "RT" (e.g. "CONCERT ") intact.
    tweet = re.sub(r'\bRT ', '', tweet)
    return tweet

In [None]:
tweets_clean = clean_tweets_regex(tweets_clean_emoji)
tweets_clean

In [None]:
# The stop-word entries are lower case, so each token is compared in lower
# case; otherwise capitalised stop words such as "El" or "La" slip through
# and dominate the frequency plot. The set is built once for fast lookups.
_stopwords_lookup = set(stopwordsList)
filtro = [
    palabra
    for palabra in word_tokenize(tweets_clean)
    if palabra.lower() not in _stopwords_lookup
]
filtro

In [None]:
freq = FreqDist(filtro)
freq.plot(20)

## Step 4: Process Data

### Tweets en Español

In [None]:
from nltk.classify import NaiveBayesClassifier

In [None]:
# Handing pandas the path lets it open and close the file itself; the bare
# open() handle used before was never closed.
datos_excel = pd.read_excel('taggedTweets.xlsx')
# Sampling as many rows as the frame has (without replacement, the default)
# simply shuffles the rows.
sample = datos_excel.sample(len(datos_excel))
sample

In [None]:
def construirBolsaDePalabras(palabras):
    """Build the bag-of-words feature dict for one piece of text.

    Args:
        palabras: Text to tokenize; the callers pass text that already went
            through the emoji and regex cleaners.

    Returns:
        dict mapping every token that is not in ``stopwordsList`` to 1. The
        stop-word test ignores case so that capitalised stop words ("El",
        "La") are filtered as well; the kept tokens retain their own case.
    """
    return {
        palabra: 1
        for palabra in word_tokenize(palabras)
        if palabra.lower() not in stopwordsList
    }

In [None]:
# One list of (features, label) pairs per sentiment class.
rasgosNegativos, rasgosPositivos, rasgosNeutrales = [], [], []

# 'pos' and 'neu' have their own list; any other label lands in the negatives.
destinos = {'pos': rasgosPositivos, 'neu': rasgosNeutrales}

for _, fila in sample.iterrows():
    etiqueta = fila['Sentimiento']
    # Same two-step cleaning as the exploratory cells: emoji first, regex next.
    texto_limpio = clean_tweets_regex(clean_tweets_emoji(fila['Tweet']))
    rasgos = (construirBolsaDePalabras(texto_limpio), etiqueta)
    destinos.get(etiqueta, rasgosNegativos).append(rasgos)

In [None]:
len(rasgosPositivos)

In [None]:
rasgosPositivos[5]

In [None]:
# Cut point at 95 % of each class's feature list.
divP=int(len(rasgosPositivos)*0.95)
divNeu=int(len(rasgosNeutrales)*0.95)
divNeg=int(len(rasgosNegativos)*0.95)
# Train a Naive Bayes classifier on the head of each class list.
# NOTE(review): all three slices are cut at divNeg; divP and divNeu are never
# used. This may be deliberate undersampling to the size of the negative
# class (the held-out accuracy cell also slices up to len(rasgosNegativos)),
# but it only balances the classes if the negatives are the smallest class.
# Confirm the intent before changing it.
clasificadorSentimiento=NaiveBayesClassifier.train(rasgosPositivos[:divNeg]+rasgosNegativos[:divNeg]+rasgosNeutrales[:divNeg])

In [None]:
# Accuracy on exactly the slices the classifier was trained on, so this is an
# optimistic figure rather than a generalisation estimate.
nltk.classify.util.accuracy(clasificadorSentimiento,rasgosPositivos[:divNeg]+rasgosNegativos[:divNeg]+rasgosNeutrales[:divNeg])

In [None]:
# Accuracy on items not used for training: from each class, the entries from
# divNeg up to the length of the negative list (the whole tail for negatives).
# NOTE(review): positives/neutrals beyond len(rasgosNegativos) are never used
# for training or testing — confirm this is intended.
nltk.classify.util.accuracy(clasificadorSentimiento,rasgosPositivos[divNeg:len(rasgosNegativos)]+rasgosNegativos[divNeg:]+rasgosNeutrales[divNeg:len(rasgosNegativos)])

In [None]:
clasificadorSentimiento.show_most_informative_features()

In [None]:
# Hand-picked example used to try the classifier on a single tweet.
tweet = "¿El peor solo de guitarra de la historia? Burlas a Nick Jonas por su interpretación"
print(tweet)

# Run it through the same pipeline as the training data: emoji cleaner, regex
# cleaner, then bag of words.
tweet_clean = clean_tweets_regex(clean_tweets_emoji(tweet))
bolsa = construirBolsaDePalabras(tweet_clean)
print(bolsa)

In [None]:
clasificadorSentimiento.classify(bolsa)

In [None]:
# Handing pandas the path lets it open and close the file itself; the bare
# open() handle used before was never closed.
datos_excel = pd.read_excel('test1.xlsx')
# Sampling as many rows as the frame has (without replacement, the default)
# simply shuffles the rows.
muestra = datos_excel.sample(len(datos_excel))
muestra

In [None]:
# Three independent lists that collect the tweets predicted for each class.
positive, neutral, negative = [], [], []

In [None]:
libro = xlsxwriter.Workbook('test1Tag.xlsx')
hoja = libro.add_worksheet('Data')

# Header row: the tweet text and the predicted label.
data = ["Tweet","Sentimiento"]
for col, titulo in enumerate(data):
    hoja.write(0, col, titulo)
row = 1
col = 0

# 'pos' and 'neu' have their own list; any other label lands in the negatives.
_destinos = {'pos': positive, 'neu': neutral}

for indice_fila, fila in muestra.iterrows():
    # Read the text by column label. fila[2] was positional access on a
    # label-indexed Series, which pandas has deprecated.
    texto = fila['Tweet']
    print(texto)
    tweet_clean = clean_tweets_emoji(texto)
    tweet_clean = clean_tweets_regex(tweet_clean)
    bolsa = construirBolsaDePalabras(tweet_clean)
    sent = clasificadorSentimiento.classify(bolsa)
    # ANSI escape codes: print the label in red on a black background.
    print("\033[0;31;40m "+sent+" \033[0m")
    # The row is derived from the DataFrame index label, not from the loop
    # position, so the sheet is not written in the shuffled order of
    # `muestra` (assumes the default 0..n-1 index from read_excel).
    hoja.write(indice_fila+row, 0, texto)
    hoja.write(indice_fila+row, 1, sent)
    _destinos.get(sent, negative).append(texto)

# Nothing reaches the disk until the workbook is closed.
libro.close()

### Tweets in English

In [None]:
from textblob import TextBlob 

In [None]:
# Handing pandas the path lets it open and close the file itself; the bare
# open() handle used before was never closed. read_excel already returns a
# DataFrame, so no extra wrapping is needed.
excel_data = pd.read_excel('test2.xlsx')

In [None]:
# Parallel lists: the polarity of each post and its 1-based position, used
# later as the y and x values of the scatter plot.
popularity_list = []
num_list = []

for posicion, texto in enumerate(excel_data['Post'], start=1):
    print(texto)

    sentimiento = TextBlob(texto).sentiment
    print(sentimiento)

    popularity_list.append(sentimiento.polarity)
    num_list.append(posicion)

## Step 5: Show Data

### Texto en Español

In [None]:
%matplotlib inline
import matplotlib.pyplot as plt

In [None]:
# Number of tweets the classifier assigned to each class, in display order.
conteos = {
    'Positive': len(positive),
    'Neutral': len(neutral),
    'Negative': len(negative),
}

fig = plt.figure(u'Gráfica de barras')
ax = fig.add_subplot(111)

# One centred bar per class, labelled with the class name.
posiciones = range(len(conteos))
ax.bar(posiciones, list(conteos.values()), width=0.8, align='center')
ax.set_xticks(posiciones)
ax.set_xticklabels(list(conteos.keys()))

plt.show()

### Text in English

In [None]:
# Polarity of every post against its position in the spreadsheet.
fig, ax = plt.subplots(figsize=(20, 10))
ax.scatter(num_list, popularity_list)
# (an earlier draft used excel_data['Day'] for the x axis instead of num_list)
ax.set_title("Sentiments analysis to PyConCo2020")
ax.set_xlabel("Posts")
ax.set_ylabel("Sentiment")
plt.show()

### Bonus

In [None]:
from wordcloud import WordCloud

In [None]:
# WordCloud falls back to its built-in English stop-word list when none is
# given, which leaves Spanish function words ("de", "que", "la") as the
# biggest words in the cloud. Pass the notebook's own stop words instead.
wordcloud = WordCloud(
    background_color="white",
    colormap="Dark2",
    max_font_size=150,
    random_state=42,  # fixed seed so the layout is reproducible
    stopwords=set(stopwordsList),
).generate(tweets_clean)

plt.imshow(wordcloud, interpolation='bilinear')
plt.axis("off")
plt.title("WWCode DataPY2019")
plt.show()