**Instruções iniciais**

*   Abra o link dos dados:
    * https://tinyurl.com/bd-office
*   Clique em "Adicionar atalho ao Drive"


# Solução

In [1]:
# Mount Google Drive so files stored there are reachable under /content/drive.
from google.colab import drive
drive.mount("/content/drive")

Mounted at /content/drive


In [2]:
# Install NLTK (its sentiment module is imported further down) and twython.
!pip install nltk twython

Collecting twython
  Downloading twython-3.9.1-py3-none-any.whl (33 kB)
Installing collected packages: twython
Successfully installed twython-3.9.1


In [3]:
# Download the VADER lexicon into the local NLTK data directory.
!python -m nltk.downloader vader_lexicon

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [4]:
!apt-get update  > /dev/null
!apt-get install openjdk-8-jdk-headless -qq > /dev/null
!wget -q https://archive.apache.org/dist/spark/spark-3.5.1/spark-3.5.1-bin-hadoop3.tgz
!tar xf spark-3.5.1-bin-hadoop3.tgz
!pip install findspark pyspark

Collecting findspark
  Downloading findspark-2.0.1-py2.py3-none-any.whl (4.4 kB)
Collecting pyspark
  Downloading pyspark-3.5.1.tar.gz (317.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m317.0/317.0 MB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25h  Preparing metadata (setup.py) ... [?25l[?25hdone
Building wheels for collected packages: pyspark
  Building wheel for pyspark (setup.py) ... [?25l[?25hdone
  Created wheel for pyspark: filename=pyspark-3.5.1-py2.py3-none-any.whl size=317488491 sha256=fa5fe0bf4d39fa8f651ce9793cf4206622bdc49a45f641a6dd9cfd286292172e
  Stored in directory: /root/.cache/pip/wheels/80/1d/60/2c256ed38dddce2fdd93be545214a63e02fbd8d74fb0b7f3a6
Successfully built pyspark
Installing collected packages: findspark, pyspark
Successfully installed findspark-2.0.1 pyspark-3.5.1


In [None]:
%env PYTHONHASHSEED=1234
%env JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
%env SPARK_HOME=/content/spark-3.5.1-bin-hadoop3.tgz

env: PYTHONHASHSEED=1234
env: JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64
env: SPARK_HOME=/content/spark-3.5.1-bin-hadoop3.tgz


In [5]:
# Point findspark at the extracted Spark distribution before importing pyspark.
import findspark
findspark.init("/content/spark-3.5.1-bin-hadoop3")

In [6]:
from pyspark.sql import SparkSession
from pyspark.sql import Row

from datetime import datetime

# Application name and master URL handed to the session builder.
appName = 'Big Data'
master = 'local[*]'

# Build (or reuse) the session; the chain is parenthesised instead of using
# backslash line continuations.
spark = (
    SparkSession.builder
    .master(master)
    .appName(appName)
    .getOrCreate()
)

# Set the Spark log level to WARN.
spark.sparkContext.setLogLevel("WARN")

In [7]:
# Read the transcript CSV from Drive as an RDD of raw text lines.
tmp_path = "/content/drive/MyDrive/Colab Notebooks/mineracao_dados_complexos/mdc/07_big_data/test01/the-office-noquotes.csv"
input_data = spark.sparkContext.textFile(tmp_path)

In [8]:
# Peek at the first lines; the first element is the CSV header row.
input_data.take(10)

['id,season,episode,scene,line_text,speaker,deleted',
 '1,1,1,1,All right Jim. Your quarterlies look very good. How are things at the library?,Michael,False',
 "2,1,1,1,Oh I told you. I couldn't close it. So...,Jim,False",
 "3,1,1,1,So you've come to the master for guidance? Is this what you're saying grasshopper?,Michael,False",
 '4,1,1,1,Actually you called me in here but yeah.,Jim,False',
 "5,1,1,1,All right. Well let me show you how it's done.,Michael,False",
 "6,1,1,2,Yes I'd like to speak to your office manager please. Yes hello. This is Michael Scott. I am the Regional Manager of Dunder Mifflin Paper Products. Just wanted to talk to you manager-a-manger.  All right. Done deal. Thank you very much sir. You're a gentleman and a scholar. Oh I'm sorry. OK. I'm sorry. My mistake.  That was a woman I was talking to so... She had a very low voice. Probably a smoker so...  So that's the way it's done.,Michael,False",
 "7,1,1,3,I've uh I've been at Dunder Mifflin for 12 years the last fo

In [9]:
# Sanity check: splitting a sample row on "," gives the seven expected fields.
'1,1,1,1,All right Jim. Your quarterlies look very good. How are things at the library?,Michael,False'.split(",")

['1',
 '1',
 '1',
 '1',
 'All right Jim. Your quarterlies look very good. How are things at the library?',
 'Michael',
 'False']

In [11]:
from nltk.sentiment import SentimentIntensityAnalyzer

# Analyzer instance used by line_sentiment to score each line of dialogue.
sentiment = SentimentIntensityAnalyzer()

# Lower-cased names of the speakers whose lines are scored.
characters = {
    "michael", "jim", "pam", "dwight",
    "kevin", "erin", "stanley", "andy", "angela", "toby",
}

characters

{'andy',
 'angela',
 'dwight',
 'erin',
 'jim',
 'kevin',
 'michael',
 'pam',
 'stanley',
 'toby'}

In [12]:
# Confirm that the speaker collection is a set.
type(characters)

set

In [13]:
# Membership test against the speaker set.
"michael" in characters

True

In [14]:
# Compute the sentiment of each line of dialogue.
def line_sentiment(line):
    """Yield the sentiment record for one CSV line of the transcript.

    Args:
        line: A raw text line with the seven comma-separated fields
            id, season, episode, scene, line_text, speaker, deleted.

    Yields:
        ``(speaker, (compound, 1))`` where ``speaker`` is lower-cased and
        ``compound`` is the "compound" polarity score of ``line_text``.
        Nothing is yielded when the line does not have exactly seven fields
        or when the speaker is not in ``characters``.
    """
    fields = line.split(",")

    # Skip malformed rows explicitly. The previous bare ``except: pass`` also
    # hid genuine failures (missing lexicon, undefined names), which silently
    # produced an empty result instead of an error.
    if len(fields) != 7:
        return

    line_text = fields[4]
    speaker = fields[5].lower()

    if speaker in characters:
        spolarity = sentiment.polarity_scores(line_text)
        # The trailing 1 is a per-line count so a mean can be computed after
        # the per-speaker reduction.
        yield (speaker, (spolarity["compound"], 1))


In [15]:
# Map every input line to zero or one (speaker, (compound, 1)) record.
s = input_data.flatMap(line_sentiment)

In [16]:
# Inspect the first few scored records.
s.take(10)

[('michael', (0.4927, 1)),
 ('jim', (0.0, 1)),
 ('michael', (0.0, 1)),
 ('jim', (0.4215, 1)),
 ('michael', (0.2732, 1)),
 ('michael', (0.8496, 1)),
 ('michael', (0.2225, 1)),
 ('pam', (0.2732, 1)),
 ('michael', (0.4588, 1)),
 ('pam', (0.0, 1))]

In [17]:
# Reduce step: accumulate the sentiment of each character.
def accumulate_polarity(acc, value):
    """Combine two (polarity_sum, count) pairs element-wise.

    Both arguments are indexable pairs; the result is a tuple holding the sum
    of their first items and the sum of their second items.
    """
    return (acc[0] + value[0], acc[1] + value[1])

In [18]:
# Sum polarity scores and line counts per speaker.
sum_polarity = s.reduceByKey(accumulate_polarity)

In [19]:
# Inspect the per-speaker (polarity_sum, count) totals.
sum_polarity.take(10)

[('michael', (1698.1268000000057, 11525)),
 ('pam', (661.6148000000007, 5180)),
 ('stanley', (16.981700000000004, 723)),
 ('kevin', (155.54509999999993, 1653)),
 ('toby', (82.92880000000002, 884)),
 ('andy', (541.9979999999989, 3946)),
 ('jim', (888.028599999999, 6572)),
 ('dwight', (595.5401000000004, 7111)),
 ('angela', (83.32569999999998, 1640)),
 ('erin', (167.85999999999996, 1459))]

In [20]:
# Compute the mean sentiment from an accumulated (sum, count) pair.
def calculate_average(value):
    """Return the mean polarity rounded to two decimal places.

    ``value`` is an indexable pair: the total polarity at index 0 and the
    number of lines at index 1.
    """
    total = value[0]
    count = value[1]
    mean = total / count
    return round(mean, 2)

In [21]:
# Replace each (sum, count) pair with its rounded mean, keeping the speaker key.
mean_polarity = sum_polarity.mapValues(calculate_average)

In [23]:
# Collect the per-speaker means for display.
mean_polarity.collect()

[('michael', 0.15),
 ('pam', 0.13),
 ('stanley', 0.02),
 ('kevin', 0.09),
 ('toby', 0.09),
 ('andy', 0.14),
 ('jim', 0.14),
 ('dwight', 0.08),
 ('angela', 0.05),
 ('erin', 0.12)]

In [24]:
# Persist the per-speaker means as Spark text output on Drive.
# NOTE(review): Spark normally refuses to write into an output directory that
# already exists, so re-running this cell may fail — confirm and clear the
# directory first if needed.
mean_polarity.saveAsTextFile("/content/drive/MyDrive/Colab Notebooks/mineracao_dados_complexos/mdc/07_big_data/test01/output")

In [35]:
# Convert the RDD to a pandas DataFrame and give the positional columns
# descriptive names in a single chained expression.
mean_polarity_df = (
    mean_polarity
    .toDF()
    .toPandas()
    .rename(columns={"_1": "character", "_2": "compound_polarity"})
)

In [36]:
# Display the resulting table.
mean_polarity_df

Unnamed: 0,character,compound_polarity
0,michael,0.15
1,pam,0.13
2,stanley,0.02
3,kevin,0.09
4,toby,0.09
5,andy,0.14
6,jim,0.14
7,dwight,0.08
8,angela,0.05
9,erin,0.12


In [37]:
# Export the table as CSV (without the index) into the same "output" folder
# used by saveAsTextFile; note the file is CSV despite the .txt extension.
mean_polarity_df.to_csv("/content/drive/MyDrive/Colab Notebooks/mineracao_dados_complexos/mdc/07_big_data/test01/output/output.txt", index=False)