In [1]:
import findspark
findspark.init()
import pyspark as ps
import pickle
import pyodbc

In [2]:
# Create (or reuse) a Spark session running locally with one worker thread.
spark = (
    ps.sql.SparkSession.builder
    .master('local[1]')
    .appName('Sentimental-bank-processing')
    .getOrCreate()
)
# SparkContext handle used for the RDD API (sc.parallelize) further down.
sc = spark.sparkContext

In [3]:
# File that holds the pickled tweet dictionary opened below.
# NOTE(review): absolute, machine-specific path; the notebook cannot run on
# another machine without editing this line.
path = r'C:\Users\Breno\Documents\ComputerScience\sentimental_bank\arquivo'

In [4]:
# Deserialize the tweet dictionary from `path`.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file;
# only load files that come from a trusted source.
with open(path, 'rb') as arq:
    tw_dict = pickle.load(arq)

In [5]:
# Text before the first comma of the fourth location entry.
# partition() decodes the bytes only once and returns the whole string when
# the entry has no comma, where str.index() would raise ValueError.
tw_dict['location'][3].decode('utf-8').partition(',')[0]

'Rio de Janeiro'

In [6]:
# Show which keys the loaded dictionary provides.
tw_dict.keys()

dict_keys(['date', 'text', 'number_followers', 'number_friends', 'location'])

In [7]:
class Find_Metrics:
    """Aggregate statistics over a dictionary of tweet columns using Spark RDDs.

    The constructor copies the 'number_followers', 'date', 'number_friends'
    and 'location' entries of the given dictionary into lists. Every metric
    method distributes one of those lists through the module-level
    SparkContext ``sc`` and runs a single RDD action on it.
    """

    def __init__(self, dict):
        """Store the source dictionary and a list copy of each column used.

        Args:
            dict: mapping that provides iterables under the keys
                'number_followers', 'date', 'number_friends' and 'location'.
                The name shadows the built-in ``dict``; it is kept so that
                existing callers are unaffected.

        Raises:
            KeyError: if one of the four keys is missing.
        """
        self.dict = dict
        # Read from the argument rather than the notebook-global ``tw_dict``,
        # so an instance always reflects the data it was constructed with.
        self.number_followers = list(dict['number_followers'])
        self.date = list(dict['date'])
        self.number_friends = list(dict['number_friends'])
        self.location = list(dict['location'])

    @staticmethod
    def _as_int_rdd(values):
        """Return an RDD of ``values`` with every element converted by ``int``."""
        return sc.parallelize(values).map(int)

    def total_number_followers(self):
        """Return the sum of all follower counts."""
        return self._as_int_rdd(self.number_followers).sum()

    def followers_mean(self):
        """Return the arithmetic mean of the follower counts."""
        return self._as_int_rdd(self.number_followers).mean()

    def followers_stdev(self):
        """Return the standard deviation of the follower counts (``RDD.stdev``)."""
        return self._as_int_rdd(self.number_followers).stdev()

    def total_number_friends(self):
        """Return the sum of all friend counts."""
        return self._as_int_rdd(self.number_friends).sum()

    def friends_mean(self):
        """Return the arithmetic mean of the friend counts."""
        return self._as_int_rdd(self.number_friends).mean()

    def friends_stdev(self):
        """Return the standard deviation of the friend counts (``RDD.stdev``)."""
        return self._as_int_rdd(self.number_friends).stdev()

    def day_of_year(self):
        """Return a list with the first 10 characters of the first date entry.

        Entries are decoded as UTF-8 before slicing. The value is whatever
        ``take(1)`` yields: a list with at most one string, empty when there
        are no dates.
        """
        return (
            sc.parallelize(self.date)
            .map(lambda raw: raw.decode('utf-8')[0:10])
            .take(1)
        )

    def city(self):
        """Return all non-empty locations, UTF-8 decoded and lower-cased."""
        return (
            sc.parallelize(self.location)
            .map(lambda raw: raw.decode('utf-8').lower())
            .filter(lambda text: text != '')
            .collect()
        )

In [8]:
# Build the metrics helper from the loaded tweet dictionary.
metrics =  Find_Metrics(tw_dict)

# Each call below distributes one column through Spark and runs one action on it.
total_followers = metrics.total_number_followers()
followers_mean = metrics.followers_mean()
followers_stdev = metrics.followers_stdev()
total_friends = metrics.total_number_friends()
friends_mean = metrics.friends_mean()
friends_stdev = metrics.friends_stdev()
# day is a list (the result of take(1)), not a bare string.
day = metrics.day_of_year()

In [9]:
# Inspect the date value; it is a list because day_of_year() returns take(1).
day

['2021-10-16']

In [10]:
# NOTE(review): datetime is not used in the cells visible here.
from datetime import datetime

# One-row DataFrame: the six metrics as floats, followed by the query date
# as a string.
# NOTE(review): 'Followera_total' and 'Friendsd_total' look like typos for
# 'Followers_total' / 'Friends_total'. They are kept as-is because renaming
# a column changes the schema; confirm before changing them.
df = sc.parallelize([
    tuple(float(metric) for metric in (
        total_followers,
        followers_mean,
        followers_stdev,
        total_friends,
        friends_mean,
        friends_stdev,
    )) + (str(day[0]),)
]).toDF([
    'Followera_total',
    'Followers_mean',
    'Followers_stdev',
    'Friendsd_total',
    'Friends_mean',
    'Friends_stdev',
    'Time_Of_Query',
])

df.show()

+---------------+-----------------+-----------------+--------------+------------+------------------+-------------+
|Followera_total|   Followers_mean|  Followers_stdev|Friendsd_total|Friends_mean|     Friends_stdev|Time_Of_Query|
+---------------+-----------------+-----------------+--------------+------------+------------------+-------------+
|    2.7507142E7|275071.4199999999|1830738.280370431|      109853.0|     1098.53|3853.7817386432253|   2021-10-16|
+---------------+-----------------+-----------------+--------------+------------+------------------+-------------+



In [None]:
# Open an ODBC connection to the SQLEXPRESS instance on DESKTOP-HL3J42P.
# Trusted_Connection=yes means no password appears in the notebook.
# The Server segment is a raw string: '\S' is not a valid escape sequence, so
# the non-raw form triggers an invalid-escape warning on current Python
# versions. The resulting connection string is unchanged.
conn = pyodbc.connect('Driver={SQL Server};'
                      r'Server=DESKTOP-HL3J42P\SQLEXPRESS;'
                      'Database=dbSentimental_Bank;'
                      'Trusted_Connection=yes;')

# NOTE(review): conn stays open after this cell; call conn.close() when done.
cursor = conn.cursor()
cursor.execute('SELECT * FROM tb_Stats_Science')

# Display every row returned by the query.
cursor.fetchall()