In [1]:
"""Real-time analysis project - tweeter sentiment analysis""";

In [2]:
import requests
import json

In [76]:
# =============================================================
# CONFIGURATION
# =============================================================
with open("config.json", "r", encoding='utf-8') as conf:
    config = json.load(conf)

In [4]:
# =============================================================
# TWEET SCRAPING
# =============================================================

In [5]:
class ApiConnector:
    """Object providing methods for tweeter data scraping based on hashtag list provided by user"""
    
    url_base = "https://api.twitter.com/2/tweets/search/recent?query={}&max_results={}&tweet.fields=created_at"
    
    def __init__(self, hashtags: list, max_results: int, bearer_token: str):
        self.hashtags = hashtags
        self.max_results = max_results
        self.headers = {"Authorization": f"Bearer {bearer_token}"}
        
    @property
    def query(self) -> str:
        
        _query_list = ["%23" + self.hashtags[0]]
        
        for tag in self.hashtags[1:]:
            _query_list.append("%20OR%20%23" + tag)
            
        _query = "".join(_query_list)
        return _query
        
    def get_hashtags(self) -> list:
        return self.hashtags
    
    def set_hashtags(self, hashtags: list) -> None:
        self.hashtags = hashtags
        
    def get_max_results(self) -> int:
        return self.max_results
        
    def set_max_results(max_results: int) -> None:
        self.max_results = max_results
        
    @property
    def api_url(self) -> str:
        return self.url_base.format(self.query, self.max_results)
    
    def get_tweets(self) -> list:
        """Returns a list containing scraped tweets"""
        session = requests.Session()
        response = session.get(self.api_url, headers=self.headers)
        session.close()
        
        tweets = response.json()["data"]
        return tweets

In [6]:
_HASHTAGS = ["polskilad", "polskiwal", "nowylad", "nowywal", "drozyznapis"]
_MAX_RESULTS = 10

In [7]:
conn = ApiConnector(_HASHTAGS,
                   _MAX_RESULTS,
                   config["bearer_token"])


In [8]:
tweets = conn.get_tweets()

In [None]:
# =============================================================
# SENTIMENT ANALYSIS
# =============================================================

In [8]:
import re
from textblob import TextBlob
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from pyspark import SparkConf,SparkContext
from pyspark.sql import SparkSession

In [16]:
# create spark configuration
conf = SparkConf()
conf.setAppName("TwitterAnalysisApp")
# create spark context with the above configuration
sc = SparkContext(conf=conf)
sc.setLogLevel("ERROR")
# create spark session
spark = SparkSession(sc)

In [51]:
"""Define transformations"""

def get_score(text: str) -> dict:
    return SentimentIntensityAnalyzer().polarity_scores(text)


def clean_text(text) -> str:

        text=re.sub(r'@[A-Za-z0-9!#$%^&*_]+', '', text)

        text=re.sub(r'#', '', text)

        text=re.sub(r'RT[\s]+', '', text)

        text=re.sub(r'https?:\/\/\S+', '', text)

        text=re.sub(r'\n', '', text) 

        if text[0] == ":":
            text = text[1:]

        return"".join([i if ord(i) < 128 else "" for i in text])

    
def leave_char(letter):
    return str.isalpha(letter) or letter == " "


def prepare_text(text):
    text_cleaned = clean_text(text)
    text_raw = ''.join(filter(leave_char, text_cleaned))
    return str(TextBlob(text_raw).translate(from_lang = 'pl', to = 'eng'))    


def count_hashtags(text: str) -> dict:
    res = {}
    for word in text.split(" "):
        if "#" in word:
            if word.strip() not in res:
                res[word.strip()] = 1
            else:
                res[word.strip()] += 1
    return res

def score_raw_text(raw_text: str) -> dict:
    return get_score(prepare_text(raw_text))

In [49]:
# load tweets to spark dataframe
sDf = spark.createDataFrame(tweets, ["id", "timestamp", "text"])

In [58]:
"""Transform data with spark rdd.map"""
rdd = sDf.rdd
rdd2 = rdd.map(lambda x: (x["id"], x["timestamp"], score_raw_text(x["text"]), count_hashtags(x["text"])))
final_sDf = rdd2.toDF(["timestamp", "id", "score", "hashtags"])
final_sDf.show()

+--------------------+-------------------+--------------------+--------------------+
|           timestamp|                 id|               score|            hashtags|
+--------------------+-------------------+--------------------+--------------------+
|2022-01-17T11:00:...|1483031548039413762|{neg -> 0.176, po...|{#Pols… -> 1, #Wa...|
|2022-01-17T11:00:...|1483031513545515010|{neg -> 0.0, pos ...|{#projektdomu -> ...|
|2022-01-17T11:00:...|1483031347807625219|{neg -> 0.266, po...|    {#NowyWał. -> 1}|
|2022-01-17T11:00:...|1483031343311298564|{neg -> 0.064, po...|{leków.

#NowyWał...|
|2022-01-17T10:59:...|1483031166617890817|{neg -> 0.0, pos ...|     {#NowyWał -> 1}|
|2022-01-17T10:59:...|1483031140357267461|{neg -> 0.0, pos ...|{#KAS -> 1, #Pols...|
|2022-01-17T10:59:...|1483031133856088071|{neg -> 0.0, pos ...|     {#NowyWał -> 1}|
|2022-01-17T10:59:...|1483031112318427136|{neg -> 0.266, po...|    {#NowyWał. -> 1}|
|2022-01-17T10:59:...|1483031091971821569|{neg -> 0.0, pos ...|{#

In [None]:
# =============================================================
# DATABASE
# =============================================================

In [59]:
"""Prepare data frame for sending"""
pDf = final_sDf.toPandas()
pDf = pd.concat([pDf, pd.json_normalize(pDf["score"])], axis=1)

# create table for score
score_df = pDf.loc[:, ["id", "timestamp", "neg", "neu", "pos", "compound"]]
#df_score.rename(columns={"id": "ID", "created_at": "TIMESTAMP", "neg": "NEG", 
#                         "neu": "NEU", "pos": "POS", "compound": "COMPOUND"}, inplace=True)

# create table for hashtags
hashtag_df = pd.DataFrame([[i, k, v] for i, d in pDf[['id', 'hashtags']].values for k, v in d.items()],
columns=['id','hashtags', 'value'])
hashtag_df.drop("value", axis=1, inplace=True)

In [72]:
"""Connect to database"""
from snowflake.sqlalchemy import URL
from sqlalchemy import create_engine

In [78]:
config = config["database"]

In [79]:
engine = create_engine(URL(
    user=config["user"],
    password=config["password"],
    account=config["account"],
    warehouse=config["warehouse"],
    database=config["database"],
    schema =config["schema"]
))
 
db_connection = engine.connect()

In [80]:
# save score to database
score_df.to_sql("analiza_sentymentu", db_connection, if_exists='append', index=False)

# save hashtags to database