In [1]:
import pandas as pd

## Tweets

### Step 1 - drop unnecessary columns

In [2]:
# Load the raw tweets dump (gzip-compressed pickle).
# NOTE: unpickling can execute arbitrary code — only load files produced by this project.
df = pd.read_pickle('../datasets/tweets.pkl.gz')

In [14]:
# Lift pandas' display limits so wide frames are shown without truncated
# cell contents or hidden columns. These options are global for the session.
pd.set_option('display.max_colwidth', None)
pd.set_option('display.max_columns', None)

In [7]:
# Peek at a single raw record to see which columns are available.
df.head(1)

Unnamed: 0,id,conversation_id,created_at,date,time,timezone,user_id,username,name,place,tweet,language,mentions,urls,photos,replies_count,retweets_count,likes_count,hashtags,cashtags,link,retweet,quote_url,video,thumbnail,near,geo,source,user_rt_id,user_rt,retweet_id,reply_to,retweet_date,translate,trans_src,trans_dest
0,1313185264135811079,1313097792609488897,2020-10-05 20:32:19 CEST,2020-10-05,20:32:19,100,1116715380821692417,nowicka_danuta,Danuta Nowicka,,@CzarnekP Zdrowia!,pl,[],[],[],0,0,0,[],[],https://twitter.com/nowicka_danuta/status/1313185264135811079,False,,0,,,,,,,,"[{'screen_name': 'CzarnekP', 'name': 'Przemysław Czarnek', 'id': '834344635774464000'}]",,,,


In [8]:
# Keep only the columns the presentation needs: author, tweet id, permalink and text.
filtered = df.loc[:, ['username', 'id', 'link', 'tweet']]

In [9]:
# Persist the slimmed-down frame; Step 2 reads it back from this path.
filtered.to_pickle('../datasets/for_presentation/tweets_raw.pkl.gz')

### Step 2 - join with users/parties/coalitions

In [2]:
# Reload the four-column tweets frame written at the end of Step 1.
tweets = pd.read_pickle('../datasets/for_presentation/tweets_raw.pkl.gz')

In [3]:
# Account metadata; the first CSV column is used as the index.
users = pd.read_csv('../datasets/accounts_processed.csv', index_col=0)

In [4]:
# Reduce the accounts table to the join key plus party / coalition / role.
#
# The rename runs *before* the column selection so the cell is safe to re-run:
# on a second run `pozycja` no longer exists, `rename` silently ignores the
# missing key, and `role` is still selectable. (Selecting `pozycja` first
# raised KeyError as soon as the cell was executed twice.)
users = users.rename(columns={'pozycja': 'role'})[['username', 'party', 'coalition', 'role']]

# Normalise usernames to lower case for the merge with the tweets frame.
# `.str.lower()` leaves missing usernames as NaN, whereas `.apply(str.lower)`
# raises TypeError on the first NaN it meets.
users = users.assign(username=users['username'].str.lower())


In [9]:
# Attach party / coalition / role to each tweet via its author's username.
# Inner join: tweets whose username has no row in `users` are dropped.
# NOTE(review): `users.username` was lower-cased above but `tweets.username`
# is not normalised in this notebook — confirm it is already lower-case.
tweets_users = pd.merge(tweets, users, how='inner', on='username')

In [10]:
# Persist the joined frame; the sentiment stage reads it back from this path.
tweets_users.to_pickle('../datasets/for_presentation/tweets_with_party_coalition.pkl.gz')

### Step 3 - calculate sentiment

In [2]:
import fasttext

In [3]:
# Load the fastText model stored under trained_models/; it is used below to label tweets.
sentiment_model = fasttext.load_model('../trained_models/sentiment_model.bin')



In [4]:
# Cleaned tweet texts used as model input.
# NOTE(review): the file name suggests emojis were converted to text — confirm against the cleaning step.
clean_tweets = pd.read_pickle('../datasets/tweets_cleaned_emojied2text.pkl.gz')
# Tweets joined with party / coalition / role, as written at the end of Step 2.
tweets_users = pd.read_pickle('../datasets/for_presentation/tweets_with_party_coalition.pkl.gz')

In [5]:
# Keep the tweet id and its text, with the text lower-cased before prediction.
clean_tweets = clean_tweets[['id', 'tweet']].assign(
    tweet=lambda frame: frame['tweet'].map(str.lower)
)

In [6]:
# Plain Python list of texts, in frame order, to hand to the model's `predict`.
just_tweets = clean_tweets.loc[:, 'tweet'].to_list()

In [7]:
%%time

# Classify every tweet in one batch call. `[0]` keeps the first element of
# `predict`'s result — the per-tweet label lists that the next cell flattens.
# The recorded run took roughly a minute and a half of wall time.
predictions = sentiment_model.predict(just_tweets)[0]

CPU times: user 1min 32s, sys: 1.52 s, total: 1min 34s
Wall time: 1min 34s


In [13]:
# Flatten the per-tweet label lists into one flat list of label strings.
#
# The cell rebinds `predictions` to its own output, so it must be idempotent.
# A plain nested comprehension, when run a second time, iterates over the
# label *strings* and explodes each one into single characters. Entries that
# are already strings are therefore passed through unchanged.
predictions = [
    label
    for labels in predictions
    for label in ([labels] if isinstance(labels, str) else labels)
]

In [14]:
# Pair each tweet id with its predicted label (positional: `predictions` is
# in the same order as the rows of `clean_tweets`).
#
# `assign` builds a fresh frame instead of writing a column into
# `clean_tweets`, which is itself a column slice of the loaded frame; the
# in-place write is a chained assignment and triggers SettingWithCopyWarning.
# A length mismatch still raises ValueError, as before.
clean_tweets = clean_tweets[['id']].assign(sentiment=predictions)

In [15]:
# Add the sentiment label to the party/coalition frame, matching on tweet id.
# NOTE(review): with how='right' every row of `clean_tweets` is kept, so ids
# absent from `tweets_users` come through with NaN username/party/coalition —
# confirm that is intended rather than how='inner'.
tweets_users_sentiment = pd.merge(tweets_users, clean_tweets, how='right', on='id')

In [20]:
# Replace the raw `__label__*` strings with human-readable sentiment names.
#
# The replacement is restricted to the `sentiment` column: a frame-wide
# `replace` also scans tweet text, links and every other column (slow on a
# frame this size, and it would rewrite any cell that happens to equal a
# label). `assign` returns a new frame instead of mutating with inplace=True,
# so the cell can be re-run safely. Labels not listed here are left untouched.
tweets_users_sentiment = tweets_users_sentiment.assign(
    sentiment=tweets_users_sentiment['sentiment'].replace({
        '__label__positive': 'positive',
        '__label__negative': 'negative',
        '__label__ambiguous': 'ambiguous',
        '__label__neutral': 'neutral'
    })
)

In [21]:
# Class balance of the predicted sentiment labels.
tweets_users_sentiment['sentiment'].value_counts()

negative     551675
neutral      440461
positive     361306
ambiguous    137464
Name: sentiment, dtype: int64

In [22]:
# Persist the final frame (tweets + party / coalition / role + sentiment).
tweets_users_sentiment.to_pickle('../datasets/for_presentation/tweets_with_party_coalition_sentiment.pkl.gz')