## Import libraries

In [1]:
import importlib
import aux_func
importlib.reload(aux_func)

from tqdm import tqdm
tqdm.pandas()
import pandas as pd
import numpy as np
import time
import re
import morfeusz2
from collections import Counter
import networkx as nx
import matplotlib.pyplot as plt
import matplotlib as mpl
import seaborn as sns
import matplotlib.ticker as ticker
from pylab import cm
from itertools import product, combinations
import matplotlib.font_manager as fm
from aux_func import to_1D, text_to_word_list, rem_stopwords_tokenize, lemmatize_all, find_optimal_clusters, plot_tsne_pca, get_top_keywords

from scipy.stats import bernoulli, binom, norm, binom_test
from unidecode import unidecode
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.cluster import MiniBatchKMeans
from sklearn.decomposition import PCA
import plotly.express as px
import plotly
from sentimentpl.models import SentimentPLModel
plotly.offline.init_notebook_mode (connected = True)
mpl.rcParams['font.family'] = 'Avenir'
plt.rcParams['font.size'] = 18
plt.rcParams['axes.linewidth'] = 2
colors = cm.get_cmap('tab10', 2)

## Load data

In [2]:
speech_data = pd.read_csv("../data/speech_data.csv", sep=";")
speech_data["date"] = pd.to_datetime(speech_data["date"])
speech_data["last_name"] = speech_data["speaker"].apply(lambda x: x.split(" ")[0])
speech_data.to_csv("../data/speech_data_org.csv")

deputies = pd.read_csv("../data/deputies.csv", sep=";", names=["Name", "Party"])
deputies["last_name"] = deputies["Name"].apply(lambda x: x.split(" ")[0])
deputies.to_csv("../data/deputies_data_org.csv")
deputies.Party[deputies.Party == "PO-KO"] = "PO"
deputies.Party[deputies.Party == "PSL-KP"] = "PSL"
deputies.Party[deputies.Party == "PSL-UED"] = "PSL"

## Data cleaning

#### Remove deputy Polak and same lastname deputies

In [3]:
deputies = deputies[deputies["last_name"] != "Polak"]
deputies = deputies[deputies["last_name"] != "Duda"]
duplicates = deputies[deputies.duplicated(['last_name'], keep=False)]
deputies.drop_duplicates(subset=['last_name'], keep=False, inplace=True)
deputies_list = list(deputies["last_name"])
deputies.to_csv("../data/deputies_data_clean.csv")

speech_data_wo_dup = speech_data[~speech_data.last_name.isin(list(duplicates["last_name"]))].copy()

#### Merge datasets

In [4]:
joined_data = pd.merge(speech_data_wo_dup, deputies[["last_name", "Party"]], how="inner", on="last_name")

## Text cleaning

#### Remove non-speaker parts from speech

In [5]:
full_text_list = []
for name, t in zip(joined_data["speaker"], joined_data["speech_text"]):
    is_ok = True
    full_text = ""
    for text in t.split("\xa0\xa0"):
        name_reversed = name.split(" ")[1] + " " + name.split(" ")[0]
        if text != "":
            if text[-1]==":":
                is_ok = False
            if text == f"\xa0 Poseł {name_reversed}:":
                is_ok = True
            if is_ok and text !=  f"\xa0 Poseł {name_reversed}:":
                full_text += text
    full_text_list.append(full_text)
joined_data["speech_text_only_author"] = full_text_list

#### Remove special characters from speech text

In [6]:
joined_data["speech_text_only_author"].replace(to_replace = "(@\[0-9A-Za-ząćęłńóśżź,.():]+)|([^0-9A-Za-ząćęłńóśżź,.(): \t])|(\w+:\/\/\S+)|^rt|http.+?|\(.*\)"
                            ,regex=True
                            ,value=""
                            ,inplace=True)

## Create derived variables

#### Load Morfeusz

In [7]:
morf = morfeusz2.Morfeusz()

#### Create variable with mentions

In [8]:
# joined_data["mentions"] = joined_data["speech_text_only_author"].apply(lambda x: pd.unique([word[2][1].split(":")[0] for word in morf.analyse(x) if 'nazwisko' in word[2][3] and word[2][1].split(":")[0] in deputies_list]))
joined_data["mentions_org"] = joined_data["speech_text_only_author"].progress_apply(lambda x: pd.unique([word[2][0].split(":")[0] for word in morf.analyse(x) if 'nazwisko' in word[2][3] and word[2][1].split(":")[0] in deputies_list]))
joined_data["mentioned_someone"] = joined_data["mentions_org"].progress_apply(lambda x: len(x) != 0)
joined_data.to_csv("../data/speech_data_clean.csv")

100%|██████████| 37218/37218 [03:34<00:00, 173.74it/s]
100%|██████████| 37218/37218 [00:00<00:00, 886423.48it/s]


#### Create mentions dataset

In [23]:
# mentions_data = joined_data[joined_data["mentioned_someone"] == 1].copy()[["speaker", "last_name", "speech_text_only_author", "mentions"]].explode('mentions').reset_index(drop=True)
# mentions_data["mention_sentence"] = mentions_data.apply(lambda x: [sentence + '.' for sentence in x["speech_text_only_author"].split('.') if x["mentions"] in sentence], axis=1)

In [9]:
mentions_data_org = joined_data[joined_data["mentioned_someone"] == 1].copy()[["speaker", "last_name", "speech_text_only_author", "mentions_org"]].explode('mentions_org').reset_index(drop=True)
mentions_data_org["mention_sentence"] = mentions_data_org.apply(lambda x: [sentence + '.' for sentence in x["speech_text_only_author"].split('.') if x["mentions_org"] in sentence], axis=1)
mentions_data_org["mentions"] = mentions_data_org.apply(lambda x: pd.unique([word[2][1].split(":")[0] for word in morf.analyse(x.mentions_org) if 'nazwisko' in word[2][3]])[0], axis=1)



#### Create probability dataset

In [10]:
prob_data = mentions_data_org.copy()[["speaker", "last_name", "mentions"]].rename(columns = {"last_name" : "i", "mentions" : "j"})
prob_data["n_ij"] = prob_data.groupby(["i", "j"]).transform("count")
prob_data["p_ij"] = prob_data["n_ij"] / len(prob_data)

empirical_prob = pd.DataFrame(list(product(deputies["last_name"],deputies["last_name"])), columns=["i", "j"])
empirical_prob = pd.merge(empirical_prob, prob_data[["i", "j", "n_ij", "p_ij"]], how="left", on=["i", "j"]).drop_duplicates()
empirical_prob = pd.merge(empirical_prob, empirical_prob[["j", "i", "n_ij"]], how="left", left_on=["i", "j"], right_on=["j", "i"]).drop(["j_y", "i_y"], axis=1).rename(columns = {"i_x" : "i", "j_x" : "j", "n_ij_x" : "n_ij", "n_ij_y" : "n_ji"})
empirical_prob["p_ji"] = empirical_prob["n_ji"] / len(prob_data)
empirical_prob = empirical_prob.fillna(0)


In [11]:
marginal_prob = empirical_prob.groupby("i").sum().reset_index().rename(columns={"n_ij" : "n_i", "p_ij" : "p_i", "n_ji" : "n_j", "p_ji" : "p_j"})


### Calculate model probabilities

In [12]:
model_prob = pd.DataFrame(list(product(deputies["last_name"],deputies["last_name"])), columns=["i", "j"])
model_prob = pd.merge(model_prob, marginal_prob[["i", "p_i"]], how="left", on="i")
model_prob = pd.merge(model_prob, marginal_prob[["i", "p_j"]], how="left", left_on="j", right_on="i").drop("i_y", axis=1).rename(columns = {"i_x" : "i"})

model_prob["p_model_ij"] = model_prob["p_i"] * model_prob["p_j"]

### Empirical vs model probabilities

In [13]:
emp_vs_model_prob = pd.DataFrame(list(product(deputies["last_name"],deputies["last_name"])), columns=["i", "j"])

emp_vs_model_prob = pd.merge(emp_vs_model_prob, model_prob[["i", "j", "p_model_ij"]], how="left", on=["i", "j"])
emp_vs_model_prob = pd.merge(emp_vs_model_prob, empirical_prob[["i", "j", "p_ij", "n_ij"]], how="left", on=["i", "j"])

In [14]:
emp_vs_model_prob["under"] = emp_vs_model_prob.apply(lambda x: binom_test(x=x.n_ij, n=len(prob_data), p=x.p_model_ij, alternative="less") <= 0.05, axis=1)
emp_vs_model_prob["over"] = emp_vs_model_prob.apply(lambda x: binom_test(x=x.n_ij, n=len(prob_data), p=x.p_model_ij, alternative="greater") <= 0.05, axis=1)

In [15]:
final_prob_data = emp_vs_model_prob.merge(deputies[["last_name", "Party"]], left_on="i", right_on="last_name", how="left")
final_prob_data = final_prob_data.merge(deputies[["last_name", "Party"]], left_on="j", right_on="last_name", how="left").drop(["last_name_x", "last_name_y"], axis=1)
final_prob_data.rename(columns={"Party_x":"party_i", "Party_y":"party_j"}, inplace=True)
final_prob_data.to_csv("../data/final_prob_data.csv")


In [16]:
final_prob_data

Unnamed: 0,i,j,p_model_ij,p_ij,n_ij,under,over,party_i,party_j
0,Abramowicz,Abramowicz,1.385456e-06,0.0,0.0,False,False,PiS,PiS
1,Abramowicz,Adamczyk,1.300197e-05,0.0,0.0,False,False,PiS,PiS
2,Abramowicz,Ajchler,0.000000e+00,0.0,0.0,False,False,PiS,PO
3,Abramowicz,Andruszkiewicz,4.262941e-07,0.0,0.0,False,False,PiS,niez.
4,Abramowicz,Andzel,1.492029e-06,0.0,0.0,False,False,PiS,PiS
...,...,...,...,...,...,...,...,...,...
207020,Żyżyński,Zwiercan,8.355364e-07,0.0,0.0,False,False,PiS,PiS
207021,Żyżyński,Zyska,9.548987e-07,0.0,0.0,False,False,PiS,PiS
207022,Żyżyński,Żalek,0.000000e+00,0.0,0.0,False,False,PiS,PiS
207023,Żyżyński,Żmijan,0.000000e+00,0.0,0.0,False,False,PiS,PO


### Sentiment

In [20]:
sentiment_data = pd.DataFrame(mentions_data_org.explode("mention_sentence"))
sentiment_data.mention_sentence = sentiment_data.mention_sentence.astype(str)
sentiment_data.mention_sentence = sentiment_data.apply(lambda x: ''.join(char for char in x["mention_sentence"] if not char.isdigit()), axis=1)

In [21]:
sentiment_data

Unnamed: 0,speaker,last_name,speech_text_only_author,mentions_org,mention_sentence,mentions
0,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Panie Ministrze...,Szydło,Dzisiaj mamy dla państwa doskonałą wiadomość:...,Szydło
1,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Rok temu Sejm u...,Morawiecki,"Polski rząd zmienia to prawo, premier Mateusz...",Morawiecki
1,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Rok temu Sejm u...,Morawiecki,Dla nich premier Morawiecki także przewidział...,Morawiecki
2,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Panowie Ministr...,Morawiecki,Premier Morawiecki dołączył ten projekt do sw...,Morawiecki
3,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoka Izbo Stoimy przed hist...,Morawieckiego,Za chwilę Wysoka Izba przyjmie ustawę z pakie...,Morawiecki
...,...,...,...,...,...,...
15311,Żyżyński Jerzy,Żyżyński,Panie Marszałku Wysoka Izbo Szanowni Państwo ...,Jaworski,Ten budżet po prostu jest dobrym budżetem w r...,Jaworski
15312,Żyżyński Jerzy,Żyżyński,Panie Marszałku Wysoka Izbo Szanowni Państwo ...,Jakubiak,"Przy całej sympatii, bo pan Jakubiak produkuj...",Jakubiak
15313,Żyżyński Jerzy,Żyżyński,Dziękuję bardzo. Panie Marszałku Wysoka Izbo ...,Neumann,"przewodniczący Neumann o przyszłym budżecie,...",Neumann
15314,Żyżyński Jerzy,Żyżyński,Dziękuję bardzo. Ja w ramach niewykorzystaneg...,Kukiza,Ja w ramach niewykorzystanego czasu chciałem ...,Kukiz


In [22]:
# model = SentimentPLModel(from_pretrained='latest')
# sentiment_data["mention_sentence_score"] = sentiment_data.progress_apply(lambda x: model(x.mention_sentence).item(),axis=1)
# sentiment_score = sentiment_data.groupby(sentiment_data.index)['mention_sentence_score'].mean()

100%|██████████| 19992/19992 [26:01<00:00, 12.81it/s] 


In [24]:
sentiment_data.to_csv("../data/sentiment_data.csv")

In [51]:

# sentiment_data["mention_sentence_score"] = pd.read_csv("../data/sentiment_score.csv")

100%|██████████| 19992/19992 [24:10<00:00, 13.79it/s] 


In [37]:
final_sentiment_data = sentiment_data.copy()
final_sentiment_data = final_sentiment_data.merge(deputies[["last_name", "Party"]], on="last_name", how="left").rename(columns={"Party" : "party_i"})
final_sentiment_data = final_sentiment_data.merge(deputies[["last_name", "Party"]], left_on="mentions", right_on="last_name", how="left").rename(columns={"Party" : "party_j"}).drop("last_name_y", axis=1).rename(columns={"last_name_x":"last_name"})

In [38]:
final_sentiment_data

Unnamed: 0,speaker,last_name,speech_text_only_author,mentions_org,mention_sentence,mentions,mention_sentence_score,party_i,party_j
0,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Panie Ministrze...,Szydło,Dzisiaj mamy dla państwa doskonałą wiadomość:...,Szydło,-0.008995,PiS,PiS
1,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Rok temu Sejm u...,Morawiecki,"Polski rząd zmienia to prawo, premier Mateusz...",Morawiecki,-0.007059,PiS,WiS
2,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Rok temu Sejm u...,Morawiecki,Dla nich premier Morawiecki także przewidział...,Morawiecki,-0.008995,PiS,WiS
3,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoki Sejmie Panowie Ministr...,Morawiecki,Premier Morawiecki dołączył ten projekt do sw...,Morawiecki,-0.008995,PiS,WiS
4,Abramowicz Adam,Abramowicz,Panie Marszałku Wysoka Izbo Stoimy przed hist...,Morawieckiego,Za chwilę Wysoka Izba przyjmie ustawę z pakie...,Morawiecki,-0.008995,PiS,WiS
...,...,...,...,...,...,...,...,...,...
19987,Żyżyński Jerzy,Żyżyński,Panie Marszałku Wysoka Izbo Szanowni Państwo ...,Jaworski,Ten budżet po prostu jest dobrym budżetem w r...,Jaworski,-0.008995,PiS,PiS
19988,Żyżyński Jerzy,Żyżyński,Panie Marszałku Wysoka Izbo Szanowni Państwo ...,Jakubiak,"Przy całej sympatii, bo pan Jakubiak produkuj...",Jakubiak,0.589318,PiS,niez.
19989,Żyżyński Jerzy,Żyżyński,Dziękuję bardzo. Panie Marszałku Wysoka Izbo ...,Neumann,"przewodniczący Neumann o przyszłym budżecie,...",Neumann,-0.376608,PiS,PO
19990,Żyżyński Jerzy,Żyżyński,Dziękuję bardzo. Ja w ramach niewykorzystaneg...,Kukiza,Ja w ramach niewykorzystanego czasu chciałem ...,Kukiz,-0.008995,PiS,Kukiz15


In [39]:
final_sentiment_data.to_csv("../data/final_sentiment_data.csv")

In [272]:
sentiment_score.to_csv("../data/sentiment_score.csv")

In [258]:
final_sentiment_data = mentions_data_org.copy()
final_sentiment_data = final_sentiment_data.merge(deputies[["last_name", "Party"]], on="last_name", how="left").rename(columns={"Party" : "party_i"})
final_sentiment_data = final_sentiment_data.merge(deputies[["last_name", "Party"]], left_on="mentions", right_on="last_name", how="left").rename(columns={"Party" : "party_j"})
final_sentiment_data["sentiment_score"] = sentiment_score

In [329]:
test2 = pd.DataFrame(test.groupby(["party_i", "party_j"])['sentiment_score'].sum()).sort_values(by="sentiment_score").reset_index()
test2["sentiment_score"]=test2["sentiment_score"].astype(int)