# Auswertung der Länderanalyse

In [45]:
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt

In [19]:
# Load the results table from the CSV file into a DataFrame
df = pd.read_csv(filepath_or_buffer="NLP_Laenderanalyse_Kilometer_Komplett.csv")

In [20]:
# Show the DataFrame as read from the CSV file
df

Unnamed: 0,land,neutral,negative,positive,kilometer
0,türkei,6810,1297,331,2350.0
1,polen,6091,2838,898,655.0
2,rumänien,2688,480,65,1223.0
3,griechenland,3641,1388,2686,1651.0
4,kroatien,3258,995,211,795.0
5,bulgarien,2537,1110,73,1482.0
6,afghanistan,2931,567,107,4868.0
7,russland,1940,6254,57,5239.0
8,serbien,4779,1188,55,1045.0
9,portugal,4200,397,137,1945.0


In [21]:
# New column: total number of query expansions per row,
# i.e. neutral + negative + positive counts added element-wise
df["total"] = df["neutral"].add(df["negative"]).add(df["positive"])

In [22]:
# Show the DataFrame again, now including the "total" column
df

Unnamed: 0,land,neutral,negative,positive,kilometer,total
0,türkei,6810,1297,331,2350.0,8438
1,polen,6091,2838,898,655.0,9827
2,rumänien,2688,480,65,1223.0,3233
3,griechenland,3641,1388,2686,1651.0,7715
4,kroatien,3258,995,211,795.0,4464
5,bulgarien,2537,1110,73,1482.0,3720
6,afghanistan,2931,567,107,4868.0,3605
7,russland,1940,6254,57,5239.0,8251
8,serbien,4779,1188,55,1045.0,6022
9,portugal,4200,397,137,1945.0,4734


In [23]:
# New columns holding each sentiment's share of the row total.
# The values are fractions (count / total), stored under "%<sentiment>".
for sentiment in ("neutral", "negative", "positive"):
    df["%" + sentiment] = df[sentiment] / df["total"]

In [26]:
# Show the DataFrame again, now including the sentiment share columns
df

Unnamed: 0,land,neutral,negative,positive,kilometer,total,%neutral,%negative,%positive
0,türkei,6810,1297,331,2350.0,8438,0.807063,0.153709,0.039227
1,polen,6091,2838,898,655.0,9827,0.619823,0.288796,0.091381
2,rumänien,2688,480,65,1223.0,3233,0.831426,0.148469,0.020105
3,griechenland,3641,1388,2686,1651.0,7715,0.471938,0.179909,0.348153
4,kroatien,3258,995,211,795.0,4464,0.729839,0.222894,0.047267
5,bulgarien,2537,1110,73,1482.0,3720,0.681989,0.298387,0.019624
6,afghanistan,2931,567,107,4868.0,3605,0.813037,0.157282,0.029681
7,russland,1940,6254,57,5239.0,8251,0.235123,0.757969,0.006908
8,serbien,4779,1188,55,1045.0,6022,0.79359,0.197277,0.009133
9,portugal,4200,397,137,1945.0,4734,0.887199,0.083861,0.02894


In [55]:
# Persist the DataFrame as a CSV file; the row index is written as well
# (index=True is the pandas default, spelled out here for clarity)
df.to_csv(path_or_buf="NLP_Laender_Entfernung_df.csv", index=True)

In [27]:
# Bind the columns used in the correlation calculations to their own names:
# the three sentiment shares and the distance in kilometres
anteil_negativ, anteil_positiv, anteil_neutral, entfernung = (
    df[spalte] for spalte in ("%negative", "%positive", "%neutral", "kilometer")
)

# Korrelation zwischen dem Anteil negativer Sentiments und der Entfernung

In [29]:
# Correlation between the share of negative sentiments and the distance to Germany,
# measured with the Pearson correlation coefficient
anteil_negativ.corr(entfernung, method="pearson")

0.22503992465074452

# Korrelation zwischen dem Anteil positiver Sentiments und der Entfernung

In [32]:
# Correlation between the share of positive sentiments and the distance to Germany,
# measured with the Pearson correlation coefficient
anteil_positiv.corr(entfernung, method="pearson")

0.05629415149146166

# Korrelation zwischen dem Anteil neutraler Sentiments und der Entfernung

In [33]:
# Correlation between the share of neutral sentiments and the distance to Germany,
# measured with the Pearson correlation coefficient
anteil_neutral.corr(entfernung, method="pearson")

-0.21841127892877668

-----------------
# Veranschaulichung der Daten

## Hier ist der DataFrame nach verschiedenen Kriterien sortiert, damit man die Ergebnisse aus verschiedenen Blickwinkeln betrachten kann.

In [50]:
# Rows ordered by total number of query expansions, largest first
df_total = df.sort_values("total", ascending=False)
df_total

Unnamed: 0,land,neutral,negative,positive,kilometer,total,%neutral,%negative,%positive
30,deutschland,9165,2923,1074,,13162,0.696323,0.222079,0.081599
1,polen,6091,2838,898,655.0,9827,0.619823,0.288796,0.091381
23,england,3779,3779,1482,772.0,9040,0.418031,0.418031,0.163938
0,türkei,6810,1297,331,2350.0,8438,0.807063,0.153709,0.039227
7,russland,1940,6254,57,5239.0,8251,0.235123,0.757969,0.006908
28,indien,4293,2433,1337,6637.0,8063,0.532432,0.301749,0.165819
3,griechenland,3641,1388,2686,1651.0,7715,0.471938,0.179909,0.348153
11,china,3280,2465,1627,7324.0,7372,0.444927,0.334373,0.2207
15,niederlande,5809,1252,289,336.0,7350,0.79034,0.17034,0.03932
10,frankreich,3947,2949,113,872.0,7009,0.563133,0.420745,0.016122


In [49]:
# Rows ordered by share of negative query expansions, largest first
df_negative = df.sort_values("%negative", ascending=False)
df_negative

Unnamed: 0,land,neutral,negative,positive,kilometer,total,%neutral,%negative,%positive
7,russland,1940,6254,57,5239.0,8251,0.235123,0.757969,0.006908
13,pakistan,860,1351,67,5373.0,2278,0.377524,0.593064,0.029412
10,frankreich,3947,2949,113,872.0,7009,0.563133,0.420745,0.016122
23,england,3779,3779,1482,772.0,9040,0.418031,0.418031,0.163938
18,spanien,2935,2335,835,1696.0,6105,0.480753,0.382473,0.136773
11,china,3280,2465,1627,7324.0,7372,0.444927,0.334373,0.2207
17,bosnien,1921,992,154,973.0,3067,0.626345,0.323443,0.050212
28,indien,4293,2433,1337,6637.0,8063,0.532432,0.301749,0.165819
5,bulgarien,2537,1110,73,1482.0,3720,0.681989,0.298387,0.019624
25,arabisch,2289,1265,759,4145.0,4313,0.530721,0.293299,0.17598


In [51]:
# Rows ordered by share of positive query expansions, largest first
df_positive = df.sort_values("%positive", ascending=False)
df_positive

Unnamed: 0,land,neutral,negative,positive,kilometer,total,%neutral,%negative,%positive
3,griechenland,3641,1388,2686,1651.0,7715,0.471938,0.179909,0.348153
21,italien,2732,1483,2041,979.0,6256,0.436701,0.237052,0.326247
11,china,3280,2465,1627,7324.0,7372,0.444927,0.334373,0.2207
25,arabisch,2289,1265,759,4145.0,4313,0.530721,0.293299,0.17598
28,indien,4293,2433,1337,6637.0,8063,0.532432,0.301749,0.165819
23,england,3779,3779,1482,772.0,9040,0.418031,0.418031,0.163938
18,spanien,2935,2335,835,1696.0,6105,0.480753,0.382473,0.136773
19,ukraine,2019,721,426,1548.0,3166,0.637713,0.227732,0.134555
14,irak,1761,803,317,3392.0,2881,0.611246,0.278723,0.110031
12,vietnam,2723,609,340,9102.0,3672,0.741558,0.16585,0.092593


In [53]:
# Rows ordered by distance in kilometres, farthest first
df_entfernung = df.sort_values("kilometer", ascending=False)
df_entfernung

Unnamed: 0,land,neutral,negative,positive,kilometer,total,%neutral,%negative,%positive
12,vietnam,2723,609,340,9102.0,3672,0.741558,0.16585,0.092593
22,amerika,4111,1662,235,7765.0,6008,0.684254,0.276631,0.039115
11,china,3280,2465,1627,7324.0,7372,0.444927,0.334373,0.2207
26,afrika,2545,660,200,7087.0,3405,0.74743,0.193833,0.058737
28,indien,4293,2433,1337,6637.0,8063,0.532432,0.301749,0.165819
13,pakistan,860,1351,67,5373.0,2278,0.377524,0.593064,0.029412
7,russland,1940,6254,57,5239.0,8251,0.235123,0.757969,0.006908
6,afghanistan,2931,567,107,4868.0,3605,0.813037,0.157282,0.029681
25,arabisch,2289,1265,759,4145.0,4313,0.530721,0.293299,0.17598
27,iran,2719,907,253,4057.0,3879,0.700954,0.233823,0.065223


In [56]:
# Rows ordered by share of neutral query expansions, largest first
df_neutral = df.sort_values("%neutral", ascending=False)
df_neutral

Unnamed: 0,land,neutral,negative,positive,kilometer,total,%neutral,%negative,%positive
24,schweiz,6411,253,46,532.0,6710,0.95544,0.037705,0.006855
16,oesterreich,4144,281,37,506.0,4462,0.928732,0.062976,0.008292
9,portugal,4200,397,137,1945.0,4734,0.887199,0.083861,0.02894
2,rumänien,2688,480,65,1223.0,3233,0.831426,0.148469,0.020105
29,israel,4367,928,39,2976.0,5334,0.81871,0.173978,0.007312
6,afghanistan,2931,567,107,4868.0,3605,0.813037,0.157282,0.029681
0,türkei,6810,1297,331,2350.0,8438,0.807063,0.153709,0.039227
8,serbien,4779,1188,55,1045.0,6022,0.79359,0.197277,0.009133
15,niederlande,5809,1252,289,336.0,7350,0.79034,0.17034,0.03932
26,afrika,2545,660,200,7087.0,3405,0.74743,0.193833,0.058737
