# Import and become familiar with data

In [None]:
#!pip install germanetpy==0.2.2
from pathlib import Path
from germanetpy import germanet
import pandas as pd
import csv
import os
import glob
import numpy as np
import spacy
import swifter
import re
import nltk
import codecs
from nltk.tokenize import RegexpTokenizer
from nltk.stem.porter import *
from nltk.corpus import stopwords
from nltk.corpus import wordnet 
from nltk.stem.cistem import Cistem
from collections import Counter
from spacy.lang.de.examples import sentences 
#import for graphs:
#!pip3 install bokeh 
from bokeh.plotting import ColumnDataSource
from bokeh.plotting import figure
from bokeh.plotting import show
from bokeh.plotting import output_file
from bokeh.transform import factor_cmap
#!pip install matplotlib
import matplotlib as plt
import matplotlib.pyplot as plt
import matplotlib.pyplot as pl
#loading de_core_news_sm-3.0.0 for preprocessing
nlp = spacy.load("/Users/anna/Desktop/NLP/sciebo-code_PowerPuff/de_core_news_sm-3.0.0/de_core_news_sm/de_core_news_sm-3.0.0")
#germanet import
germanet_object = germanet.Germanet("/Users/anna/germanet_data/GN_V170_XML")
#loading Datset
df = pd.read_csv(r'/Users/anna/Desktop/NLP/sciebo-code_PowerPuff/suggestions_minorities_slice.csv', delimiter=",", error_bad_lines=False, engine ='python')
df.head(15)



In [2]:
import re
from typing import List
import torch
import torch.nn.functional as F
from transformers import AutoModelForSequenceClassification, AutoTokenizer

class GSBertPolarityModel:
    
    def __init__(self, model_name: str = "oliverguhr/german-sentiment-bert"):
        self.model = AutoModelForSequenceClassification.from_pretrained(model_name)
        # Always use original tokenizer
        self.tokenizer = AutoTokenizer.from_pretrained("oliverguhr/german-sentiment-bert")

        self.clean_chars = re.compile(r'[^A-Za-züöäÖÜÄß ]', re.MULTILINE)
        self.clean_http_urls = re.compile(r'https*\S+', re.MULTILINE)
        self.clean_at_mentions = re.compile(r'@\S+', re.MULTILINE)

    def replace_numbers(self, text: str) -> str:
        return text.replace("0", " null").replace("1", " eins").replace("2", " zwei") \
            .replace("3", " drei").replace("4", " vier").replace("5", " fünf") \
            .replace("6", " sechs").replace("7", " sieben").replace("8", " acht") \
            .replace("9", " neun")

    def clean_text(self, text: str) -> str:
        text = text.replace("\n", " ")
        text = self.clean_http_urls.sub('', text)
        text = self.clean_at_mentions.sub('', text)
        text = self.replace_numbers(text)
        text = self.clean_chars.sub('', text)  # use only text chars
        text = ' '.join(text.split())  # substitute multiple whitespace with single whitespace
        text = text.strip().lower()
        return text

    @staticmethod
    def probs2polarities(pnn: torch.Tensor) -> torch.Tensor:
        """Transform softmax probs of a [positive, negative, neutral] classifier
        into scalar polarity scores of range [-1, 1].
        High values express positive sentiment, low negative ones negative sentiment.
        Values close to 0 express neutral sentiment."""
        pos = pnn[:, 0]
        neg = pnn[:, 1]
        # Transform range [0, 1] to [-1, 1]
        # Ignore neutrality score as it's implicitly encoded as (1 - pos - neg)
        polarities = pos - neg
        return polarities

    def predict_sentiment_batch(self, texts: List[str]) -> torch.Tensor:
        texts = [self.clean_text(text) for text in texts]
        # Add special tokens takes care of adding [CLS], [SEP], <s>... tokens in the right way for each model.
        input_ids = self.tokenizer.batch_encode_plus(
            texts,
            padding=True,
            add_special_tokens=True,
            truncation=True  # Ensure that the text does not exceed the token limit
        )
        input_ids = torch.tensor(input_ids["input_ids"])

        with torch.no_grad():
            logits = self.model(input_ids)
            probs = F.softmax(logits[0], dim=1)

        polarities = self.probs2polarities(probs)
        return polarities

    def analyse_sentiment(self, text: str) -> float:
        polarity = self.predict_sentiment_batch([text]).item()
        return polarity

In [3]:
#looking for unique terms in queryterm for seeing the different categories
unique_queryterm=np.unique(df['queryterm'])

# Cleaning

In [4]:
#Data cleaning: remove distracting characters 
df['cleaned'] = df['raw_data'].replace(regex = ['{"q":.+?.}|\\\\u003c\\\\.b|,"t":{"bpc":false,"tlw":false}|,0|"phrase":|\\\\u003e|\\\\u003cb|\\\\u003cVb|\\\\u003cse|\\\\u003csc'], value='') 

In [5]:
#removing _minorities
df['label'] = df['label'].replace(regex = [r'_minorities'], value='')

In [6]:
#tokenization of cleaned and queryterm and create a new dataframe
tokenizer = RegexpTokenizer(r'\w+')
df["tokens_suggestions"] = df.apply(lambda row: tokenizer.tokenize(str(row["cleaned"].lower())), axis=1)
df["tokensroot"] = df.apply(lambda row: tokenizer.tokenize(str(row["queryterm"].lower())), axis=1)
#df["spacy_tokens"] = df.apply(lambda row: nlp(str(row["tokensroot"])), axis=1)
tokensroot2 = df['tokensroot']

In [7]:
#using nlp ergo spacy for tokensroot
tokensroot = nlp(str(df['tokensroot']))

# Data processing

In [9]:
#Is the queryterm a question or not?; 0=no question, 1=question
n = 200000
i = 0
stemmer = Cistem()
german_stop_words = stopwords.words('german')
df['Frage?'] = 0
df['tokens_suggestions_cleaned_root'] = ''
df['stemmed_words'] = ''
df['synsets_ger'] = ''
questionTerms = ['ist', 'sind', 'sollen', 'wann', 'warum', 'wenn', 'weshalb', 'wo', 'wollen']
cleaned_token_suggestions = []
for row in df.itertuples(index=True, name='Pandas'):
    synset = []
    cleanedSuggestions = list(set(row.tokens_suggestions))
    df.at[row.Index, 'tokens_suggestions_cleaned_root'] = cleanedSuggestions
    if row.tokensroot[0] in questionTerms:
        df.at[row.Index, 'Frage?'] = 1
    for tokensroot in row.tokensroot:
        if tokensroot in row.tokens_suggestions:
            cleanedSuggestions.pop(cleanedSuggestions.index(tokensroot))
            df.at[row.Index, 'tokens_suggestions_cleaned_root'] = cleanedSuggestions
            for cleanedSuggestion in cleanedSuggestions:
                synsetToken = germanet_object.get_synsets_by_orthform(cleanedSuggestion)
                if synsetToken != []:
                    synset.append(synsetToken)
        if tokensroot not in german_stop_words and tokensroot not in questionTerms:
            stemmedTokensroot = stemmer.stem(tokensroot)
            df.at[row.Index, 'stemmed_words'] = stemmedTokensroot
    df.at[row.Index, 'synsets_ger'] = synset

In [10]:
#getting hypernyms and hyponyms of synsets_ger to dataframe lexunits
def get_custom_names_synsets(syns):
    ret = []
    try:
        for cleanedSyns in syns:
            for syn in cleanedSyns:
                for lemma in syn.lexunits:
                    ret.extend(lemma.get_all_orthforms())
    #print(ret)
        return ret
    except:
        return ret
df["lexunits"] = df.apply(lambda row: get_custom_names_synsets(row["synsets_ger"]), axis=1)

    
def get_hypernyms_from_list_of_synsets(list_syns):
    directHypernyms = []
    for second_level_list_syns in list_syns:
        for syn in second_level_list_syns:
            directHypernyms.append(syn.direct_hypernyms)
    return directHypernyms
df["hypernyms"] = df.apply(lambda row: get_hypernyms_from_list_of_synsets(row["synsets_ger"]), axis=1)
df["lexunits_hypernyms"] = df.apply(lambda row: get_custom_names_synsets(row["hypernyms"]), axis=1)

def get_hyponyms_from_list_of_synsets(list_syns):
    directHyponyms = []
    for second_level_list_syns in list_syns:
        for syn in second_level_list_syns:
            directHyponyms.append(syn.direct_hyponyms)
    return directHyponyms
df["hyponyms"] = df.apply(lambda row: get_hyponyms_from_list_of_synsets(row["synsets_ger"]), axis=1)
df["lexunits_hyponyms"] = df.apply(lambda row: get_custom_names_synsets(row["hyponyms"]), axis=1)
#df[["suggestion_ger","lexunits_hypernyms", "lexunits_hyponyms", "lexunits"]]

In [12]:
#tensor results for tokens_suggestions_cleaned_root and the lexunits tensors of hypernyms
model = GSBertPolarityModel()
df["tokens_suggestions_tensor"] = ''
df["lexunits_tensor"] = ''
def get_tensor(words):
    if words != []:
        word = list(dict.fromkeys(words))
        filter(lambda x: x in german_stop_words, words)
        return model.predict_sentiment_batch(words)
    else:
        return ''
df["lexunits_tensor"] = df.lexunits.swifter.apply(lambda row: get_tensor(row))
df["tokens_suggestions_tensor"] = df.tokens_suggestions_cleaned_root.swifter.apply(lambda row: get_tensor(row))

Pandas Apply:   0%|          | 0/52464 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/52464 [00:00<?, ?it/s]

In [16]:
#sum all tensors in a line for tokens_suggestions and lexunits
def sum_tensors (tensors):
    if tensors != '':
        return sum(tensors)
    #sum_tensors = sum(tensors)
df['tokens_suggestions_tensor_sum'] = df.tokens_suggestions_tensor.swifter.apply(lambda row: sum_tensors(row))
df['lexunits_tensor_sum'] = df.lexunits_tensor.swifter.apply(lambda row: sum_tensors(row))
df.head(50)

Pandas Apply:   0%|          | 0/52464 [00:00<?, ?it/s]

Pandas Apply:   0%|          | 0/52464 [00:00<?, ?it/s]

Unnamed: 0,suggestion_id,label,queryterm,date,client,lang,geolocation,url,proxy_ip,proxy_city,...,synsets_ger,lexunits,hypernyms,lexunits_hypernyms,hyponyms,lexunits_hyponyms,tokens_suggestions_tensor,lexunits_tensor,tokens_suggestions_tensor_sum,lexunits_tensor_sum
0,4538056,google_psy,türke,2020-05-03 08:05:01.0,psy-ab,de-DE,DE,https://www.google.com/complete/search,localhost,Köln,...,[],[],[],[],[],[],"[tensor(-0.8585), tensor(0.9375), tensor(-0.01...",,tensor(1.6650),
1,4538057,google_firefox,türke,2020-05-03 08:05:01.0,firefox,de-DE,DE,https://www.google.com/complete/search,localhost,Köln,...,[],[],[],[],[],[],"[tensor(-0.8585), tensor(-0.0100), tensor(-0.8...",,tensor(0.5884),
2,4538058,ddg,türke,2020-05-03 08:05:01.0,,,,https://duckduckgo.com/ac/,localhost,Köln,...,[],[],[],[],[],[],"[tensor(-0.8837), tensor(-0.6965), tensor(0.63...",,tensor(-1.3746),
3,4538059,bing,türke,2020-05-03 08:05:01.0,,,,http://api.bing.net/osjson.aspx,localhost,Köln,...,"[[Synset(id=s55185, lexunits=zurücktreten), Sy...","[zurücktreten, zurücktreten, zurücktreten, zur...","[{Synset(id=s55172, lexunits=aufgeben, verzich...","[aufgeben, verzichten, versagen, ausscheiden, ...","[{}, {Synset(id=s80699, lexunits=demissioniere...","[demissionieren, beitragsfrei, gebührenfrei, g...","[tensor(-0.8585), tensor(0.9375), tensor(0.433...","[tensor(-0.9261), tensor(-0.9261), tensor(-0.9...",tensor(3.1092),tensor(-2.4034)
4,4538060,google_psy,pole,2020-05-03 08:05:02.0,psy-ab,de-DE,DE,https://www.google.com/complete/search,localhost,Köln,...,"[[Synset(id=s66916, lexunits=polen)], [Synset(...","[polen, polemisch]","[{Synset(id=s56823, lexunits=anschließen)}, {S...","[anschließen, stilistisch, provozierend, provo...","[{Synset(id=s80471, lexunits=umpolen)}, {}]",[umpolen],"[tensor(-0.7592), tensor(-0.8762), tensor(0.81...","[tensor(-0.8216), tensor(-0.8856)]",tensor(-2.8080),tensor(-1.7072)
5,4538061,google_firefox,pole,2020-05-03 08:05:02.0,firefox,de-DE,DE,https://www.google.com/complete/search,localhost,Köln,...,"[[Synset(id=s66916, lexunits=polen)], [Synset(...","[polen, polemisch]","[{Synset(id=s56823, lexunits=anschließen)}, {S...","[anschließen, stilistisch, provozierend, provo...","[{Synset(id=s80471, lexunits=umpolen)}, {}]",[umpolen],"[tensor(-0.7592), tensor(-0.8762), tensor(0.81...","[tensor(-0.8216), tensor(-0.8856)]",tensor(-3.7279),tensor(-1.7072)
6,4538062,bing,pole,2020-05-03 08:05:03.0,,,,http://api.bing.net/osjson.aspx,localhost,Köln,...,"[[Synset(id=s66916, lexunits=polen)], [Synset(...","[polen, italienisch]","[{Synset(id=s56823, lexunits=anschließen)}, {S...","[anschließen, südeuropäisch, südländisch]","[{Synset(id=s80471, lexunits=umpolen)}, {Synse...","[umpolen, mittelitalienisch, süditalienisch, n...","[tensor(0.1253), tensor(-0.8762), tensor(0.940...","[tensor(-0.8196), tensor(0.6341)]",tensor(-0.6651),tensor(-0.1856)
7,4538063,ddg,pole,2020-05-03 08:05:03.0,,,,https://duckduckgo.com/ac/,localhost,Köln,...,[],[],[],[],[],[],"[tensor(0.8301), tensor(0.7862), tensor(0.1622...",,tensor(3.4194),
8,4538064,google_psy,rumäne,2020-05-03 08:05:03.0,psy-ab,de-DE,DE,https://www.google.com/complete/search,localhost,Köln,...,"[[Synset(id=s5333, lexunits=in, angesagt, kult...","[in, angesagt, kultig, trendig, aussehen, auss...","[{Synset(id=s5330, lexunits=modern)}, {Synset(...","[modern, sein, suchen, sein, fehlinformieren]","[{}, {Synset(id=s52138, lexunits=ausnehmen), S...","[ausnehmen, wirken, schauen, rumlügen, heuchel...","[tensor(0.9491), tensor(0.7781), tensor(-0.949...","[tensor(0.6114), tensor(-0.8469), tensor(0.976...",tensor(-4.1615),tensor(-3.5504)
9,4538065,bing,rumäne,2020-05-03 08:05:05.0,,,,http://api.bing.net/osjson.aspx,localhost,Köln,...,"[[Synset(id=s5333, lexunits=in, angesagt, kult...","[in, angesagt, kultig, trendig, funktionieren,...","[{Synset(id=s5330, lexunits=modern)}, {Synset(...","[modern, passieren, ereignen, denken, befassen...","[{}, {Synset(id=s105884, lexunits=nachgehen), ...","[nachgehen, vorgehen, leerlaufen, zuarbeiten, ...","[tensor(0.6576), tensor(0.7320), tensor(-0.841...","[tensor(0.6114), tensor(-0.8469), tensor(0.976...",tensor(-1.1195),tensor(0.9746)


In [17]:
#gender
df.loc[df.stemmed_words=='manner','stemmed_words'] = 'Mann'
df.loc[df.stemmed_words=='frauen','stemmed_words'] = 'Frau'
df.loc[df.stemmed_words=='mann','stemmed_words'] = 'Mann'
df.loc[df.stemmed_words=='frau','stemmed_words'] = 'Frau'
#minorities - cultural, linguistic and historic criteria
df.loc[df.stemmed_words=='rom','stemmed_words'] = 'Roma'
df.loc[df.stemmed_words=='roma','stemmed_words'] = 'Roma'
df.loc[df.stemmed_words=='sinto','stemmed_words'] = 'Sinti'
df.loc[df.stemmed_words=='sinti','stemmed_words'] = 'Sinti'
df.loc[df.stemmed_words=='fluchtling','stemmed_words'] = 'Migrant'
df.loc[df.stemmed_words=='illegaler','stemmed_words'] = 'Migrant'
df.loc[df.stemmed_words=='illegal','stemmed_words'] = 'Migrant'
df.loc[df.stemmed_words=='migra','stemmed_words'] = 'Migrant'
df.loc[df.stemmed_words=='immigra','stemmed_words'] = 'Migrant'
df.loc[df.stemmed_words=='arab','stemmed_words'] = 'arabisch'
df.loc[df.stemmed_words=='kurd','stemmed_words'] = 'kurdisch'
df.loc[df.stemmed_words=='jugoslawe','stemmed_words'] = 'jugoslawisch'
df.loc[df.stemmed_words=='jugoslaw','stemmed_words'] = 'jugoslawisch'
#skin color:
df.loc[df.stemmed_words=='schwarzer','stemmed_words'] = 'schwarz'
df.loc[df.stemmed_words=='weiss','stemmed_words'] = 'weiß'
#population group - geografic or national
#Europa
df.loc[df.stemmed_words=='europa','stemmed_words'] = 'europäisch'
#Westeuropa
df.loc[df.stemmed_words=='franzo','stemmed_words'] = 'französisch'
df.loc[df.stemmed_words=='franzosisch','stemmed_words'] = 'französisch'
df.loc[df.stemmed_words=='engla','stemmed_words'] = 'englisch'
df.loc[df.stemmed_words=='niederla','stemmed_words'] = 'niederländisch'
df.loc[df.stemmed_words=='niederlandisch','stemmed_words'] = 'niederländisch'
df.loc[df.stemmed_words=='holla','stemmed_words'] = 'holländisch'
df.loc[df.stemmed_words=='hollandisch','stemmed_words'] = 'holländisch'
#Mitteleuropa (Central Europa)
df.loc[df.stemmed_words=='deutscher','stemmed_words'] = 'deutsch'
df.loc[df.stemmed_words=='pol','stemmed_words'] = 'polnisch'
df.loc[df.stemmed_words=='osterreich','stemmed_words'] = 'österreichisch'
df.loc[df.stemmed_words=='osterreichisch','stemmed_words'] = 'österreichisch'
df.loc[df.stemmed_words=='schweiz','stemmed_words'] = 'schweizerisch'
#Osteuropa
df.loc[df.stemmed_words=='russ','stemmed_words'] = 'russisch'
df.loc[df.stemmed_words=='ukrai','stemmed_words'] = 'ukrainisch'
#Südeuropa
df.loc[df.stemmed_words=='italie','stemmed_words'] = 'italienisch'
df.loc[df.stemmed_words=='portugie','stemmed_words'] = 'portugiesisch'
df.loc[df.stemmed_words=='spanier','stemmed_words'] = 'spanisch'
#Südosteuropa
df.loc[df.stemmed_words=='griech','stemmed_words'] = 'griechisch'
df.loc[df.stemmed_words=='kroa','stemmed_words'] = 'kroatisch'
df.loc[df.stemmed_words=='rumanisch','stemmed_words'] = 'rumänisch'
df.loc[df.stemmed_words=='ruma','stemmed_words'] = 'rumänisch'
df.loc[df.stemmed_words=='serb','stemmed_words'] = 'serbisch'
df.loc[df.stemmed_words=='bosnier','stemmed_words'] = 'bosnisch'
df.loc[df.stemmed_words=='bulgar','stemmed_words'] = 'bulgarisch'
#df.loc[df.stemmed_words=='türke','stemmed_words'] = 'türkisch'

#Afrika
df.loc[df.stemmed_words=='afrika','stemmed_words'] = 'afrikanisch'
#Amerika
df.loc[df.stemmed_words=='amerika','stemmed_words'] = 'amerikanisch'
#Asien
df.loc[df.stemmed_words=='asia','stemmed_words'] = 'asiatisch'
#Voderasien
df.loc[df.stemmed_words=='ira','stemmed_words'] = 'iranisch'
df.loc[df.stemmed_words=='irak','stemmed_words'] = 'irakisch'
df.loc[df.stemmed_words=='syrer','stemmed_words'] = 'syrisch'
df.loc[df.stemmed_words=='turk','stemmed_words'] = 'türkisch'
df.loc[df.stemmed_words=='turkisch','stemmed_words'] = 'türkisch'
#Zentralasien
df.loc[df.stemmed_words=='afgha','stemmed_words'] = 'afghanisch'
#Ostasien
df.loc[df.stemmed_words=='chi','stemmed_words'] = 'chinesisch'
#Südasien
df.loc[df.stemmed_words=='inder','stemmed_words'] = 'indisch'
df.loc[df.stemmed_words=='pakista','stemmed_words'] = 'pakistanisch'
#Südostasien
df.loc[df.stemmed_words=='vietnam','stemmed_words'] = 'vietnamesisch'
df.loc[df.stemmed_words=='israeli','stemmed_words'] = 'israelisch'
#religious community
df.loc[df.stemmed_words=='agnostik','stemmed_words'] = 'Agnostizismus'
df.loc[df.stemmed_words=='agnostizismu','stemmed_words'] = 'Agnostizismus'
df.loc[df.stemmed_words=='agnostisch','stemmed_words'] = 'Agnostizismus'
df.loc[df.stemmed_words=='atheismu','stemmed_words'] = 'Atheismus'
df.loc[df.stemmed_words=='atheistisch','stemmed_words'] = 'Atheismus'
df.loc[df.stemmed_words=='atheti','stemmed_words'] = 'Atheismus'
df.loc[df.stemmed_words=='athei','stemmed_words'] = 'Atheismus'
df.loc[df.stemmed_words=='christentum','stemmed_words'] = 'Christentum'
df.loc[df.stemmed_words=='chri','stemmed_words'] = 'Christentum'
df.loc[df.stemmed_words=='christlich','stemmed_words'] = 'Christentum'
df.loc[df.stemmed_words=='evangelisch','stemmed_words'] = 'Evangelikalismus'
df.loc[df.stemmed_words=='jud','stemmed_words'] = 'Judentum'
df.loc[df.stemmed_words=='judisch','stemmed_words'] = 'Judentum'
df.loc[df.stemmed_words=='judentum','stemmed_words'] = 'Judentum'
df.loc[df.stemmed_words=='katholik','stemmed_words'] = 'Katholizismus'
df.loc[df.stemmed_words=='katholisch','stemmed_words'] = 'Katholizismus'
df.loc[df.stemmed_words=='jehova','stemmed_words'] = 'Jehova'
df.loc[df.stemmed_words=='protesta', 'stemmed_words'] = 'Protestantismus'
df.loc[df.stemmed_words=='protestantisch', 'stemmed_words'] = 'Protestantismus'
df.loc[df.stemmed_words=='pro', 'stemmed_words'] = 'Protestantismus'
df.loc[df.stemmed_words=='muslim','stemmed_words'] = 'Islam'
df.loc[df.stemmed_words=='muslimisch','stemmed_words'] = 'Islam'
df.loc[df.stemmed_words=='islam','stemmed_words'] = 'Islam'


In [19]:
#creating backups
#headers_list = ['suggestion_id', 'label', 'queryterm', 'date', 'client', 'lang', 'geolocation', 'url', 'proxy_ip', 'proxy_city', 'proxy_country', 'proxy_port', 'raw_data', 'cleaned', 'tokens_suggestions', 'tokensroot', 'Frage?', 'tokens_suggestions_cleaned_root', 'stemmed_words', 'synsets_ger', 'lexunits', 'hypernyms', 'lexunits_hypernyms', 'hyponyms', 'lexunits_hyponyms', 'lexunits_tensor', 'lexunits_tensor_sum', 'tokens_suggestions_tensor']
df_backup = df
df_backup2 = df
df_backup3 = df
df_backup4 = df
df.to_csv('richtig.csv', sep=';', index=False, encoding='utf-8')
df_from_csv = pd.read_csv("richtig.csv", sep=';', encoding='utf-8')