In [2]:
import pandas as pd
import numpy as np

import re
import matplotlib.pyplot as plt
import plotly.express as px

import collections
from plotly import graph_objects as go
import emoji

import sys
sys.path.append("..")

flatten = lambda t: [item for sublist in t for item in sublist]
def deEmojify(text):
    regrex_pattern = re.compile(pattern = "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
                           "]+", flags = re.UNICODE)
    return regrex_pattern.sub(r'',text)

In [3]:
comment_df = pd.read_excel(r"..\data\psaw_comments_from_sept1_pol.xlsx")
comment_df["lens"] = comment_df.text.str.split().map(lambda x: len(x))
comment_df.head()

Unnamed: 0.1,Unnamed: 0,comment_id,text,author,upvotes,timestamp,post_id,lens
0,0,hfu3t3z,Minden oké otthon?,drakvuf,15,1633686153,q3uef4,3
1,1,hfu3wlv,"Ez azért elég izzadtságszagú azt tekintve, hog...",Debre1024,6,1633686239,q3uef4,15
2,2,hfu491h,téged ki bántott :(,Endymion2748,6,1633686552,q3uef4,4
3,3,hfu407d,Még nincs szombat...,LazyTomTom,3,1633686328,q3uef4,3
4,4,hfu4nyg,"Ezazz! Te is szereted ha megbasznak szárazon, ...",DrTorrente,3,1633686923,q3uef4,21


In [4]:
comment_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 12950 entries, 0 to 12949
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   Unnamed: 0  12950 non-null  int64 
 1   comment_id  12950 non-null  object
 2   text        12950 non-null  object
 3   author      12950 non-null  object
 4   upvotes     12950 non-null  int64 
 5   timestamp   12950 non-null  int64 
 6   post_id     12950 non-null  object
 7   lens        12950 non-null  int64 
dtypes: int64(4), object(4)
memory usage: 809.5+ KB


In [5]:
#Remove web links
print("Before:\n",comment_df["text"].iloc[25])
print("Num of http:", comment_df["text"].map(lambda x: "http" in x).sum())


comment_df["text"] = comment_df["text"].map( lambda x: re.sub(r'http\S+', '', x) )
print("\nAfter:\n",comment_df["text"].iloc[25])
print("Num of http:",comment_df.text.map(lambda x: "http" in x).sum())
print("Num of https:",comment_df.text.map(lambda x: "https" in x).sum())



Before:
 [u/shiffer_andras](https://youtu.be/dQw4w9WgXcQ) vélemény?
Num of http: 589

After:
 [u/shiffer_andras]( vélemény?
Num of http: 0
Num of https: 0


In [6]:
#remove user mentions 

print("Before:\n",comment_df["text"].iloc[25])
print("Num of user mentions:", comment_df["text"].map(lambda x: "u/" in x).sum())

comment_df["text"] = comment_df["text"].map( lambda x: re.sub(r"u/\S+", "", x))

print("\nAfter:\n",comment_df["text"].iloc[25])
print("Num of user mentions:",comment_df.text.map(lambda x: "u/" in x).sum())
print("Num of user mentions:",comment_df.text.map(lambda x: "u/" in x).sum())

Before:
 [u/shiffer_andras]( vélemény?
Num of user mentions: 47

After:
 [ vélemény?
Num of user mentions: 0
Num of user mentions: 0


In [7]:
#remove emojis
print("Before:",comment_df.text[15]) 
comment_df["text"] = comment_df["text"].map( lambda text: emoji.get_emoji_regexp().sub(u'', text) )
print("After:",comment_df.text[15])

Before: Lmao, fosik a malac 🥴
Before: Lmao, fosik a malac 


In [8]:
from string import punctuation, digits

#Remove numbers
for d in digits:
    comment_df["text"] = comment_df["text"].str.replace(d, "")

#Remove punctuations
for s in punctuation:
    comment_df["text"] = comment_df["text"].str.replace(s, "")

#Make lowercase
comment_df["text"] = comment_df["text"].str.lower()

#remove newline 
comment_df["text"] = comment_df["text"].str.replace("\n", " ")

In [9]:
# split every comment on whitespace
comment_df.text = comment_df.text.str.split(" ")

#remove empty string ''
notEmpty = lambda word: word != ""
comment_df.text = comment_df.text.map(lambda line: list(filter(notEmpty, line)))

In [10]:
comment_df.text

0                                    [minden, oké, otthon]
1        [ez, azért, elég, izzadtságszagú, azt, tekintv...
2                                     [téged, ki, bántott]
3                                    [még, nincs, szombat]
4        [ezazz, te, is, szereted, ha, megbasznak, szár...
                               ...                        
12945    [protip, for, life, a, bírósági, ítéleteket, m...
12946    [végül, a, hangfelvételen, a, dulakodás, is, c...
12947    [schiffer, és, srb, urak, kavarják, a, szart, ...
12948    [milyen, érdekes, hogy, ez, a, per, is, pont, ...
12949                                 [de, schiffer, fasz]
Name: text, Length: 12950, dtype: object

# Removing stopwords and stemming

In [13]:
import nltk
nltk.download("stopwords")
from nltk.corpus import stopwords
hu_stop = stopwords.words("hungarian")
print(hu_stop)

['a', 'ahogy', 'ahol', 'aki', 'akik', 'akkor', 'alatt', 'által', 'általában', 'amely', 'amelyek', 'amelyekben', 'amelyeket', 'amelyet', 'amelynek', 'ami', 'amit', 'amolyan', 'amíg', 'amikor', 'át', 'abban', 'ahhoz', 'annak', 'arra', 'arról', 'az', 'azok', 'azon', 'azt', 'azzal', 'azért', 'aztán', 'azután', 'azonban', 'bár', 'be', 'belül', 'benne', 'cikk', 'cikkek', 'cikkeket', 'csak', 'de', 'e', 'eddig', 'egész', 'egy', 'egyes', 'egyetlen', 'egyéb', 'egyik', 'egyre', 'ekkor', 'el', 'elég', 'ellen', 'elõ', 'elõször', 'elõtt', 'elsõ', 'én', 'éppen', 'ebben', 'ehhez', 'emilyen', 'ennek', 'erre', 'ez', 'ezt', 'ezek', 'ezen', 'ezzel', 'ezért', 'és', 'fel', 'felé', 'hanem', 'hiszen', 'hogy', 'hogyan', 'igen', 'így', 'illetve', 'ill.', 'ill', 'ilyen', 'ilyenkor', 'ison', 'ismét', 'itt', 'jó', 'jól', 'jobban', 'kell', 'kellett', 'keresztül', 'keressünk', 'ki', 'kívül', 'között', 'közül', 'legalább', 'lehet', 'lehetett', 'legyen', 'lenne', 'lenni', 'lesz', 'lett', 'maga', 'magát', 'majd', 'majd

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\balin\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [14]:
with open(r"..\data\misc\hu_stopwords.txt", "r",  encoding='utf-8') as f:
    additional_stopwords = f.readlines()

additional_stopwords = [ word.replace("\n","") for word in additional_stopwords ]
hu_stop += additional_stopwords
hu_stop += ["kb", "es", "", "  "]
hu_stop = list(set(hu_stop))

In [16]:
#filtering stopwords
def filterStopword(word):
    if word in hu_stop:
        return False
    else:
        return True

#filtered_text = list(filter(filterStopword, comment_words))
comment_df.text = comment_df.text.map(lambda line: list(filter(filterStopword, line)))

In [18]:
cntr = collections.Counter( flatten(comment_df.text) )
ws, cnt = zip( *cntr.most_common() )

fig = go.Figure([go.Bar(x=ws[:25], y=cnt[:25], marker_color = 'rgb(17,165,121)')])
fig.update_yaxes(type="log")

fig.update_traces(marker_line_color='rgb(17,165,121)',
                  marker_line_width=0.5, opacity=0.6)

fig.update_layout(title="Barchart of the most frequent words",
                  yaxis_title="word frequency",)

fig.show()

In [15]:
"""comment_df["text"] = comment_df.text.map( lambda word: " ".join(list(filter(filterStopword, word.split(" ")))))
comment_df.text.head()"""

0                                           oké otthon
1    izzadtságszagú tekintve dobrev jelöltsége pont...
2                                              bántott
3                                              szombat
4    ezazz szereted megbasznak szárazon köpnek fize...
Name: text, dtype: object

In [29]:
notBlank = lambda w: len(w) > 0

comment_df.text.map(lambda line: list(filter(notBlank, [word for word in line.split(" ")] )))



0                                            [oké, otthon]
1        [izzadtságszagú, tekintve, dobrev, jelöltsége,...
2                                                [bántott]
3                                                [szombat]
4        [ezazz, szereted, megbasznak, szárazon, köpnek...
                               ...                        
12945    [protip, for, life, bírósági, ítéleteket, magu...
12946    [hangfelvételen, dulakodás, hamisítvány, szegé...
12947    [schiffer, srb, kavarják, szart, rendesen, „lá...
12948    [érdekes, per, pont, tudott, befejeződni, hóna...
12949                                     [schiffer, fasz]
Name: text, Length: 12950, dtype: object

In [20]:
from nltk.stem import SnowballStemmer

#Stemming tweets
snow = SnowballStemmer('hungarian',ignore_stopwords=True)

comment_df.text = comment_df.text.map(lambda line: " ".join([snow.stem(word) for word in line]))

comment_df.to_excel("..\data\cleaned_swed_stemmed.xlsx")

'lma fos malac 🥴'