In [2]:
import json
import glob
import os
import re
import spacy
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

from itertools import combinations
from spacymoji import Emoji
from typing import Tuple


# Data locations. Paths are built by plain string concatenation throughout the
# notebook, so the directory keeps its trailing slash.
_data_path = "./Tikapi/data/"
video_ids_file = _data_path + "video_list.csv"

# Italian spaCy pipeline; the "emoji" component is added at the front of the
# pipeline (first=True) and provides the token._.is_emoji / token._.emoji_desc
# extensions used by the preprocessing class.
nlp = spacy.load("it_core_news_sm")
nlp.add_pipe("emoji", first=True)

  from .autonotebook import tqdm as notebook_tqdm


In [3]:
# Load the spreadsheet of videos; empty cells become "" so that the string
# operations and the == "" comparison below work on every row.
df = pd.read_csv(video_ids_file)
df = df.fillna("")

# --- create additional columns ---

# id: the 19-digit number that follows "video/" in the link (first match per row).
video_ids = df["Link"].str.extract(r"video/([0-9]{19})", expand=False)
rows_without_id = video_ids.isna()
if rows_without_id.any():
    # Fail with the offending rows instead of a bare IndexError from re.findall(...)[0].
    raise ValueError(
        "No 19-digit video id found in column 'Link' for row(s): "
        f"{list(df.index[rows_without_id])}"
    )
df["id"] = video_ids

# name: the politician if given, otherwise the influencer; spaces become underscores.
politician_names = df["Politician"].replace(" ", "_", regex=True)
influencer_names = df["Influencer/tiktoker"].replace(" ", "_", regex=True)
df["name"] = np.where(df["Politician"] == "", influencer_names, politician_names)

# file: path of the JSON file expected to hold this video's comments.
df["file"] = _data_path + df["name"] + "_com_" + df["id"] + ".json"

In [4]:
# load comments with meta data into dictionary (key = video_id)
dic = {}

for _, row in df.iterrows():
    # Videos whose comment file is not present on disk are skipped.
    if not os.path.isfile(row["file"]):
        continue

    # Read as UTF-8 explicitly: the comments contain emoji and accented characters,
    # and the platform default encoding is locale-dependent.
    with open(row["file"], encoding="utf-8") as infile:
        comments = json.load(infile)

    dic[row["id"]] = {"raw_comments": comments["comments"], "meta": comments["meta"]}

In [5]:
# One entry in `dic` per video whose comment file was found.
print(f"Currently we have comments for {len(dic)} videos.")

Currently we have comments for 145 videos.


# Define Preprocessing functions

In [7]:
import emoji

In [283]:
class preprocessing:
    """
    Helpers that turn raw video comments into cleaned text, word co-occurrence
    edges and comment tables.

    clean_text and demojize use the notebook-level spaCy pipeline ``nlp`` and the
    ``token._.is_emoji`` / ``token._.emoji_desc`` extensions registered on it.
    """

    # Every value accepted by demojize(method=...).
    _DEMOJIZE_METHODS = ("none", "remove", "simple", "simple_text", "text", "token")

    def __init__(self) -> None:
        pass

    def clean_text(self, text: str, search_words: list = None) -> list:
        """
        Cleans a string: drops stop words, punctuation, URLs, e-mail addresses and
        whitespace, lemmatizes the remaining tokens and replaces each emoji by a
        bracketed description such as [pouting_face].

        text: Input string like a sentence
        search_words: Words that were used to query for the input data and should
                      therefore be removed (compared against the token text, not the lemma)
        output: bag of words (list of strings)
        """
        # None instead of a mutable [] default; an empty tuple filters nothing.
        excluded = search_words if search_words is not None else ()

        bog = []

        for token in nlp(text):

            # filter unwanted tokens
            if (token.is_stop or token.is_punct or token.like_url
                    or token.like_email or token.is_space):
                continue

            if token.lemma_ in ("\n", " "):
                continue

            if str(token) in excluded:
                continue

            # demojize: use the emoji description instead of the lemma
            # (a local value is appended, so the spaCy token is not mutated)
            if token._.is_emoji:
                bog.append(f"[{token._.emoji_desc.replace(' ', '_')}]")
            else:
                bog.append(token.lemma_)

        return bog

    @staticmethod
    def _emoji_replacement(token, method: str) -> str:
        """Returns the space-padded text that replaces an emoji token for a substituting method."""
        if method == "simple":
            return " [emoji] "
        if method == "simple_text":
            return " emoji "

        description = token._.emoji_desc.replace(" ", "_")
        if method == "text":
            return f" {description} "
        # method == "token"
        return f" [{description}] "

    def demojize(self, text: str, method: str = "none") -> str:
        """
        Replaces or removes the emojis in a string, removes every other token that
        is neither ASCII nor alphabetic, and collapses runs of whitespace.

        text: Input string like a sentence
        method:

        - none: keep emojis unchanged (default)
        - remove: remove emojis
        - simple: replace each emoji with an [emoji] token
        - simple_text: replace each emoji with the word emoji
        - text: replace each emoji with the text describing it (spaces -> underscores)
        - token: like text but put [brackets] around the description

        output: string with emojis handled according to method
        raises: ValueError if method is not one of the values above
        """
        # Validate up front so that a wrong method is reported even for texts without emojis.
        if method not in self._DEMOJIZE_METHODS:
            raise ValueError('You did not select a correct demojization method!')

        for token in nlp(text):
            token_text = str(token)

            if token._.is_emoji:

                if method == "none":
                    continue

                if method != "remove":
                    # replace() substitutes every occurrence of this emoji at once;
                    # later tokens with the same emoji then find nothing left to replace.
                    text = text.replace(token_text, self._emoji_replacement(token, method))
                    continue

                # method == "remove": fall through to the removal below

            # keep ASCII tokens, and alphabetic non-ASCII tokens (Italian characters, e.g. è)
            if token.is_ascii or token.is_alpha:
                continue

            text = text.replace(token_text, "")

        text_single_spaces = " ".join(text.split())

        return text_single_spaces

    def create_comment_list(self, dic: dict, cleaning=True, search_words: list = None) -> list:
        """
        Creates a flat list of comments from a dictionary.

        dic: dictionary of videos; every value holds a "raw_comments" list whose
             entries have a "text" field
        cleaning: if True each comment is passed through self.clean_text and the
                  resulting words are joined with single spaces; if False the raw
                  text is used
        search_words: search words to pass to self.clean_text
        output: list of strings, one per comment
        """

        com_list = []

        for video in dic.values():
            for com in video["raw_comments"]:
                if cleaning:
                    # search_words is forwarded so the documented filter takes effect
                    words = self.clean_text(com["text"], search_words=search_words)
                    com_list.append(" ".join(words))
                else:
                    com_list.append(com["text"])

        return com_list

    def create_edge_dictionary(self, comments: list) -> dict:
        """
        Creates edges between two words that appear in the same comment. A weight is
        assigned according to the number of occurrences of an edge in the dataset.

        comments: list of comments where each comment is a whitespace-separated string
        output: dictionary with (word, word) tuples as keys and weights as values

        The tuple keeps the order in which the two words occur in the comment, so
        ("a", "b") and ("b", "a") are counted as separate edges.
        """

        weights = {}

        for com in comments:
            for edge in combinations(com.split(), 2):

                # skip self loops (the same word twice in one comment)
                if edge[0] != edge[1]:
                    weights[edge] = weights.get(edge, 0) + 1

        return weights

    def edges_to_dataframe(self, edges: dict) -> pd.DataFrame:
        """
        Creates data frame of edges from an edge dictionary (self.create_edge_dictionary).
        This can be saved to csv to use in gephi. In python better use the dictionary
        as it is much faster.

        edges: dictionary of edges ((source, target) -> weight)
        output: pd.DataFrame with columns source, target, weigth
        """
        # NOTE(review): the column name "weigth" is misspelled but kept so existing
        # consumers of this frame keep working; Gephi presumably expects "weight" - confirm.
        rows = [(edge[0], edge[1], weight) for edge, weight in edges.items()]

        # Build the frame once; concatenating row by row is quadratic in the number of edges.
        return pd.DataFrame(rows, columns=["source", "target", "weigth"])

    def create_comment_table(self, dic: dict, method: str = "none", video_table=None) -> pd.DataFrame:
        """
        Creates a table with one row per comment.

        dic: dictionary of videos (key = video id); every value holds a
             "raw_comments" list whose entries have a "text" field
        method: string defining the demojization method (see self.demojize)
        video_table: data frame with columns "id" and "name" used to look up the name
                     for each video id; defaults to the notebook-level spreadsheet `df`
        output: pandas dataframe with columns id, name and comments
        """
        if video_table is None:
            video_table = df

        records = []

        for key, video in dic.items():

            # Resolve exactly one name per video so that the columns stay aligned even
            # if the id occurs several times (or not at all) in the spreadsheet.
            matches = video_table.loc[video_table["id"] == key, "name"]
            name = matches.iloc[0] if len(matches) else ""

            for com in video["raw_comments"]:
                demojized_text = self.demojize(com["text"], method=method)
                records.append((key, name, demojized_text))

        df_out = pd.DataFrame(records, columns=["id", "name", "comments"])
        return df_out




In [271]:
# Sample comment for trying the preprocessing methods by hand: it mixes punctuation,
# repeated spaces, accented letters (è, Ö), two emojis and an @-mention.
test_str = "Questa, è  Ö  una bellissima prova 😻👍🏿 @GiorgiaMeloni"

# Create comment lists

In [285]:
# Create the instance of our preprocessing class; the cells below call its methods.
prepro = preprocessing()

In [54]:
# Flatten the comments of every loaded video in `dic` into one list of cleaned
# strings (cleaning is on by default, so each comment goes through prepro.clean_text).
# NOTE(review): the recorded run of this cell ended in a KeyboardInterrupt, so it is
# presumably slow on the full dataset.

com_list_all = prepro.create_comment_list(dic)

KeyboardInterrupt: 

In [64]:
# define a subdictionary via some filter criteria from the spreadsheet (e.g. name)
# Only ids that were actually loaded into `dic` are kept: spreadsheet rows without a
# comment file on disk have no entry there and would raise a KeyError.
subdic_meloni = {k: dic[k] for k in df.loc[df["name"]=="Meloni", "id"] if k in dic}
com_list_meloni = prepro.create_comment_list(subdic_meloni)

# Create & save edges

In [119]:
# Create weighted edges: two words are connected if they occur in the same comment,
# and the weight counts how often that word pair occurs across all comments.
edges_all = prepro.create_edge_dictionary(com_list_all)

In [153]:
# Convert the edge dictionary into a dataframe (one row per edge with its source,
# target and weight) so it can be saved to csv for gephi.
edges_all_df = prepro.edges_to_dataframe(edges_all)

In [155]:
# Save edges to csv. index=False keeps the row index out of the file (otherwise it is
# written as an unnamed extra first column), consistent with the Excel exports below.
edges_all_df.to_csv(_data_path + "edges_coms_all.csv", index=False)

# Create comment table for LIWC

In [279]:
# One comment table per demojization method, built in this order:
#   "token"  -> emoji replaced by its bracketed description
#   "text"   -> emoji replaced by its plain description
#   "simple" -> emoji replaced by the [emoji] token
(
    comment_table_with_tokens,
    comment_table_with_text,
    comment_table_with_emoji_token,
) = (
    prepro.create_comment_table(dic, method=demojize_method)
    for demojize_method in ("token", "text", "simple")
)

In [286]:
# Same table again, with every emoji replaced by the bare word "emoji" (method="simple_text").
comment_table_with_emoji_text = prepro.create_comment_table(dic, method="simple_text")

In [211]:
# Preview the table with bracketed emoji tokens. The original cell used `comment_table`,
# a name no cell in this notebook defines, so it failed on a fresh kernel.
comment_table_with_tokens.head()

Unnamed: 0,id,name,comments
0,7144986246215929094,Meloni,grazie adesso ho capito voto Conte
1,7144986246215929094,Meloni,conteeeeeeee [smiling_face_with_3_hearts] [smi...
2,7144986246215929094,Meloni,lei parla di lavoro mi dica oggi nella situazi...
3,7144986246215929094,Meloni,a sentire tutti qui doveva passare Conte [smir...
4,7144986246215929094,Meloni,bravissima [beaming_face_with_smiling_eyes]


In [280]:
# Raw text of the second comment of one video, for comparison with the demojized
# versions of the same comment printed in the following cells.
dic["7144986246215929094"]["raw_comments"][1]["text"]

'conteeeeeeee🥰🥰🥰🥰🥰🥰 Giorgia e come la mettiamo quelli delle case popolari 😡😡😡😡'

In [281]:
# Print the second comment of one video under each demojization method.
# .iloc[1] selects by position within the filtered rows; the previous label lookup
# (.comments[1]) only worked while this video's rows started at index label 0.
for table in (comment_table_with_text, comment_table_with_tokens, comment_table_with_emoji_token):
    print(table.loc[table["id"] == "7144986246215929094", "comments"].iloc[1])

conteeeeeeee smiling_face_with_3_hearts smiling_face_with_3_hearts smiling_face_with_3_hearts smiling_face_with_3_hearts smiling_face_with_3_hearts smiling_face_with_3_hearts Giorgia e come la mettiamo quelli delle case popolari pouting_face pouting_face pouting_face pouting_face
conteeeeeeee [smiling_face_with_3_hearts] [smiling_face_with_3_hearts] [smiling_face_with_3_hearts] [smiling_face_with_3_hearts] [smiling_face_with_3_hearts] [smiling_face_with_3_hearts] Giorgia e come la mettiamo quelli delle case popolari [pouting_face] [pouting_face] [pouting_face] [pouting_face]
conteeeeeeee [emoji] [emoji] [emoji] [emoji] [emoji] [emoji] Giorgia e come la mettiamo quelli delle case popolari [emoji] [emoji] [emoji] [emoji]


In [287]:
# Second comment of the same video with method="simple_text"; selected by position
# (.iloc[1]) within the filtered rows rather than by index label.
print(comment_table_with_emoji_text.loc[comment_table_with_emoji_text["id"] == "7144986246215929094", "comments"].iloc[1])

conteeeeeeee emoji emoji emoji emoji emoji emoji Giorgia e come la mettiamo quelli delle case popolari emoji emoji emoji emoji


In [282]:
# Write each comment table to its own Excel file in the data folder (no index column).
for table, file_name in (
    (comment_table_with_text, "comments_with_emoji_text.xlsx"),
    (comment_table_with_tokens, "comments_with_emoji_tokens.xlsx"),
    (comment_table_with_emoji_token, "comments_with_simple_emoji_token.xlsx"),
):
    table.to_excel(_data_path + file_name, index=False)

In [288]:
# Write the "simple_text" comment table to Excel as well (no index column).
comment_table_with_emoji_text.to_excel(_data_path + "comments_with_simple_emoji_text.xlsx", index=False)

<p style="text-align:center"> <i><b>Fin</b></i> </p>