In [None]:
import os
import re
import copy

import pandas as pd
pd.options.display.max_colwidth = 150

from nltk.corpus import stopwords

In [None]:
def criar_diretorio(dirpath):
    """Create the directory ``dirpath``, including missing parents, if needed.

    Calling it for a directory that already exists is a no-op.

    Args:
        dirpath: Path of the directory to create.

    Raises:
        FileExistsError: If ``dirpath`` exists but is not a directory.
    """
    # exist_ok=True avoids the check-then-create race of testing
    # os.path.exists() first and only then calling os.makedirs().
    os.makedirs(dirpath, exist_ok=True)

In [None]:
# All notebook paths are resolved relative to the current working directory.
basedir = os.getcwd()

# Exports are read from data/super-mario-odyssey and every output is written
# to results/super-mario-odyssey.
data_basedir = os.path.join(basedir, "data", "super-mario-odyssey")
results_basedir = os.path.join(basedir, "results", "super-mario-odyssey")
# File name of the DL edge list; it is joined with results_basedir when written.
gephi_filename = "super-mario-odyssey.dl"

# Make sure both directories exist before any cell reads from or writes to them.
criar_diretorio(data_basedir)
criar_diretorio(results_basedir)

In [None]:
# Load every tab-separated export found in the data directory into one frame.
#
# * Names are sorted because os.listdir() returns them in arbitrary order,
#   which would make the row order of `export` differ between runs/machines.
# * Sub-directories and hidden entries (e.g. ".ipynb_checkpoints",
#   ".DS_Store") are skipped: they are not exports and would break read_csv.
files = sorted(
    name
    for name in os.listdir(data_basedir)
    if not name.startswith(".")
    and os.path.isfile(os.path.join(data_basedir, name))
)

if not files:
    # Fail with an actionable message instead of pd.concat's generic
    # "No objects to concatenate".
    raise ValueError("No export files found in " + data_basedir)

# One DataFrame per export file; the first line of each file is its header.
exports = [
    pd.read_csv(
        filepath_or_buffer=os.path.join(data_basedir, name),
        header=0,
        sep="\t",
        encoding="UTF-8",
    )
    for name in files
]

# Stack all exports into a single frame with a fresh 0..n-1 index.
export = pd.concat(exports, ignore_index=True)

In [None]:
# Persist the combined exports as a single tab-separated file in the results
# directory. `index` is documented as a boolean: pass False explicitly rather
# than relying on None being falsy to leave the row index out of the file.
export_filename = os.path.join(results_basedir, "super-mario-odyssey-final.tsv")
export.to_csv(path_or_buf=export_filename, sep="\t", encoding="UTF-8", index=False)

In [None]:
def remove_links(text):
    """Strip URLs and leftover ``http``/``https`` fragments from ``text``.

    Three kinds of links are removed: those starting with ``http://`` or
    ``https://``, those starting with ``www.``, and bare domains followed by a
    path such as ``bit.ly/abc``.

    Args:
        text: The string to clean.

    Returns:
        A new string with the links removed. The whitespace around a removed
        link is kept, so runs of spaces may remain.
    """
    # The bare-domain alternative must end directly in "/"; any whitespace
    # inside the pattern would be matched literally and never fit a real URL.
    url_regex = r'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))'

    # Put a space in front of every scheme so a link glued to the preceding
    # word ("wordhttps://...") starts at a word boundary and can be matched.
    spaced_text = re.sub("https|http", " https", text)

    text_without_urls = re.sub(url_regex, "", spaced_text)

    # Drop scheme fragments that were not part of a complete URL.
    return re.sub("https|http", "", text_without_urls)

In [None]:
def remove_numbers_special_characters(text):
    """Replace every character that is not an ASCII letter, ``@`` or ``#``.

    Each such character (digits, punctuation, whitespace, accented letters,
    ...) is substituted by a single space, so the length of the string is
    preserved.
    """
    disallowed = re.compile("[^a-zA-Z@#]")
    return disallowed.sub(" ", text)

In [None]:
def remove_empty_hashtags(words):
    """Return ``words`` without the tokens that are just a lone ``#``."""
    return list(filter(lambda token: token != "#", words))

In [None]:
def remove_rt(words):
    """Return ``words`` without the tokens that are exactly ``"rt"``.

    The comparison is case-sensitive and matches whole tokens only.
    """
    kept = []
    for token in words:
        if token != "rt":
            kept.append(token)
    return kept

In [None]:
def remove_mentions(words):
    """Return ``words`` without any token that contains an ``@`` character."""
    kept = []
    for token in words:
        # Keep the token only when no "@" occurs anywhere in it.
        if re.search("@", token) is None:
            kept.append(token)
    return kept

In [None]:
# Stop-word sets already loaded, keyed by language name. Loading once avoids
# re-evaluating stopwords.words() for every single word of every tweet.
_STOPWORDS_BY_LANGUAGE = {}


def remove_stopwords(words, language="english"):
    """Return ``words`` without the stop words of ``language``.

    Args:
        words: List of (hashable) tokens, normally lower-cased strings.
        language: Name of the stop-word list to use, passed to
            ``stopwords.words``. Defaults to ``"english"``.

    Returns:
        A new list with the tokens that are not stop words, in their original
        order.
    """
    if language not in _STOPWORDS_BY_LANGUAGE:
        # A frozenset gives constant-time membership tests instead of a linear
        # scan over the word list for each token.
        _STOPWORDS_BY_LANGUAGE[language] = frozenset(stopwords.words(language))

    blocked = _STOPWORDS_BY_LANGUAGE[language]
    return [word for word in words if word not in blocked]

In [None]:
def clean_tweets(tweets):
    """Turn raw tweet texts into lists of cleaned, lower-cased words.

    Each tweet goes through these steps, in order: link removal, replacement
    of everything but letters, ``@`` and ``#`` by spaces, lower-casing,
    splitting on whitespace, and removal of lone ``#`` tokens, ``rt`` tokens,
    mentions and stop words.

    Args:
        tweets: Iterable of tweet texts. Entries that are not strings (for
            example missing values) are treated as tweets without words.

    Returns:
        A list with one list of words per input entry, in input order.
    """
    cleaned = []

    for tweet in tweets:
        # Non-string entries have no text to clean; keep an empty placeholder
        # so the result stays aligned with the input instead of failing
        # inside the regex helpers.
        if not isinstance(tweet, str):
            cleaned.append([])
            continue

        # Remove links
        text = remove_links(tweet)

        # Keep only letters, @ and #
        text = remove_numbers_special_characters(text)

        # Lower-case and split into a list of words
        words = text.lower().split()

        # Remove all empty hashtags
        words = remove_empty_hashtags(words)

        # Remove the "rt" tokens
        words = remove_rt(words)

        # Remove all mentions
        words = remove_mentions(words)

        # Remove the stop words
        words = remove_stopwords(words)

        cleaned.append(words)

    return cleaned

In [None]:
# Clean every entry of the "description" column into one list of words per row
# (presumably this column holds the tweet text; confirm against the export
# schema).
tweets = clean_tweets(export["description"])

In [None]:
# Write the word co-occurrence network as a DL edge list: one line per pair of
# distinct words that appear together in a tweet.
filepath = os.path.join(results_basedir, gephi_filename)

# Every distinct word seen in any tweet (used by the summary cell).
unique_words_total = set()
# Words that take part in at least one edge, i.e. the nodes of the network.
node_labels = set()
# Per-tweet word lists with repeats removed, in first-occurrence order.
tweets_unique_words = []

for tweet in tweets:
    unique_words_total.update(tweet)

    # De-duplicate within the tweet: pairing a repeated word with itself
    # would create self-loops and duplicate edges. dict.fromkeys keeps the
    # first-occurrence order, so the output is deterministic (a set is not).
    unique_words = list(dict.fromkeys(tweet))
    tweets_unique_words.append(unique_words)

    # A tweet with a single distinct word contributes no edge and no node.
    if len(unique_words) > 1:
        node_labels.update(unique_words)

# The "n=" field of the DL header is the number of nodes; derive it from the
# data instead of hard-coding it.
header = "DL n={}\n".format(len(node_labels)) \
        + "format = edgelist1\n" \
        + "labels embedded:\n" \
        + "data:\n"

# The with-statement closes the file; no explicit close() is needed.
with open(filepath, "w", encoding="UTF-8") as file:
    file.write(header)

    for unique_words in tweets_unique_words:
        n_words = len(unique_words)
        # Emit each unordered pair (i < j) exactly once.
        for i in range(n_words):
            for j in range(i + 1, n_words):
                file.write(unique_words[i] + "\t" + unique_words[j] + "\n")

In [None]:
# The DL header written to the edge-list file occupies four lines; the summary
# subtracts them from the total to obtain the number of edges.
num_linhas_cabecalho = 4
num_linhas = None

# Total number of lines (header + edges) in the edge-list file.
with open(filepath, "r") as f:
    num_linhas = len(f.readlines())

In [None]:
# Summary of the cleaned corpus and of the exported network.
print("Número de tweets: ", len(tweets))
# A tweet is non-empty when at least one word survived the cleaning. Joining
# and re-splitting the words cannot be used for this: "".split(" ") yields
# [""], so that test would count every tweet.
print("Número de tweets não vazios: ", sum(1 for tweet in tweets if len(tweet) >= 1))
print("Número de palavras únicas: ", len(unique_words_total))
# Every line after the header of the edge-list file is one relation.
print("Número de relações entre palavras: ", num_linhas - num_linhas_cabecalho)