In [61]:
import os
import csv
import re

def preprocess_tweets(
    input_folder="original_tweets",
    output_folder="preprocessed_tweets/step1",
    file_name="indicator_tweets_push_factors_social_factors.csv",
    text_column="text",
):
    """Step 1: keep only the tweet text and drop rows ending in ``<url>,<xx>``.

    Reads ``input_folder/file_name`` as a CSV with a header row and writes a
    header-less, single-column CSV containing only the text of the kept rows.
    The output goes to ``output_folder`` (created if missing) under the input
    name with ``_preprocessed1`` inserted before the extension.

    A row is dropped when its text ends with an http(s) URL immediately
    followed by a comma and two word characters.
    NOTE(review): presumably this catches rows where a trailing language-code
    column leaked into the text field -- confirm against the raw data.

    Rows that are too short to contain ``text_column`` at all are skipped as
    well (they previously crashed the regex search with a ``TypeError``).

    Args:
        input_folder: Directory holding the raw CSV.
        output_folder: Directory the filtered CSV is written to.
        file_name: Name of the CSV file inside ``input_folder``.
        text_column: Header name of the column holding the tweet text.

    Raises:
        KeyError: If the CSV header does not contain ``text_column``.
        OSError: If the input file cannot be read or the output not written.
    """
    input_file_path = os.path.join(input_folder, file_name)
    stem, extension = os.path.splitext(file_name)
    output_file_path = os.path.join(output_folder, f"{stem}_preprocessed1{extension}")

    os.makedirs(output_folder, exist_ok=True)

    url_with_lang_pattern = re.compile(r"https?://\S+,\w{2}$")

    # newline='' on BOTH files: the csv module requires it so that line breaks
    # embedded in quoted tweet text are passed through untouched.
    with open(input_file_path, mode='r', encoding='utf-8', newline='') as infile, \
         open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:
        reader = csv.DictReader(infile)
        writer = csv.writer(outfile)

        # fieldnames is None for a completely empty file; that case simply
        # produces an empty output instead of an error.
        header = reader.fieldnames
        if header is not None and text_column not in header:
            raise KeyError(
                f"column {text_column!r} not found in {input_file_path}; "
                f"available columns: {header}"
            )

        for row in reader:
            text = row[text_column]

            # DictReader fills missing trailing fields with None.
            if text is None:
                continue

            if url_with_lang_pattern.search(text):
                continue

            writer.writerow([text])

# Run step 1 only when this code is executed directly (as a notebook cell or
# a script), not when the function is imported from another module.
if __name__ == "__main__":
    preprocess_tweets()


In [62]:
import os
import csv
import re

def preprocess_tweets_step2(
    input_folder="preprocessed_tweets/step1",
    output_folder="preprocessed_tweets/step2",
    file_name="indicator_tweets_push_factors_social_factors_preprocessed1.csv",
):
    """Step 2: remove every http(s) URL from the tweet text.

    Reads the header-less CSV ``input_folder/file_name``, takes the first
    column of each row, deletes all ``http://`` / ``https://`` URLs, strips
    leading and trailing whitespace, and writes the result as a single-column
    CSV to ``output_folder`` (created if missing). The output file name is
    ``file_name`` with ``_preprocessed1`` replaced by ``_preprocessed2``.

    Blank lines in the input are skipped. A tweet that consisted only of a
    URL is still written, as an empty string.

    Args:
        input_folder: Directory holding the step-1 output.
        output_folder: Directory the cleaned CSV is written to.
        file_name: Name of the CSV file inside ``input_folder``.

    Raises:
        ValueError: If input and output resolve to the same file, which would
            truncate the input before it is read.
        OSError: If the input file cannot be read or the output not written.
    """
    input_file_path = os.path.join(input_folder, file_name)
    output_file_name = file_name.replace("_preprocessed1", "_preprocessed2")
    output_file_path = os.path.join(output_folder, output_file_name)

    if os.path.abspath(input_file_path) == os.path.abspath(output_file_path):
        raise ValueError(
            f"input and output are the same file: {input_file_path}"
        )

    os.makedirs(output_folder, exist_ok=True)

    url_pattern = re.compile(r"https?://\S+")

    # newline='' on BOTH files: the csv module requires it so that line breaks
    # embedded in quoted tweet text are passed through untouched.
    with open(input_file_path, mode='r', encoding='utf-8', newline='') as infile, \
         open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            # csv.reader yields [] for a blank line; row[0] would raise.
            if not row:
                continue

            cleaned_text = url_pattern.sub("", row[0]).strip()

            writer.writerow([cleaned_text])

# Run step 2 only when this code is executed directly (as a notebook cell or
# a script), not when the function is imported from another module.
if __name__ == "__main__":
    preprocess_tweets_step2()


In [63]:
import os
import csv
import re
import emoji

def preprocess_tweets_step3(
    input_folder="preprocessed_tweets/step2",
    output_folder="preprocessed_tweets/step3",
    file_name="indicator_tweets_push_factors_social_factors_preprocessed2.csv",
    verbose=True,
):
    """Step 3: replace emojis in the tweet text with ``:name:`` tokens.

    Reads the header-less CSV ``input_folder/file_name``, passes the first
    column of each row through ``emoji.demojize`` with ``:`` as both
    delimiters, and writes the result as a single-column CSV to
    ``output_folder`` (created if missing). The output file name is
    ``file_name`` with ``_preprocessed2`` replaced by ``_preprocessed3``.

    Blank lines in the input are skipped.

    Args:
        input_folder: Directory holding the step-2 output.
        output_folder: Directory the converted CSV is written to.
        file_name: Name of the CSV file inside ``input_folder``.
        verbose: When true (the default, matching the previous behaviour),
            print the original and converted text of every row that changed.
            Pass ``False`` on large files to avoid flooding the cell output.

    Raises:
        ValueError: If input and output resolve to the same file, which would
            truncate the input before it is read.
        OSError: If the input file cannot be read or the output not written.
    """
    input_file_path = os.path.join(input_folder, file_name)
    output_file_name = file_name.replace("_preprocessed2", "_preprocessed3")
    output_file_path = os.path.join(output_folder, output_file_name)

    if os.path.abspath(input_file_path) == os.path.abspath(output_file_path):
        raise ValueError(
            f"input and output are the same file: {input_file_path}"
        )

    os.makedirs(output_folder, exist_ok=True)

    def convert_emojis_to_text(text):
        return emoji.demojize(text, delimiters=(":", ":"))

    # newline='' on BOTH files: the csv module requires it so that line breaks
    # embedded in quoted tweet text are passed through untouched.
    with open(input_file_path, mode='r', encoding='utf-8', newline='') as infile, \
         open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:
        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            # csv.reader yields [] for a blank line; row[0] would raise.
            if not row:
                continue

            text = row[0]
            converted_text = convert_emojis_to_text(text)

            if verbose and text != converted_text:
                print(f"Original: {text}")
                print(f"Converted: {converted_text}")

            writer.writerow([converted_text])

# Run step 3 only when this code is executed directly (as a notebook cell or
# a script), not when the function is imported from another module.
if __name__ == "__main__":
    preprocess_tweets_step3()


Original: <unk> and TGRT Main News 📌IBB investigative country agenda 📌Crime announcement. Who is the BDDK accusing of "damaging assets"? 34 years later. Historical developments on the border of hunger and poverty 📌Market discount announcement #how to be remembered
Converted: <unk> and TGRT Main News :pushpin:IBB investigative country agenda :pushpin:Crime announcement. Who is the BDDK accusing of "damaging assets"? 34 years later. Historical developments on the border of hunger and poverty :pushpin:Market discount announcement #how to be remembered
Original: They denounce the economic management of the Motril City Council (Granada) for lack of services 👉
Converted: They denounce the economic management of the Motril City Council (Granada) for lack of services :backhand_index_pointing_right:
Original: <unk> Pre-cooked at home calculator Tariff <unk>️Turkish-Labour hunger limit of 8 thousand 864 TL. The poverty line is 28 thousand 875 TL. Before entering the kitchen, the materials were c

In [73]:
import os
import csv
import ftfy
import unicodedata

def preprocess_tweets_step4(
    input_folder="preprocessed_tweets/step3",
    output_folder="preprocessed_tweets/step4",
    file_name="indicator_tweets_push_factors_social_factors_preprocessed3.csv",
    verbose=True,
):
    """Step 4: repair text encoding and normalise Unicode to NFC.

    Reads the header-less CSV ``input_folder/file_name``, passes the first
    column of each row through ``ftfy.fix_text``, applies Unicode NFC
    normalisation, strips leading and trailing whitespace, and writes the
    result as a single-column CSV to ``output_folder`` (created if missing).
    The output file name is ``file_name`` with ``_preprocessed3`` replaced by
    ``_preprocessed4``.

    Blank lines in the input are skipped.

    Args:
        input_folder: Directory holding the step-3 output.
        output_folder: Directory the cleaned CSV is written to.
        file_name: Name of the CSV file inside ``input_folder``.
        verbose: When true (the default, matching the previous behaviour),
            print the original and cleaned text of every row that changed.
            Pass ``False`` on large files to avoid flooding the cell output.

    Raises:
        ValueError: If input and output resolve to the same file, which would
            truncate the input before it is read.
        OSError: If the input file cannot be read or the output not written.
    """
    input_file_path = os.path.join(input_folder, file_name)
    output_file_name = file_name.replace("_preprocessed3", "_preprocessed4")
    output_file_path = os.path.join(output_folder, output_file_name)

    if os.path.abspath(input_file_path) == os.path.abspath(output_file_path):
        raise ValueError(
            f"input and output are the same file: {input_file_path}"
        )

    os.makedirs(output_folder, exist_ok=True)

    def fix_encoding(text):
        # ftfy first, then NFC so composed/decomposed forms compare equal.
        text = ftfy.fix_text(text)
        text = unicodedata.normalize('NFC', text)
        return text.strip()

    # newline='' on BOTH files: the csv module requires it so that line breaks
    # embedded in quoted tweet text are passed through untouched.
    with open(input_file_path, mode='r', encoding='utf-8', newline='') as infile, \
         open(output_file_path, mode='w', encoding='utf-8', newline='') as outfile:

        reader = csv.reader(infile)
        writer = csv.writer(outfile)

        for row in reader:
            # csv.reader yields [] for a blank line; row[0] would raise.
            if not row:
                continue

            original_text = row[0]
            cleaned_text = fix_encoding(original_text)

            if verbose and original_text != cleaned_text:
                print("Original:", original_text)
                print("Cleaned:", cleaned_text)

            writer.writerow([cleaned_text])

# Run step 4 only when this code is executed directly (as a notebook cell or
# a script), not when the function is imported from another module.
if __name__ == "__main__":
    preprocess_tweets_step4()


Original: Ukraine war is causing ‘famine in East Africa’
Cleaned: Ukraine war is causing 'famine in East Africa'
Original: Pragya Thakur says headscarves only worn by women ‘unsafe in homes’ amid #hijab row
Cleaned: Pragya Thakur says headscarves only worn by women 'unsafe in homes' amid #hijab row
Original: Dear Nadhim Zahawi, the Tories vowed to ‘eradicate illiteracy’ years ago. What went wrong? | Michael Rosen
Cleaned: Dear Nadhim Zahawi, the Tories vowed to 'eradicate illiteracy' years ago. What went wrong? | Michael Rosen
Original: In “Hungry Ghosts”, spectres of Trinidad’s past haunt the island
Cleaned: In "Hungry Ghosts", spectres of Trinidad's past haunt the island
Original: Poland’s poverty-busting pandemic - explained #InnovationActionChange
Cleaned: Poland's poverty-busting pandemic - explained #InnovationActionChange
Original: :cityscape: San Francisco is at the heart of America’s tech boom. 

Yet homelessness is rampant, drug use soaring and violent crime rising.

:studio_