# Imports

In [7]:
import numpy as np
import pandas as pd
import re
from tqdm import tqdm

# Data preprocessing

In [8]:
def replace_mentions(text):
    """
    Replaces twitter mentions in the given text by a constant tag.

    A mention is an '@', a space and a handle. The handle may contain underscores
    (optionally surrounded by spaces) or be followed by a space and a run of digits/dashes.

    Every match is substituted in place during a single pass over the text, so a short
    mention that is a prefix of a longer one (e.g. '@ bob' and '@ bobby') cannot
    corrupt the longer one.

    Returns the clean text.
    """
    MENTIONS_REPLACEMENT_TAG = "mention"
    MENTIONS_REGEXS = [
        r'(@ [\w\d-]+(( _)|(_ )|( _ )|_)+[\w\d-]+)', # something + _ + something
        r'(@ [\w\d-]+( [\d-]+))', # something + space + numbers
        r'(@ [\w\d-]+)', # something
        ]

    # alternatives are tried in the order listed, so the most specific pattern wins
    pattern = '|'.join(f'({regex})' for regex in MENTIONS_REGEXS)
    return re.sub(pattern, MENTIONS_REPLACEMENT_TAG, text)

In [9]:
def replace_hashtags(text):
    """
    Replaces twitter hashtags in the given text by a constant tag.

    A hashtag is a '#', a space and a word (letters, digits, '_' or '-'), optionally
    followed by space-separated groups of digits.

    Every match is substituted in place during a single pass over the text, so a short
    hashtag that is a prefix of a longer one (e.g. '# cat' and '# cats') cannot
    corrupt the longer one.

    Returns the clean text.
    """
    HASHTAGS_REPLACEMENT_TAG = "hashtag"
    HASHTAGS_REGEXS = [
        r'(# [\w\d_-]+( \d+)*)',
        r'(# [\w\d_-]+)', # already covered by the first pattern (its digit groups are optional)
        ]

    # alternatives are tried in the order listed
    pattern = '|'.join(f'({regex})' for regex in HASHTAGS_REGEXS)
    return re.sub(pattern, HASHTAGS_REPLACEMENT_TAG, text)

In [15]:
def load_data(data_files=None):
    """
    Loads the data in a pandas' dataframe.

    data_files : optional list of paths to tab-separated files without header row,
    read with latin-1 encoding. Defaults to './data/AMale.txt' and './data/AFemale.txt'.

    Returns the dataframe containing following columns : 'author_id' (int), 'label' (int) and 'content' (str).
    Rows keep the index they had in their own file, so index values can repeat across files.
    """
    if data_files is None:
        data_files = ['./data/AMale.txt', './data/AFemale.txt']

    # DataFrame.append was removed in pandas 2.0 : read every file, then concatenate once
    frames = [
        pd.read_csv(data_file, sep="\t", header=None, encoding='latin-1')
        for data_file in data_files
    ]
    data = pd.concat(frames)

    data = data.drop(columns=1) # drop profile's column (human/bot)
    data = data.rename(columns={0: 'author_id', 2: 'label', 3: 'content'})

    data['author_id'] = data['author_id'].apply(int) # ids should be integers
    # male:0, female:1 (any value other than 'M' is mapped to 1)
    data['label'] = data['label'].apply(lambda gender: 0 if gender == 'M' else 1)
    return data

In [16]:
def preprocess_data(data):
    """
    Runs the text cleaning steps on the 'content' column of the given dataframe.
    The dataframe must contain following columns :  'author_id' (int), 'label' (int) and 'content' (str).
    The 'content' column of the given dataframe is overwritten with the cleaned text.
    Returns the same dataframe.
    """
    # text preprocessing : mentions first, then hashtags
    cleaned = data['content'].apply(replace_mentions)
    cleaned = cleaned.apply(replace_hashtags)
    data['content'] = cleaned
    # emojis ?
    # concat tweets of same author ?

    return data

# Constants

In [17]:
# define constants here...

# Main

In [18]:
# load the raw tweets, then clean their text content
data = preprocess_data(load_data())