In [27]:
from pathlib import Path
import pandas as pd

import emoji
import preprocessor as p
import truecase

# Configure the tweet preprocessor (`p`) with the URL option only.
# replace_url() relies on this module-level setting when it calls p.tokenize();
# presumably other tweet elements (hashtags, mentions, ...) are left untouched.
p.set_options(p.OPT.URL)

# Preprocessing functions

In [28]:
def lowercasing(sentence):
    """Return the string form of `sentence` converted to lower case.

    Non-string inputs are coerced with `str()` first.
    """
    text = str(sentence)
    return text.lower()

def truecasing(sentence):
    """Alternative to `lowercasing()`.

    Instead of turning everything into lower case, the input is coerced to
    `str` and passed to `truecase.get_true_case`, so that the grammatically
    true case is used. The library's result is returned unchanged.
    """
    text = str(sentence)
    return truecase.get_true_case(text)

def rm_whitespace(sentence):
    """Return the string form of `sentence` without leading/trailing whitespace.

    Only the two ends are stripped; whitespace inside the text is kept.
    """
    text = str(sentence)
    return text.strip()

def replace_emoji(sentence):
    """Return `sentence` (coerced to `str`) after `emoji.demojize`.

    In the recorded example the thumbs-up emoji becomes the text
    `:thumbs_up:`.
    """
    text = str(sentence)
    return emoji.demojize(text)

def replace_url(sentence):
    """Return `sentence` (coerced to `str`) after `p.tokenize`.

    What gets tokenized depends on the options set on the preprocessor module
    `p`; this notebook sets the URL option only, and the recorded example
    shows the URL being turned into `$URL$`.
    """
    text = str(sentence)
    return p.tokenize(text)

In [29]:
def preprocessing(s):
    """Run the tweet-cleaning pipeline on `s` and return the cleaned text.

    The steps are applied in this order, each one receiving the previous
    step's output: truecasing, rm_whitespace, replace_emoji, replace_url.
    Truecasing is used here in place of the `lowercasing` alternative.

    NOTE(review): the recorded example output ends in 'Https: //$URL$', i.e.
    the URL scheme survives in front of the URL token. Presumably truecasing
    splits the URL before replace_url sees it - confirm, and consider whether
    the step order should change.
    """
    pipeline = (truecasing, rm_whitespace, replace_emoji, replace_url)
    for step in pipeline:
        s = step(s)
    return s


# Example: run the whole pipeline on a sample tweet that has surrounding
# whitespace, a hashtag, an emoji and a URL. The cell output is the cleaned text.
s = ' Preprocessor is #awesome 👍 https://github.com/s/preprocessor '
preprocessing(s)

'Preprocessor is #awesome :thumbs_up: Https: //$URL$'

In [30]:
def one_hot_encoding(df, column):
    """Replace a categorical column with integer class codes, in place.

    Despite the name, no one-hot (dummy) columns are added: every value of
    ``df[column]`` is replaced by the position of its category among the
    column's categories (sorted when the values are sortable). For non-missing
    values these are the same codes the earlier ``get_dummies`` + row-wise
    ``argmax`` implementation produced.

    Missing values are encoded as ``-1``. The earlier implementation mapped
    them to ``0``, which made them indistinguishable from the first category.

    Args:
        df: DataFrame holding the column; it is modified in place.
        column: Name of the column to encode.

    Returns:
        The same ``df`` object, with ``df[column]`` now holding int64 codes.
    """
    # One vectorised pass instead of building a dummy matrix and calling
    # argmax on every row from Python.
    codes = pd.Categorical(df[column]).codes
    # Categorical codes use the smallest integer dtype that fits; widen to
    # int64 so the column dtype matches what the argmax version returned.
    df[column] = codes.astype("int64")
    return df


# File cleaning

In [31]:
# Folder holding the raw CSV files. The individual file paths below are derived
# from it, so the location is defined in exactly one place.
datafolder = Path("./data/practise_data")
# Recursive listing of every CSV under the folder; as the recorded output
# shows, this also picks up files already written to the "cleaned" sub-folder.
print(list(datafolder.rglob('*.csv')))

train_f = datafolder / 'development_train.csv'
test_f = datafolder / 'development_test.csv'

[PosixPath('data/practise_data/development_test.csv'), PosixPath('data/practise_data/development_train.csv'), PosixPath('data/practise_data/cleaned/cleaned_development_train.csv'), PosixPath('data/practise_data/cleaned/cleaned_development_test.csv')]


In [32]:
def clean_df(df, f_path):
    """Clean one data split and write it next to its source file.

    A ``cleaned_tweet`` column is added by running ``preprocessing`` on every
    value of ``df["tweet"]``, and the four label columns are passed through
    ``one_hot_encoding`` one after another. The result is saved as
    ``<source folder>/cleaned/cleaned_<source file name>``; the folder is
    created when it does not exist yet.

    The frame passed in is modified (the new column is assigned on it), and
    ``to_csv`` is called with its defaults, so the index is written as well.

    NOTE(review): label codes are derived from the frame that is passed in, so
    two splits only get matching codes if they contain the same categories -
    confirm this holds for the train and test files.

    Args:
        df: DataFrame with at least the columns ``tweet``, ``gender``,
            ``profession``, ``ideology_binary`` and ``ideology_multiclass``.
        f_path: ``Path`` of the CSV file ``df`` was loaded from.

    Returns:
        ``Path`` of the CSV file that was written.
    """
    label_columns = ('gender', 'profession', 'ideology_binary', 'ideology_multiclass')

    df["cleaned_tweet"] = df["tweet"].apply(preprocessing)
    for label_column in label_columns:
        df = one_hot_encoding(df, label_column)

    cleaned_dir = f_path.parent / "cleaned"
    cleaned_dir.mkdir(parents=True, exist_ok=True)
    target = cleaned_dir / f"cleaned_{f_path.name}"
    df.to_csv(target)
    return target

In [33]:
# Load the raw train and test splits from the CSV paths defined in the
# previous cell.
train_df = pd.read_csv(train_f)
test_df = pd.read_csv(test_f)

In [34]:
# Clean both splits and write them to the "cleaned" sub-folder. clean_df()
# modifies the frames it is given. Only the last expression's value (the path
# of the cleaned test file) is shown as the cell output.
clean_df(train_df,train_f)
clean_df(test_df,test_f)

PosixPath('data/practise_data/cleaned/cleaned_development_test.csv')

In [35]:
# Split csv data into train and validation by category.
# First step: count the rows of every gender x profession x ideology combination.
from pathlib import Path
import pandas as pd

# Project-relative path (same form as the paths used in the earlier cells)
# instead of an absolute path that only exists on one machine.
# NOTE(review): assumes the notebook runs from the project root, as the earlier
# relative paths already do. Also, `fulldata` points at the *test* split -
# confirm this is the intended input.
fulldata = Path('data/practise_data/development_test.csv')
df = pd.read_csv(fulldata)


from itertools import combinations, product

gender  = ['female','male']
profession = ['politician','journalist','celebrity']
ideology_multiclass = ['left','right','moderate_left', 'moderate_right']

# product() walks the combinations in the same order as three nested loops
# (gender outermost, ideology innermost), so the printed lines keep their order.
for gender_value, profession_value, ideology_value in product(
    gender, profession, ideology_multiclass
):
    in_group = (
        (df["gender"] == gender_value)
        & (df["profession"] == profession_value)
        & (df["ideology_multiclass"] == ideology_value)
    )
    # Boolean-mask row selection; combinations without rows give an empty frame.
    selected_df = df.loc[in_group]
    print(f"{gender_value},{profession_value},{ideology_value}", len(selected_df))



female,politician,left 200
female,politician,right 80
female,politician,moderate_left 200
female,politician,moderate_right 200
female,journalist,left 120
female,journalist,right 40
female,journalist,moderate_left 280
female,journalist,moderate_right 120
female,celebrity,left 0
female,celebrity,right 0
female,celebrity,moderate_left 120
female,celebrity,moderate_right 0
male,politician,left 120
male,politician,right 120
male,politician,moderate_left 320
male,politician,moderate_right 160
male,journalist,left 200
male,journalist,right 200
male,journalist,moderate_left 480
male,journalist,moderate_right 600
male,celebrity,left 0
male,celebrity,right 0
male,celebrity,moderate_left 40
male,celebrity,moderate_right 0
