In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import math


In [3]:
# Directory holding the dataset files; the value already ends with a slash.
data_folder = "dontpatronizeme_v1.4/"

# First-pass load of both tab-separated files. Lines with too many fields are
# skipped instead of raising (on_bad_lines='skip').
pcl_df = pd.read_table(f"{data_folder}/dontpatronizeme_pcl.tsv", on_bad_lines='skip')
cat_df = pd.read_table(f"{data_folder}/dontpatronizeme_categories.tsv", on_bad_lines='skip')

def load_and_preprocess_data(path, col_names, skip_rows=4, encoding="utf-8"):
    """Read a tab-separated text file into a DataFrame of string columns.

    The file is parsed by hand (one ``split('\\t')`` per line) rather than
    with a CSV parser, so quoting characters inside a field are kept verbatim.

    Parameters
    ----------
    path : str or path-like
        Location of the tab-separated file.
    col_names : list of str
        Column labels for the resulting DataFrame.
    skip_rows : int, default 4
        Number of leading lines to discard before the data starts. The
        default matches the four non-data lines this loader has always
        dropped.
    encoding : str, default "utf-8"
        Text encoding used to decode the file. Passing it explicitly keeps
        the result independent of the platform's locale encoding.

    Returns
    -------
    pandas.DataFrame
        One row per data line, indexed from 0. Every value is a string; rows
        with fewer fields than ``col_names`` are padded with missing values.

    Raises
    ------
    ValueError
        Raised by pandas when the parsed rows do not fit ``col_names``
        (e.g. a data line with more fields than columns).
    """
    total_lines = 0
    rows = []

    with open(path, encoding=encoding) as data:
        for line_no, line in enumerate(data):
            total_lines += 1
            # Preamble lines are dropped up front so their field count can
            # never clash with col_names when the frame is built.
            if line_no < skip_rows:
                continue
            # NOTE: strip() removes leading/trailing whitespace, tabs included,
            # so an empty first or last field is not preserved as a column.
            rows.append(line.strip().split('\t'))

    print('The original data contains ', total_lines, ' lines.')

    return pd.DataFrame(rows, columns=col_names)



In [8]:
pcl_cols = ["par_id", "art_id", "keyword", "country_code", "text", "label"]
pcl_df = load_and_preprocess_data(f"{data_folder}/dontpatronizeme_pcl.tsv", pcl_cols)

# The loader yields string fields, so the label must be cast before comparing.
pcl_df["label"] = pcl_df["label"].astype(int)
# Binary class: 1 (PCL) when label > 1, otherwise 0 (non-PCL).
# Vectorized comparison instead of a row-wise apply over the whole frame.
pcl_df["class"] = pcl_df["label"].gt(1).astype(int)

# Number of whitespace-separated tokens in each text.
# The .str accessor propagates missing text as NaN instead of raising.
print('Word Count:')
pcl_df["word_count"] = pcl_df["text"].str.split().str.len()
print(pcl_df["word_count"].describe())
# Number of characters in each text.
print('Character Count:')
pcl_df["char_count"] = pcl_df["text"].str.len()
pcl_df["char_count"].describe()




The original data contains  10473  lines.
Word Count:
Character Count:


count    10469.000000
mean       267.582864
std        160.209494
min          0.000000
25%        169.000000
50%        233.000000
75%        330.000000
max       5501.000000
Name: char_count, dtype: float64