# Generate smaller Datasets

In [1]:
import pandas as pd
import seaborn as sns
from sklearn.model_selection import train_test_split

In [2]:
# Read the source CSV; the path is relative, so it resolves against the
# directory the notebook is run from.
df = pd.read_csv("../data/data.csv")
# Preview the first rows to check that the columns were parsed.
df.head()

Unnamed: 0,dataset,text,logical_fallacies,source
0,1,The World Coal Association disputed the conclu...,false_dilemma,https://www.nytimes.com/2018/10/07/climate/ipc...
1,1,Refusing to approve the document would place t...,appeal_to_emotion,https://www.nytimes.com/2018/10/07/climate/ipc...
2,1,At 3 6 degrees of warming the report predicts ...,faulty_generalization,https://www.nytimes.com/2018/10/07/climate/ipc...
3,1,Scribbler and Beckwith said the anomalies were...,faulty_generalization,https://www.independent.co.uk/news/science/cli...
4,1,Meanwhile Mr Beckwith confirmed the changes wo...,appeal_to_emotion,https://www.independent.co.uk/news/science/cli...


In [3]:
# Remove faulty generalization.
# .copy() detaches the filtered frame from the one that was read from disk, so
# the column assignment made further down (df['two_class_target'] = ...) writes
# to a real frame instead of a slice and does not raise SettingWithCopyWarning.
df = df[df["logical_fallacies"] != "faulty_generalization"].copy()
df

Unnamed: 0,dataset,text,logical_fallacies,source
0,1,The World Coal Association disputed the conclu...,false_dilemma,https://www.nytimes.com/2018/10/07/climate/ipc...
1,1,Refusing to approve the document would place t...,appeal_to_emotion,https://www.nytimes.com/2018/10/07/climate/ipc...
4,1,Meanwhile Mr Beckwith confirmed the changes wo...,appeal_to_emotion,https://www.independent.co.uk/news/science/cli...
5,1,We must declare a global climate emergency,appeal_to_emotion,https://www.independent.co.uk/news/science/cli...
6,1,Some plants and wildlife taking their thermal ...,appeal_to_emotion,https://www.npr.org/2018/07/23/630181622/sprin...
...,...,...,...,...
11735,9,I could call the heads of Wall Street the head...,slippery_slope,
11736,9,They d impose the Green New Deal which would c...,slippery_slope,
11737,9,that once you do this follow this course of ac...,slippery_slope,
11738,9,I don t believe that Hillary has the stamina,ad_hominem,


In [4]:
# Rows per label once the faulty_generalization rows have been filtered out.
df["logical_fallacies"].value_counts()

logical_fallacies
none                   5418
appeal_to_emotion      1619
ad_hominem             1062
false_dilemma           941
appeal_to_authority     726
slippery_slope          627
Name: count, dtype: int64

### Add binary fallacy column (fallacy vs no fallacy)

In [5]:
def binary_classification(x):
    """Collapse a fallacy label into the two-class target.

    Returns 'none' when ``x`` equals 'none'; any other value is reported as
    'fallacy'.
    """
    return 'none' if x == 'none' else 'fallacy'

In [6]:
# Derive the binary target by sending every label through binary_classification.
label_column = df['logical_fallacies']
df['two_class_target'] = label_column.map(binary_classification)
df.shape

(10393, 5)

### Keep a copy that still contains the validation rows (used later for the larger dataset)

In [7]:
# Independent copy taken before the validation rows are removed from df;
# it is the input of the "large" multiclass dataset further down.
df_large = df.copy()

### Validation set

In [8]:
def get_stratified_samples(df, default_n=250, n_per_class=None, random_state=42):
    """Draw a fixed-size random sample from every class in ``logical_fallacies``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``logical_fallacies`` column.
    default_n : int, default 250
        Number of rows drawn for every class without an explicit override.
    n_per_class : dict or None, default None
        Per-class sample sizes. ``None`` keeps the original setting of
        ``{"slippery_slope": 100}``.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for each class.

    Returns
    -------
    pandas.DataFrame
        The per-class samples stacked in order of first appearance of the
        class, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer rows than the
        requested sample size.
    """
    if n_per_class is None:
        n_per_class = {"slippery_slope": 100}

    # Collect the pieces first and concatenate once: growing a frame inside the
    # loop copies all previous rows on every iteration, and starting from an
    # empty DataFrame makes pandas emit a FutureWarning about empty entries.
    samples = [
        df[df["logical_fallacies"] == label].sample(
            n=n_per_class.get(label, default_n), random_state=random_state
        )
        for label in df["logical_fallacies"].unique()
    ]

    if not samples:
        # pd.concat([]) raises; an input without rows yields an empty frame.
        return df.iloc[0:0].reset_index(drop=True)
    return pd.concat(samples, ignore_index=True)

In [9]:
# Draw the per-class validation sample from df.
df_val = get_stratified_samples(df)

In [10]:
# Check how many rows each class contributes to the validation sample.
df_val["logical_fallacies"].value_counts()

logical_fallacies
false_dilemma          250
appeal_to_emotion      250
ad_hominem             250
none                   250
appeal_to_authority    250
slippery_slope         100
Name: count, dtype: int64

In [11]:
# Write the validation sample to CSV; index=False leaves the row index out of the file.
df_val.to_csv("../data/data_val.csv", index=False)

### Remove validation set from df

In [12]:
# Anti-join: keep only the rows of df that do not occur in df_val.
# The previous concat + drop_duplicates(keep=False) was not idempotent: running
# the cell a second time appended df_val to a df that no longer contained those
# rows, so nothing was a duplicate and the validation rows leaked back into df.
# It also dropped every copy of a row that was duplicated inside df itself.
# merge() matches missing values in key columns with each other, so rows whose
# source is NaN are still recognised as validation rows.
df = (
    df.merge(
        df_val.drop_duplicates(),  # unique keys so the left join cannot multiply rows
        how="left",
        on=list(df.columns),
        indicator=True,
    )
    .loc[lambda merged: merged["_merge"] == "left_only"]
    .drop(columns="_merge")
)
df.shape

(9043, 5)

We want to have:
- a binary dataset: fallacy vs. none
- a multiclass dataset (all fallacy classes, including slippery slope) with none
- a multiclass dataset (all fallacy classes, including slippery slope) without none
- a binary dataset: ad hominem vs. appeal to emotion

### Get balanced binary dataset

In [13]:
# Size of each binary class in the rows that remain after removing the validation sample.
df["two_class_target"].value_counts()

two_class_target
none       5168
fallacy    3875
Name: count, dtype: int64

In [14]:
def get_balanced_binary(df, n=None, random_state=42):
    """Sample the same number of rows from every ``two_class_target`` class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``two_class_target`` column.
    n : int or None, default None
        Rows to draw per class. ``None`` uses the size of the smallest class,
        so the result is as large as a balanced sample can be. This replaces
        the previously hard-coded 3875, which was the minority-class size of
        one particular run.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for each class.

    Returns
    -------
    pandas.DataFrame
        The per-class samples stacked in order of first appearance of the
        class, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when an explicit ``n`` exceeds the size
        of a class.
    """
    class_sizes = df["two_class_target"].value_counts()
    if class_sizes.empty:
        # Nothing to sample from; pd.concat([]) would raise.
        return df.iloc[0:0].reset_index(drop=True)
    if n is None:
        n = int(class_sizes.min())

    # Build all pieces, then concatenate once instead of growing a frame in the loop.
    parts = [
        df[df["two_class_target"] == label].sample(n=n, random_state=random_state)
        for label in df["two_class_target"].unique()
    ]
    return pd.concat(parts, ignore_index=True)

In [15]:
# Build the balanced binary dataset from the non-validation rows.
df_binary = get_balanced_binary(df)

In [16]:
# Confirm that both binary classes ended up with the same number of rows.
df_binary["two_class_target"].value_counts()

two_class_target
fallacy    3875
none       3875
Name: count, dtype: int64

In [17]:
# Write the balanced binary dataset to CSV without the row index.
df_binary.to_csv("../data/data_binary.csv", index=False)

### Get imbalanced binary dataset

In [18]:
def get_imbalanced_binary(df, n_fallacy=1000, n_none=None, random_state=42):
    """Build a binary dataset in which the 'none' class outweighs the other class.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``two_class_target`` column.
    n_fallacy : int, default 1000
        Rows to draw from every class other than 'none'.
    n_none : int or None, default None
        Rows to draw from the 'none' class. ``None`` takes all of them (in
        shuffled order). This replaces the previously hard-coded 5168, which
        was the size of the 'none' class in one particular run.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for each class.

    Returns
    -------
    pandas.DataFrame
        The per-class samples stacked in order of first appearance of the
        class, with a fresh 0..n-1 index.

    Raises
    ------
    ValueError
        Raised by ``DataFrame.sample`` when a class has fewer rows than requested.
    """
    parts = []
    for label in df["two_class_target"].unique():
        rows = df[df["two_class_target"] == label]
        if label == "none":
            n = len(rows) if n_none is None else n_none
        else:
            n = n_fallacy
        parts.append(rows.sample(n=n, random_state=random_state))

    if not parts:
        # pd.concat([]) raises; an input without rows yields an empty frame.
        return df.iloc[0:0].reset_index(drop=True)
    # Single concat instead of growing a frame inside the loop.
    return pd.concat(parts, ignore_index=True)

In [19]:
# Build the imbalanced binary dataset from the non-validation rows.
df_binary_imbalanced = get_imbalanced_binary(df)

In [20]:
# Check the resulting class ratio.
df_binary_imbalanced["two_class_target"].value_counts()

two_class_target
none       5168
fallacy    1000
Name: count, dtype: int64

In [21]:
# Write the imbalanced binary dataset to CSV without the row index.
df_binary_imbalanced.to_csv("../data/data_binary_imbalanced.csv", index=False)

### Multiclass with slippery slope and none

In [22]:
# Rows available per label; these counts bound what a balanced sample can contain.
df["logical_fallacies"].value_counts()

logical_fallacies
none                   5168
appeal_to_emotion      1369
ad_hominem              812
false_dilemma           691
slippery_slope          527
appeal_to_authority     476
Name: count, dtype: int64

In [23]:
# Pick fallacies randomly, but as balanced as possible
# Configuration parameters
TARGET_SIZE = 3200
CLASSES = df['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling: a class with fewer rows than the
# per-class target contributes all of its rows.
sampled_dfs = []
for class_name in CLASSES:
    class_df = df[df['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_df))
    sampled_dfs.append(class_df.sample(sample_size, random_state=42))

# Handle remaining samples
balanced_df = pd.concat(sampled_dfs)
remaining = TARGET_SIZE - len(balanced_df)

if remaining > 0:
    # Top up with rows that were not drawn yet (matched by index label, which
    # therefore has to be unique in df), excluding the 'none' class.
    extra_samples = df[~df.index.isin(balanced_df.index)]
    extra_samples = extra_samples[extra_samples["logical_fallacies"] != "none"]
    # Never request more rows than are left: sample() raises ValueError when
    # asked for more rows than the frame holds.
    n_extra = min(remaining, len(extra_samples))
    balanced_df = pd.concat([
        balanced_df,
        extra_samples.sample(n_extra, random_state=42)
    ])

# Final shuffle
balanced_df = balanced_df.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(balanced_df['logical_fallacies'].value_counts())

logical_fallacies
appeal_to_emotion      579
false_dilemma          543
ad_hominem             542
none                   533
slippery_slope         527
appeal_to_authority    476
Name: count, dtype: int64


In [24]:
# Write the multiclass dataset (including 'none') to CSV without the row index.
balanced_df.to_csv("../data/data_multiclass_with_none.csv", index=False)

### Multiclass with slippery slope and none, sampled from the copy that still contains the validation rows (larger dataset)

In [25]:
# Rows available per label in the copy that still contains the validation rows.
df_large["logical_fallacies"].value_counts()

logical_fallacies
none                   5418
appeal_to_emotion      1619
ad_hominem             1062
false_dilemma           941
appeal_to_authority     726
slippery_slope          627
Name: count, dtype: int64

In [26]:
# Pick fallacies randomly, but as balanced as possible
# Configuration parameters
TARGET_SIZE = 4500
CLASSES = df_large['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling: a class with fewer rows than the
# per-class target contributes all of its rows.
sampled_dfs = []
for class_name in CLASSES:
    class_df = df_large[df_large['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_df))
    sampled_dfs.append(class_df.sample(sample_size, random_state=42))

# Handle remaining samples
balanced_df_large = pd.concat(sampled_dfs)
remaining = TARGET_SIZE - len(balanced_df_large)

if remaining > 0:
    # Top up with rows that were not drawn yet (matched by index label, which
    # therefore has to be unique in df_large), excluding the 'none' class.
    extra_samples = df_large[~df_large.index.isin(balanced_df_large.index)]
    extra_samples = extra_samples[extra_samples["logical_fallacies"] != "none"]
    # Never request more rows than are left: sample() raises ValueError when
    # asked for more rows than the frame holds.
    n_extra = min(remaining, len(extra_samples))
    balanced_df_large = pd.concat([
        balanced_df_large,
        extra_samples.sample(n_extra, random_state=42)
    ])

# Final shuffle
balanced_df_large = balanced_df_large.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(balanced_df_large['logical_fallacies'].value_counts())

logical_fallacies
appeal_to_emotion      846
ad_hominem             778
false_dilemma          773
none                   750
appeal_to_authority    726
slippery_slope         627
Name: count, dtype: int64


In [27]:
# Write the larger multiclass dataset to CSV without the row index.
balanced_df_large.to_csv("../data/data_multiclass_with_none_large.csv", index=False)

### Multiclass with slippery slope without none

In [28]:
# Keep only the rows that carry an actual fallacy label.
is_fallacy_row = df['logical_fallacies'].ne('none')
df_without_none = df.loc[is_fallacy_row]

In [29]:
# Rows available per fallacy label once the 'none' rows are excluded.
df_without_none["logical_fallacies"].value_counts()

logical_fallacies
appeal_to_emotion      1369
ad_hominem              812
false_dilemma           691
slippery_slope          527
appeal_to_authority     476
Name: count, dtype: int64

In [30]:
# Pick fallacies randomly, but as balanced as possible
# Configuration parameters
TARGET_SIZE = 2500
CLASSES = df_without_none['logical_fallacies'].unique()

# Calculate target per class (integer division)
target_per_class = TARGET_SIZE // len(CLASSES)

# Stratified sampling with undersampling: a class with fewer rows than the
# per-class target contributes all of its rows.
sampled_dfs = []
for class_name in CLASSES:
    class_df = df_without_none[df_without_none['logical_fallacies'] == class_name]
    sample_size = min(target_per_class, len(class_df))
    sampled_dfs.append(class_df.sample(sample_size, random_state=42))

# Handle remaining samples.
# The intermediate result gets its own name: reusing `balanced_df` here would
# overwrite the with-none dataset, and re-running its to_csv cell afterwards
# would silently export the wrong rows.
sampled_without_none = pd.concat(sampled_dfs)
remaining = TARGET_SIZE - len(sampled_without_none)

if remaining > 0:
    # Top up with rows that were not drawn yet (matched by index label, which
    # therefore has to be unique). df_without_none holds no 'none' rows, so no
    # further label filter is needed.
    extra_samples = df_without_none[~df_without_none.index.isin(sampled_without_none.index)]
    # Never request more rows than are left: sample() raises ValueError when
    # asked for more rows than the frame holds.
    n_extra = min(remaining, len(extra_samples))
    sampled_without_none = pd.concat([
        sampled_without_none,
        extra_samples.sample(n_extra, random_state=42)
    ])

# Final shuffle
balanced_df_without_none = sampled_without_none.sample(frac=1, random_state=42).reset_index(drop=True)

# Verify distribution
print(balanced_df_without_none['logical_fallacies'].value_counts())

logical_fallacies
appeal_to_emotion      515
false_dilemma          505
ad_hominem             503
slippery_slope         501
appeal_to_authority    476
Name: count, dtype: int64


In [31]:
# Write the multiclass dataset without 'none' to CSV, leaving out the row index.
balanced_df_without_none.to_csv("../data/data_multiclass_without_none.csv", index=False)

### Binary dataset with ad hominem and appeal to emotion

In [32]:
def get_balanced_binary_new(df, classes=("appeal_to_emotion", "ad_hominem"), n=None, random_state=42):
    """Sample the same number of rows from each of the selected fallacy classes.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``logical_fallacies`` column.
    classes : sequence of str, default ("appeal_to_emotion", "ad_hominem")
        Labels to include, in output order.
    n : int or None, default None
        Rows to draw per class. ``None`` uses the size of the smallest selected
        class. This replaces the previously hard-coded 812, which was the
        ad_hominem count of one particular run.
    random_state : int, default 42
        Seed passed to ``DataFrame.sample`` for each class.

    Returns
    -------
    pandas.DataFrame
        The per-class samples stacked in the order of ``classes``, with a fresh
        0..n-1 index.

    Raises
    ------
    ValueError
        When one of the selected classes has no rows, or (from
        ``DataFrame.sample``) when an explicit ``n`` exceeds a class size.
    """
    subsets = {label: df[df["logical_fallacies"] == label] for label in classes}

    missing = [label for label, rows in subsets.items() if rows.empty]
    if missing:
        # Fail loudly instead of returning a dataset that lacks a class.
        raise ValueError(f"no rows found for classes: {missing}")

    if not subsets:
        # pd.concat([]) raises; an empty class selection yields an empty frame.
        return df.iloc[0:0].reset_index(drop=True)
    if n is None:
        n = min(len(rows) for rows in subsets.values())

    # Single concat instead of growing a frame inside the loop.
    return pd.concat(
        [rows.sample(n=n, random_state=random_state) for rows in subsets.values()],
        ignore_index=True,
    )

In [33]:
# Build the appeal_to_emotion vs. ad_hominem dataset from the non-validation rows.
df_binary_new = get_balanced_binary_new(df)

In [34]:
# Confirm that both classes have the same number of rows.
df_binary_new["logical_fallacies"].value_counts()

logical_fallacies
appeal_to_emotion    812
ad_hominem           812
Name: count, dtype: int64

In [35]:
# Write the two-fallacy dataset to CSV without the row index.
df_binary_new.to_csv("../data/data_binary_emotion_hominem.csv", index=False)

## Generate tiny dataset (100 rows)

In [36]:
# NOTE(review): this comment used to read "Slipery slope", but the sample below
# is drawn from all rows of df, not only from slippery_slope rows — confirm
# which of the two was intended.
number_of_rows = 100
RSEED = 42

# Draw number_of_rows random rows, seeded with RSEED, and write them to CSV
# without the row index.
df_tiny = df.sample(number_of_rows, axis=0, random_state=RSEED)
df_tiny.to_csv("../data/data_tiny.csv", index=False)