# Training Logistic Regression Classifier

## Dependencies

In [1]:
import pandas as pd
import numpy as np

import ast # Needed to deserialize lists in CSV file
from collections import Counter # Needed for word frequency counting; Counter.total() requires Python 3.10+
import json # For vocabularies serialization

from tqdm import tqdm # Needed for the progress indicators
tqdm.pandas(desc='Progress: ')

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

import joblib # For classifiers serialization

from math import log # For log loss score calculation

## Data Prep

### Read In Data

In [2]:
# Load the pre-processed training set. The stem lists are stored as strings in the CSV,
# so they are parsed back into python lists while reading.
input_df = pd.read_csv(
    "../input/feedback-prize-effectiveness/train_preproc_v4.csv",
    converters={"discourse_stemmed": ast.literal_eval},
)

# Only these three columns are needed for the training
training_columns = ["discourse_type", "discourse_effectiveness", "discourse_stemmed"]
input_df = input_df[training_columns]
input_df.head()

Unnamed: 0,discourse_type,discourse_effectiveness,discourse_stemmed
0,Lead,Adequate,"[hi, i, isaac, i, go, write, about, natur, lan..."
1,Position,Adequate,"[perspect, i, think, natur, landform, i, dont,..."
2,Claim,Adequate,"[i, think, natur, landform, life, we, discov, ..."
3,Evidence,Adequate,"[if, life, we, would, i, think, natur, landfor..."
4,Counterclaim,Adequate,"[thought, form, alien, thought, life]"


In [3]:
# Count the learning samples for every (discourse type, effectiveness) combination
sample_counts = (
    input_df
    .groupby(["discourse_type", "discourse_effectiveness"])
    .count()
    .rename(columns={"discourse_stemmed": "count"})  # the only remaining column holds the counts
    .reset_index()  # turn the group keys back into ordinary columns
)
sample_counts

Unnamed: 0,discourse_type,discourse_effectiveness,count
0,Claim,Adequate,7097
1,Claim,Effective,3405
2,Claim,Ineffective,1475
3,Concluding Statement,Adequate,1945
4,Concluding Statement,Effective,825
5,Concluding Statement,Ineffective,581
6,Counterclaim,Adequate,1150
7,Counterclaim,Effective,418
8,Counterclaim,Ineffective,205
9,Evidence,Adequate,6064


In [4]:
# Turn the counts table above into plain python dictionaries
sample_counts_by_type = {}   # discourse type -> effectiveness -> count
sample_counts_typeless = {}  # effectiveness -> count summed over all discourse types

for discourse_type, discourse_effectiveness, count in sample_counts.itertuples(index=False):
    # Create the per-type dictionary on first sight of the type, then store the count in it
    sample_counts_by_type.setdefault(discourse_type, {})[discourse_effectiveness] = count
    # Running total per effectiveness, starting from 0
    sample_counts_typeless[discourse_effectiveness] = sample_counts_typeless.get(discourse_effectiveness, 0) + count

print(sample_counts_typeless)
print(sample_counts_by_type)

{'Adequate': 20977, 'Effective': 9326, 'Ineffective': 6462}
{'Claim': {'Adequate': 7097, 'Effective': 3405, 'Ineffective': 1475}, 'Concluding Statement': {'Adequate': 1945, 'Effective': 825, 'Ineffective': 581}, 'Counterclaim': {'Adequate': 1150, 'Effective': 418, 'Ineffective': 205}, 'Evidence': {'Adequate': 6064, 'Effective': 2885, 'Ineffective': 3156}, 'Lead': {'Adequate': 1244, 'Effective': 683, 'Ineffective': 364}, 'Position': {'Adequate': 2784, 'Effective': 770, 'Ineffective': 470}, 'Rebuttal': {'Adequate': 693, 'Effective': 340, 'Ineffective': 211}}


### Prepare Representative (Sample-Balanced) Data Frames

In [5]:
# The number of training samples differs a lot between discourse types and effectiveness labels,
# so next to the raw data we also prepare down-sampled (balanced) variants of it.

feature_columns = ["discourse_type", "discourse_stemmed"]
label_column = "discourse_effectiveness"

# --- 1. Unbalanced samples -------------------------------------------------------------------
X_unbalanced = input_df[feature_columns]
y_unbalanced = input_df[label_column]

# --- 2. Samples balanced by effectiveness ----------------------------------------------------

# One frame per effectiveness label
adequate_df = input_df[input_df[label_column] == "Adequate"]
effective_df = input_df[input_df[label_column] == "Effective"]
ineffective_df = input_df[input_df[label_column] == "Ineffective"]

# The smallest class decides how many samples every class may keep
min_samples = min(len(adequate_df), len(effective_df), len(ineffective_df))
adequate_df_sampled = adequate_df.sample(n=min_samples, random_state=32167)
effective_df_sampled = effective_df.sample(n=min_samples, random_state=32167)
ineffective_df_sampled = ineffective_df.sample(n=min_samples, random_state=32167)

# Stack the three equally sized samples and shuffle the rows
effectiveness_balanced_df = pd.concat(
    [adequate_df_sampled, effective_df_sampled, ineffective_df_sampled]
).sample(frac=1, random_state=32167)
X_effectiveness_balanced = effectiveness_balanced_df[feature_columns]
y_effectiveness_balanced = effectiveness_balanced_df[label_column]

# --- 3. Samples split by type: unbalanced by effectiveness, and balanced by effectiveness -----
discourse_types = ["Claim", "Concluding Statement", "Counterclaim", "Evidence", "Lead", "Position", "Rebuttal"]

unbalance_type_split_dfs = {}
X_unbalanced_type_split, y_unbalanced_type_split = {}, {}

type_effectiveness_balanced_dfs = {}
X_type_and_effectivess_balanced, y_type_and_effectivess_balanced = {}, {}

for dt in discourse_types:  # For every discourse type

    # Rows of this discourse type, one frame per effectiveness label
    by_label = {
        "Adequate": adequate_df[adequate_df["discourse_type"] == dt],
        "Effective": effective_df[effective_df["discourse_type"] == dt],
        "Ineffective": ineffective_df[ineffective_df["discourse_type"] == dt],
    }

    # Again the smallest class of this type decides the balanced sample size
    min_samples = min(len(part) for part in by_label.values())
    by_label_sampled = {
        label: part.sample(n=min_samples, random_state=32167)
        for label, part in by_label.items()
    }

    # All rows of this type, shuffled
    unbalance_type_split_dfs[dt] = pd.concat(list(by_label.values())).sample(frac=1, random_state=32167)
    X_unbalanced_type_split[dt] = unbalance_type_split_dfs[dt][feature_columns]
    y_unbalanced_type_split[dt] = unbalance_type_split_dfs[dt][label_column]

    # Equally sized samples of this type, shuffled
    type_effectiveness_balanced_dfs[dt] = pd.concat(list(by_label_sampled.values())).sample(frac=1, random_state=32167)
    X_type_and_effectivess_balanced[dt] = type_effectiveness_balanced_dfs[dt][feature_columns]
    y_type_and_effectivess_balanced[dt] = type_effectiveness_balanced_dfs[dt][label_column]

# Outcomes of this cell:
#    X_unbalanced, y_unbalanced
#    X_unbalanced_type_split, y_unbalanced_type_split
#    X_effectiveness_balanced, y_effectiveness_balanced
#    X_type_and_effectivess_balanced, y_type_and_effectivess_balanced

### Vocabularies Preparation

In [6]:
"""
    Here we will prepare the vocabularies with word occurences counts from the previously prepared datasets,
    and convert their "discourse_stemmed" columns into feature vectors
"""
# One vocabulary (stem -> number of occurrences) per dataset variant; filled in by this and the next cells
vocabs = {"unbalanced": {}, "unbalanced_type_split": {}, "effectiveness_balanced": {}, "type_and_effectivess_balanced": {}}

unbalanced_df = pd.concat([X_unbalanced, y_unbalanced], axis=1) # Recover the input DF

# Collect all the stems of every discourse_effectiveness into one flat list ("all_words").
# Each group is flattened in a single pass instead of being aggregated with the builtin `sum`,
# which relies on repeated list concatenation and which newer pandas versions warn about
# when it is passed to agg().
unbalanced_df = (
    unbalanced_df
    .groupby("discourse_effectiveness")["discourse_stemmed"]
    .agg(lambda stem_lists: [stem for stems in stem_lists for stem in stems])
    .rename("all_words")
    .reset_index()
)

print("Total words in the unbalanced vocabulary: ")

# For every discourse_effectiveness
for discourse_effectiveness, all_words in unbalanced_df.itertuples(index=False):
    word_counts = Counter(all_words) # Count every stem once and reuse the result below
    vocabs["unbalanced"][discourse_effectiveness] = dict(word_counts)

    print(f"   - {discourse_effectiveness}: {word_counts.total()}")

Total words in the unbalanced vocabulary: 
   - Adequate: 344851
   - Effective: 287093
   - Ineffective: 150948


In [7]:
# Now we will do exactly the same thing, but for the effectiveness balanced dataset
effectiveness_balanced_df = pd.concat([X_effectiveness_balanced, y_effectiveness_balanced], axis=1) # Recover the input DF

# Collect all the stems of every discourse_effectiveness into one flat list ("all_words").
# Each group is flattened in a single pass instead of being aggregated with the builtin `sum`,
# which relies on repeated list concatenation and which newer pandas versions warn about
# when it is passed to agg().
effectiveness_balanced_df = (
    effectiveness_balanced_df
    .groupby("discourse_effectiveness")["discourse_stemmed"]
    .agg(lambda stem_lists: [stem for stems in stem_lists for stem in stems])
    .rename("all_words")
    .reset_index()
)

print("Total words in the effectiveness balanced vocabulary: ")

# For every discourse_effectiveness
for discourse_effectiveness, all_words in effectiveness_balanced_df.itertuples(index=False):
    word_counts = Counter(all_words) # Count every stem once and reuse the result below
    vocabs["effectiveness_balanced"][discourse_effectiveness] = dict(word_counts)

    print(f"   - {discourse_effectiveness}: {word_counts.total()}")

Total words in the effectiveness balanced vocabulary: 
   - Adequate: 108581
   - Effective: 197355
   - Ineffective: 150948


In [8]:
# The same thing, but for the effectiveness unbalanced type split dataset
for dt in discourse_types:

    # Recover the input DF
    effectivenss_unbalance_type_split_dfs = pd.concat([X_unbalanced_type_split[dt], y_unbalanced_type_split[dt]], axis=1)

    # Collect all the stems of every discourse_effectiveness into one flat list ("all_words").
    # Each group is flattened in a single pass instead of being aggregated with the builtin `sum`,
    # which relies on repeated list concatenation and which newer pandas versions warn about
    # when it is passed to agg().
    effectivenss_unbalance_type_split_dfs = (
        effectivenss_unbalance_type_split_dfs
        .groupby("discourse_effectiveness")["discourse_stemmed"]
        .agg(lambda stem_lists: [stem for stems in stem_lists for stem in stems])
        .rename("all_words")
        .reset_index()
    )

    print(f"Total words in the {dt} effectiveness unbalanced type split vocabulary: ")
    vocabs["unbalanced_type_split"][dt] = {}

    # For every discourse_effectiveness
    for discourse_effectiveness, all_words in effectivenss_unbalance_type_split_dfs.itertuples(index=False):
        word_counts = Counter(all_words) # Count every stem once and reuse the result below
        vocabs["unbalanced_type_split"][dt][discourse_effectiveness] = dict(word_counts)

        print(f"   - {discourse_effectiveness}: {word_counts.total()}")

Total words in the Claim effectiveness unbalanced type split vocabulary: 
   - Adequate: 58079
   - Effective: 33059
   - Ineffective: 12717
Total words in the Concluding Statement effectiveness unbalanced type split vocabulary: 
   - Adequate: 46120
   - Effective: 37532
   - Ineffective: 9586
Total words in the Counterclaim effectiveness unbalanced type split vocabulary: 
   - Adequate: 12638
   - Effective: 6279
   - Ineffective: 1969
Total words in the Evidence effectiveness unbalanced type split vocabulary: 
   - Adequate: 171021
   - Effective: 167051
   - Ineffective: 113522
Total words in the Lead effectiveness unbalanced type split vocabulary: 
   - Adequate: 25740
   - Effective: 26531
   - Ineffective: 6488
Total words in the Position effectiveness unbalanced type split vocabulary: 
   - Adequate: 23334
   - Effective: 9631
   - Ineffective: 4507
Total words in the Rebuttal effectiveness unbalanced type split vocabulary: 
   - Adequate: 7919
   - Effective: 7010
   - Ineffec

In [9]:
# And again, the same thing, but for the type and effectiveness balanced dataset
for dt in discourse_types:
    # Recover the input DF
    type_and_effectivess_balanced_df = pd.concat([X_type_and_effectivess_balanced[dt], y_type_and_effectivess_balanced[dt]], axis=1)

    # Collect all the stems of every discourse_effectiveness into one flat list ("all_words").
    # Each group is flattened in a single pass instead of being aggregated with the builtin `sum`,
    # which relies on repeated list concatenation and which newer pandas versions warn about
    # when it is passed to agg().
    type_and_effectivess_balanced_df = (
        type_and_effectivess_balanced_df
        .groupby("discourse_effectiveness")["discourse_stemmed"]
        .agg(lambda stem_lists: [stem for stems in stem_lists for stem in stems])
        .rename("all_words")
        .reset_index()
    )

    print(f"Total words in the {dt} effectiveness balanced vocabulary: ")
    vocabs["type_and_effectivess_balanced"][dt] = {}

    # For every discourse_effectiveness
    for discourse_effectiveness, all_words in type_and_effectivess_balanced_df.itertuples(index=False):
        word_counts = Counter(all_words) # Count every stem once and reuse the result below
        vocabs["type_and_effectivess_balanced"][dt][discourse_effectiveness] = dict(word_counts)

        print(f"   - {discourse_effectiveness}: {word_counts.total()}")

Total words in the Claim effectiveness balanced vocabulary: 
   - Adequate: 12158
   - Effective: 14163
   - Ineffective: 12717
Total words in the Concluding Statement effectiveness balanced vocabulary: 
   - Adequate: 13984
   - Effective: 26462
   - Ineffective: 9586
Total words in the Counterclaim effectiveness balanced vocabulary: 
   - Adequate: 2433
   - Effective: 3225
   - Ineffective: 1969
Total words in the Evidence effectiveness balanced vocabulary: 
   - Adequate: 82579
   - Effective: 167051
   - Ineffective: 104355
Total words in the Lead effectiveness balanced vocabulary: 
   - Adequate: 7573
   - Effective: 13979
   - Ineffective: 6488
Total words in the Position effectiveness balanced vocabulary: 
   - Adequate: 3913
   - Effective: 6012
   - Ineffective: 4507
Total words in the Rebuttal effectiveness balanced vocabulary: 
   - Adequate: 2348
   - Effective: 4301
   - Ineffective: 2159


In [10]:
# Persist the vocabularies to disk
vocab_path = './data/vocab_v2.json'
with open(vocab_path, mode='w', encoding='utf8') as vocab_file:
    json.dump(vocabs, vocab_file)

# Read them straight back, so that a serialization problem would show up right here
with open(vocab_path, mode='r', encoding='utf8') as vocab_file:
    vocabs = json.load(vocab_file)

### Feature Extraction

In [11]:
"""
    Next we need to convert "discourse_stemmed" columns into feature vector of the form:
        (1.0, ac, ec, ic), where ac, ec, ic  are sum of counts of every stem counts in the
        adequate, effective, ineffective vocabularies.
"""

# Work on a private copy: X_unbalanced is a column slice of input_df, and adding columns to it
# directly triggers pandas' SettingWithCopyWarning and modifies the frame other cells read from.
X_unbalanced_feat = X_unbalanced.copy()

X_unbalanced_feat["bias"] = 1.0
for feature_name, effectiveness in (("ac", "Adequate"), ("ec", "Effective"), ("ic", "Ineffective")):
    label_vocab = vocabs["unbalanced"][effectiveness]
    # Sum of the vocabulary counts of every stem of the discourse; unknown stems contribute 0
    X_unbalanced_feat[feature_name] = X_unbalanced_feat["discourse_stemmed"].progress_apply(
        lambda stems: sum(label_vocab.get(stem, 0) for stem in stems))

# NOTE(review): the vocabulary counts were collected over all rows, including the ones that end up
# in the validation split below, so validation scores may be optimistic - confirm this is acceptable.

# Now prepare numpy feature vectors and split the datasets into training and validation dataset
X_unbalanced_feat = X_unbalanced_feat[["bias", "ac", "ec", "ic"]]
X_unbalanced_feat, y_unbalanced_feat = X_unbalanced_feat.to_numpy(), y_unbalanced.to_numpy()

X_unbalanced_feat_train, X_unbalanced_feat_val, y_unbalanced_feat_train, y_unbalanced_feat_val = \
    train_test_split(X_unbalanced_feat, y_unbalanced_feat, test_size = 0.20, random_state = 32167)

X_unbalanced_feat_train.shape, X_unbalanced_feat_val.shape, y_unbalanced_feat_train.shape, y_unbalanced_feat_val.shape

Progress: 100%|██████████████████████████████████████████████████████████████| 36765/36765 [00:00<00:00, 198189.58it/s]
Progress: 100%|██████████████████████████████████████████████████████████████| 36765/36765 [00:00<00:00, 197040.86it/s]
Progress: 100%|██████████████████████████████████████████████████████████████| 36765/36765 [00:00<00:00, 198188.82it/s]


((29412, 4), (7353, 4), (29412,), (7353,))

In [12]:
# Now we will do the same, but for the dataset balanced by effectiveness

# Work on a private copy: X_effectiveness_balanced is a column slice of another frame, and adding
# columns to it directly triggers pandas' SettingWithCopyWarning and modifies the shared frame.
X_effectiveness_balanced_feat = X_effectiveness_balanced.copy()

X_effectiveness_balanced_feat["bias"] = 1.0
for feature_name, effectiveness in (("ac", "Adequate"), ("ec", "Effective"), ("ic", "Ineffective")):
    label_vocab = vocabs["effectiveness_balanced"][effectiveness]
    # Sum of the vocabulary counts of every stem of the discourse; unknown stems contribute 0
    X_effectiveness_balanced_feat[feature_name] = X_effectiveness_balanced_feat["discourse_stemmed"].progress_apply(
        lambda stems: sum(label_vocab.get(stem, 0) for stem in stems))

# NOTE(review): the vocabulary counts were collected over all rows, including the ones that end up
# in the validation split below, so validation scores may be optimistic - confirm this is acceptable.

# Now prepare numpy feature vectors and split the datasets into training and validation dataset
X_effectiveness_balanced_feat = X_effectiveness_balanced_feat[["bias", "ac", "ec", "ic"]]
X_effectiveness_balanced_feat, y_effectiveness_balanced_feat = X_effectiveness_balanced_feat.to_numpy(), y_effectiveness_balanced.to_numpy()

X_effectiveness_balanced_feat_train, X_effectiveness_balanced_feat_val, y_effectiveness_balanced_feat_train, y_effectiveness_balanced_feat_val = \
    train_test_split(X_effectiveness_balanced_feat, y_effectiveness_balanced_feat, test_size = 0.20, random_state = 32167)

X_effectiveness_balanced_feat_train.shape, X_effectiveness_balanced_feat_val.shape, y_effectiveness_balanced_feat_train.shape, y_effectiveness_balanced_feat_val.shape

Progress: 100%|██████████████████████████████████████████████████████████████| 19386/19386 [00:00<00:00, 164022.31it/s]
Progress: 100%|██████████████████████████████████████████████████████████████| 19386/19386 [00:00<00:00, 160591.99it/s]
Progress: 100%|██████████████████████████████████████████████████████████████| 19386/19386 [00:00<00:00, 159323.87it/s]


((15508, 4), (3878, 4), (15508,), (3878,))

In [13]:
# Now we will do the same, but for the dataset unbalanced by the discourse effectiveness and split by type
X_unbalanced_type_split_feat, y_unbalanced_type_split_feat = {}, {}
X_unbalanced_type_split_feat_train, X_unbalanced_type_split_feat_val = {}, {}
y_unbalanced_type_split_feat_train, y_unbalanced_type_split_feat_val = {}, {}

for dt in discourse_types:
    type_vocabs = vocabs["unbalanced_type_split"][dt]

    # Work on a private copy: the per-type frame is a column slice of another frame, and adding
    # columns to it directly triggers pandas' SettingWithCopyWarning and modifies the shared frame.
    type_features = X_unbalanced_type_split[dt].copy()

    type_features["bias"] = 1.0
    for feature_name, effectiveness in (("ac", "Adequate"), ("ec", "Effective"), ("ic", "Ineffective")):
        label_vocab = type_vocabs[effectiveness]
        # Sum of the vocabulary counts of every stem of the discourse; unknown stems contribute 0
        type_features[feature_name] = type_features["discourse_stemmed"].progress_apply(
            lambda stems: sum(label_vocab.get(stem, 0) for stem in stems))

    # NOTE(review): the vocabulary counts were collected over all rows of this type, including the
    # ones that end up in the validation split below - confirm this is acceptable.

    # Now prepare numpy feature vectors and split the datasets into training and validation dataset
    X_unbalanced_type_split_feat[dt] = type_features[["bias", "ac", "ec", "ic"]].to_numpy()
    y_unbalanced_type_split_feat[dt] = y_unbalanced_type_split[dt].to_numpy()

    X_unbalanced_type_split_feat_train[dt], X_unbalanced_type_split_feat_val[dt], y_unbalanced_type_split_feat_train[dt], y_unbalanced_type_split_feat_val[dt] = \
        train_test_split(X_unbalanced_type_split_feat[dt], y_unbalanced_type_split_feat[dt], test_size = 0.20, random_state = 32167)

    print(dt, X_unbalanced_type_split_feat_train[dt].shape, X_unbalanced_type_split_feat_val[dt].shape, y_unbalanced_type_split_feat_train[dt].shape, y_unbalanced_type_split_feat_val[dt].shape)

Progress: 100%|██████████████████████████████████████████████████████████████| 11977/11977 [00:00<00:00, 286031.72it/s]
Progress: 100%|██████████████████████████████████████████████████████████████| 11977/11977 [00:00<00:00, 292892.59it/s]
Progress: 100%|██████████████████████████████████████████████████████████████| 11977/11977 [00:00<00:00, 300210.83it/s]


Claim (9581, 4) (2396, 4) (9581,) (2396,)


Progress: 100%|████████████████████████████████████████████████████████████████| 3351/3351 [00:00<00:00, 129226.97it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 3351/3351 [00:00<00:00, 124437.69it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 3351/3351 [00:00<00:00, 120093.24it/s]


Concluding Statement (2680, 4) (671, 4) (2680,) (671,)


Progress: 100%|████████████████████████████████████████████████████████████████| 1773/1773 [00:00<00:00, 253970.19it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1773/1773 [00:00<00:00, 222217.27it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1773/1773 [00:00<00:00, 254109.04it/s]


Counterclaim (1418, 4) (355, 4) (1418,) (355,)


Progress: 100%|██████████████████████████████████████████████████████████████| 12105/12105 [00:00<00:00, 100952.32it/s]
Progress: 100%|███████████████████████████████████████████████████████████████| 12105/12105 [00:00<00:00, 99073.99it/s]
Progress: 100%|███████████████████████████████████████████████████████████████| 12105/12105 [00:00<00:00, 96327.94it/s]


Evidence (9684, 4) (2421, 4) (9684,) (2421,)


Progress: 100%|████████████████████████████████████████████████████████████████| 2291/2291 [00:00<00:00, 135260.13it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 2291/2291 [00:00<00:00, 135121.29it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 2291/2291 [00:00<00:00, 135011.18it/s]


Lead (1832, 4) (459, 4) (1832,) (459,)


Progress: 100%|████████████████████████████████████████████████████████████████| 4024/4024 [00:00<00:00, 288210.23it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 4024/4024 [00:00<00:00, 288195.47it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 4024/4024 [00:00<00:00, 268965.90it/s]


Position (3219, 4) (805, 4) (3219,) (805,)


Progress: 100%|████████████████████████████████████████████████████████████████| 1244/1244 [00:00<00:00, 207885.34it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1244/1244 [00:00<00:00, 206748.59it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1244/1244 [00:00<00:00, 207885.34it/s]

Rebuttal (995, 4) (249, 4) (995,) (249,)





In [14]:
# Finally we will do the same, but for the dataset balanced by the discourse type and effectiveness
X_type_and_effectivess_balanced_feat, y_type_and_effectivess_balanced_feat = {}, {}
X_type_and_effectivess_balanced_feat_train, X_type_and_effectivess_balanced_feat_val = {}, {}
y_type_and_effectivess_balanced_feat_train, y_type_and_effectivess_balanced_feat_val = {}, {}

for dt in discourse_types:
    type_vocabs = vocabs["type_and_effectivess_balanced"][dt]

    # Work on a private copy: the per-type frame is a column slice of another frame, and adding
    # columns to it directly triggers pandas' SettingWithCopyWarning and modifies the shared frame.
    type_features = X_type_and_effectivess_balanced[dt].copy()

    type_features["bias"] = 1.0
    for feature_name, effectiveness in (("ac", "Adequate"), ("ec", "Effective"), ("ic", "Ineffective")):
        label_vocab = type_vocabs[effectiveness]
        # Sum of the vocabulary counts of every stem of the discourse; unknown stems contribute 0
        type_features[feature_name] = type_features["discourse_stemmed"].progress_apply(
            lambda stems: sum(label_vocab.get(stem, 0) for stem in stems))

    # NOTE(review): the vocabulary counts were collected over all rows of this type, including the
    # ones that end up in the validation split below - confirm this is acceptable.

    # Now prepare numpy feature vectors and split the datasets into training and validation dataset
    X_type_and_effectivess_balanced_feat[dt] = type_features[["bias", "ac", "ec", "ic"]].to_numpy()
    y_type_and_effectivess_balanced_feat[dt] = y_type_and_effectivess_balanced[dt].to_numpy()

    X_type_and_effectivess_balanced_feat_train[dt], X_type_and_effectivess_balanced_feat_val[dt], y_type_and_effectivess_balanced_feat_train[dt], y_type_and_effectivess_balanced_feat_val[dt] = \
        train_test_split(X_type_and_effectivess_balanced_feat[dt], y_type_and_effectivess_balanced_feat[dt], test_size = 0.20, random_state = 32167)

    print(dt, X_type_and_effectivess_balanced_feat_train[dt].shape, X_type_and_effectivess_balanced_feat_val[dt].shape, y_type_and_effectivess_balanced_feat_train[dt].shape, y_type_and_effectivess_balanced_feat_val[dt].shape)

Progress: 100%|████████████████████████████████████████████████████████████████| 4425/4425 [00:00<00:00, 295792.48it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 4425/4425 [00:00<00:00, 277289.16it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 4425/4425 [00:00<00:00, 295806.63it/s]


Claim (3540, 4) (885, 4) (3540,) (885,)


Progress: 100%|████████████████████████████████████████████████████████████████| 1743/1743 [00:00<00:00, 116514.02it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1743/1743 [00:00<00:00, 124827.92it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1743/1743 [00:00<00:00, 116250.92it/s]


Concluding Statement (1394, 4) (349, 4) (1394,) (349,)


Progress: 100%|██████████████████████████████████████████████████████████████████| 615/615 [00:00<00:00, 206707.02it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████| 615/615 [00:00<00:00, 154156.28it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████| 615/615 [00:00<00:00, 205603.14it/s]


Counterclaim (492, 4) (123, 4) (492,) (123,)


Progress: 100%|█████████████████████████████████████████████████████████████████| 8655/8655 [00:00<00:00, 90396.06it/s]
Progress: 100%|█████████████████████████████████████████████████████████████████| 8655/8655 [00:00<00:00, 85055.33it/s]
Progress: 100%|█████████████████████████████████████████████████████████████████| 8655/8655 [00:00<00:00, 89466.38it/s]


Evidence (6924, 4) (1731, 4) (6924,) (1731,)


Progress: 100%|████████████████████████████████████████████████████████████████| 1092/1092 [00:00<00:00, 136310.82it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1092/1092 [00:00<00:00, 121645.06it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1092/1092 [00:00<00:00, 136872.96it/s]


Lead (873, 4) (219, 4) (873,) (219,)


Progress: 100%|████████████████████████████████████████████████████████████████| 1410/1410 [00:00<00:00, 282789.11it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1410/1410 [00:00<00:00, 235625.67it/s]
Progress: 100%|████████████████████████████████████████████████████████████████| 1410/1410 [00:00<00:00, 235635.06it/s]


Position (1128, 4) (282, 4) (1128,) (282,)


Progress: 100%|██████████████████████████████████████████████████████████████████| 633/633 [00:00<00:00, 211553.34it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████| 633/633 [00:00<00:00, 211553.34it/s]
Progress: 100%|██████████████████████████████████████████████████████████████████| 633/633 [00:00<00:00, 211587.06it/s]

Rebuttal (506, 4) (127, 4) (506,) (127,)





## Classifier Training

In [15]:
# Multinomial logistic regression fitted on the unbalanced training split
# NOTE(review): unlike the type split classifiers trained with max_iter = 10000, this one keeps
# the default iteration limit - confirm the solver converges.
unbalanced_clf = LogisticRegression(multi_class='multinomial', random_state=32167)
unbalanced_clf.fit(X_unbalanced_feat_train, y_unbalanced_feat_train)

# Accuracy measured on the very same training data
unbalanced_clf.score(X_unbalanced_feat_train, y_unbalanced_feat_train)

0.5368557051543588

In [16]:
# Multinomial logistic regression fitted on the effectiveness balanced training split
# NOTE(review): unlike the type split classifiers trained with max_iter = 10000, this one keeps
# the default iteration limit - confirm the solver converges.
effectiveness_balanced_clf = LogisticRegression(multi_class='multinomial', random_state=32167)
effectiveness_balanced_clf.fit(X_effectiveness_balanced_feat_train, y_effectiveness_balanced_feat_train)

# Accuracy measured on the very same training data
effectiveness_balanced_clf.score(X_effectiveness_balanced_feat_train, y_effectiveness_balanced_feat_train)

0.4913592984266185

In [17]:
# One multinomial logistic regression per discourse type, fitted on the
# effectiveness unbalanced, type split training data
effectiveness_unbalanced_type_split_clfs = {}

for dt in discourse_types:
    X_train_dt = X_unbalanced_type_split_feat_train[dt]
    y_train_dt = y_unbalanced_type_split_feat_train[dt]

    type_clf = LogisticRegression(multi_class='multinomial', max_iter=10000, random_state=32167)
    type_clf.fit(X_train_dt, y_train_dt)
    effectiveness_unbalanced_type_split_clfs[dt] = type_clf

    # Accuracy measured on the very same training data
    print(dt, "classifier accuracy:", type_clf.score(X_train_dt, y_train_dt))

Claim classifier accuracy: 0.618828932261768
Concluding Statement classifier accuracy: 0.6309701492537313
Counterclaim classifier accuracy: 0.6995768688293371
Evidence classifier accuracy: 0.4847170590665014
Lead classifier accuracy: 0.6124454148471615
Position classifier accuracy: 0.7104690897794346
Rebuttal classifier accuracy: 0.6150753768844222


In [18]:
# One multinomial logistic regression per discourse type, fitted on the
# effectiveness and type balanced training data
# NOTE(review): unlike the unbalanced type split classifiers trained with max_iter = 10000, these
# keep the default iteration limit - confirm the solver converges.
type_and_effectiveness_balanced_clfs = {}

for dt in discourse_types:
    X_train_dt = X_type_and_effectivess_balanced_feat_train[dt]
    y_train_dt = y_type_and_effectivess_balanced_feat_train[dt]

    type_clf = LogisticRegression(multi_class='multinomial', random_state=32167)
    type_clf.fit(X_train_dt, y_train_dt)
    type_and_effectiveness_balanced_clfs[dt] = type_clf

    # Accuracy measured on the very same training data
    print(dt, "classifier accuracy:", type_clf.score(X_train_dt, y_train_dt))

Claim classifier accuracy: 0.5152542372881356
Concluding Statement classifier accuracy: 0.6119081779053085
Counterclaim classifier accuracy: 0.6239837398373984
Evidence classifier accuracy: 0.4904679376083189
Lead classifier accuracy: 0.586483390607102
Position classifier accuracy: 0.5851063829787234
Rebuttal classifier accuracy: 0.6304347826086957


## Classifiers Serialization

In [19]:
# Bundle all trained classifiers into a single dict and save it to one file.
# NOTE(review): the key "type_and_effectivess_balanced" is misspelled; it is left
# unchanged here because it is part of the saved file's contents.
classifiers_path = './data/log_regression_classifiers_v2.joblib'
classifiers = {
    "unbalanced": unbalanced_clf,
    "effectiveness_balanced": effectiveness_balanced_clf,
    "unbalanced_type_split": effectiveness_unbalanced_type_split_clfs,
    "type_and_effectivess_balanced": type_and_effectiveness_balanced_clfs,
}

joblib.dump(classifiers, classifiers_path)

# Immediately load the bundle back from the file to make sure deserialization succeeds
classifiers = joblib.load(classifiers_path)

# From here on, the working names refer to the deserialized classifiers
unbalanced_clf = classifiers["unbalanced"]
effectiveness_balanced_clf = classifiers["effectiveness_balanced"]
effectiveness_unbalanced_type_split_clfs = classifiers["unbalanced_type_split"]
type_and_effectiveness_balanced_clfs = classifiers["type_and_effectivess_balanced"]

## Classifier Testing

In [20]:
# Class probabilities predicted by the unbalanced classifier for the unbalanced validation data
y_unbalanced_feat_pred = unbalanced_clf.predict_proba(X=X_unbalanced_feat_val)

# Accuracy of the unbalanced classifier on the unbalanced validation data (shown as the cell output)
unbalanced_clf.score(
    X=X_unbalanced_feat_val,
    y=y_unbalanced_feat_val,
)

0.5344757241942064

In [21]:
# Class probabilities predicted by the effectiveness balanced classifier
# for the effectiveness balanced validation data
y_effectiveness_balanced_feat_pred = effectiveness_balanced_clf.predict_proba(
    X=X_effectiveness_balanced_feat_val
)

# Also make predictions for the unbalanced validation data using the
# effectiveness balanced classifier
y_unbalanced_feat_pred_v2 = effectiveness_balanced_clf.predict_proba(
    X=X_unbalanced_feat_val
)

# Accuracy on the effectiveness balanced validation data (shown as the cell output)
effectiveness_balanced_clf.score(
    X=X_effectiveness_balanced_feat_val,
    y=y_effectiveness_balanced_feat_val,
)

0.502578648788035

In [22]:
# Accuracy of the effectiveness balanced classifier when it is evaluated
# on the unbalanced validation data instead of its own validation split
effectiveness_balanced_clf.score(
    X=X_unbalanced_feat_val,
    y=y_unbalanced_feat_val,
)

0.5813953488372093

In [23]:
# Predict class probabilities with each per-type classifier on the
# effectiveness unbalanced, type split validation data
y_effectiveness_unbalanced_type_split_feat_pred = {}

for dt in discourse_types:
    # Classifier and validation features for this discourse type only
    clf_dt = effectiveness_unbalanced_type_split_clfs[dt]
    X_val_dt = X_unbalanced_type_split_feat_val[dt]

    y_effectiveness_unbalanced_type_split_feat_pred[dt] = clf_dt.predict_proba(X_val_dt)

    # Report the accuracy on the effectiveness unbalanced, type split validation data
    print(dt, "classifier accuracy:", clf_dt.score(X_val_dt, y_unbalanced_type_split_feat_val[dt]))

Claim classifier accuracy: 0.6181135225375626
Concluding Statement classifier accuracy: 0.6199701937406855
Counterclaim classifier accuracy: 0.6985915492957746
Evidence classifier accuracy: 0.4861627426683189
Lead classifier accuracy: 0.6034858387799564
Position classifier accuracy: 0.715527950310559
Rebuttal classifier accuracy: 0.6546184738955824


In [24]:
# Predict class probabilities with each per-type classifier on the
# type and effectiveness balanced validation data
y_type_and_effectiveness_balanced_feat_pred = {}

for dt in discourse_types:
    # Classifier and validation features for this discourse type only
    clf_dt = type_and_effectiveness_balanced_clfs[dt]
    X_val_dt = X_type_and_effectivess_balanced_feat_val[dt]

    y_type_and_effectiveness_balanced_feat_pred[dt] = clf_dt.predict_proba(X_val_dt)

    # Report the accuracy on the type and effectiveness balanced validation data
    print(dt, "classifier accuracy:", clf_dt.score(X_val_dt, y_type_and_effectivess_balanced_feat_val[dt]))

Claim classifier accuracy: 0.5525423728813559
Concluding Statement classifier accuracy: 0.5587392550143266
Counterclaim classifier accuracy: 0.6422764227642277
Evidence classifier accuracy: 0.49451184286539573
Lead classifier accuracy: 0.5570776255707762
Position classifier accuracy: 0.599290780141844
Rebuttal classifier accuracy: 0.6614173228346457


## Kaggle Scoring

In [25]:
# Check the classification score according to logloss used by kaggle

# Calculates the log_loss
def log_loss(classes, y_pred, y_true, eps=1e-15):
    """Return the mean multi-class log loss of probabilistic predictions.

    For every sample, the probability predicted for its true class is looked
    up in the matching row of ``y_pred`` and its natural logarithm is
    accumulated; the result is the negated mean over all samples.

    Parameters
    ----------
    classes : sequence
        Class labels in the same order as the columns of ``y_pred``
        (e.g. the ``classes_`` attribute of the classifier that produced it).
    y_pred : sequence of sequences of float
        ``y_pred[i][j]`` is the probability predicted for sample ``i``
        belonging to ``classes[j]``.
    y_true : iterable
        True label of every sample, in the same order as the rows of
        ``y_pred``. It is read positionally, so a pandas Series with a
        non-default index is handled correctly.
    eps : float, optional
        Lower bound applied to a probability before taking its logarithm,
        so that a predicted probability of exactly 0 yields a large finite
        penalty instead of a math domain error.

    Returns
    -------
    float
        The mean negative log-probability assigned to the true classes.

    Raises
    ------
    ValueError
        If ``y_pred`` is empty, or if ``y_pred`` and ``y_true`` differ in length.

    Notes
    -----
    A true label that does not appear in ``classes`` contributes nothing to
    the sum (but still counts in the mean), as it always has.
    """
    # Materialize the labels so they are indexed by position, not by label
    # (plain y_true[i] on a pandas Series is a label-based lookup)
    true_labels = list(y_true)
    n_samples = len(y_pred)

    if n_samples == 0:
        raise ValueError("log_loss needs at least one prediction")
    if len(true_labels) != n_samples:
        raise ValueError(
            "y_pred and y_true must have the same length, got %d and %d"
            % (n_samples, len(true_labels))
        )

    total = 0.0
    for probabilities, true_label in zip(y_pred, true_labels):  # For every prediction
        for j, candidate in enumerate(classes):  # For every class
            if true_label == candidate:  # If this is the class we were supposed to predict
                # Clip from below only, so non-zero probabilities give unchanged results
                total += log(max(probabilities[j], eps))

    return -total / n_samples

# Log loss of the unbalanced classifier's predictions on the unbalanced validation data
log_loss(
    classes=unbalanced_clf.classes_,
    y_pred=y_unbalanced_feat_pred,
    y_true=y_unbalanced_feat_val,
)

1.0050887953386125

In [26]:
# Log loss of the effectiveness balanced classifier's predictions
# on the effectiveness balanced validation data
log_loss(
    classes=effectiveness_balanced_clf.classes_,
    y_pred=y_effectiveness_balanced_feat_pred,
    y_true=y_effectiveness_balanced_feat_val,
)

0.9992421308286923

In [27]:
# Log loss of the effectiveness balanced classifier's predictions
# on the unbalanced validation data
log_loss(
    classes=effectiveness_balanced_clf.classes_,
    y_pred=y_unbalanced_feat_pred_v2,
    y_true=y_unbalanced_feat_val,
)

2.6410932524055752

In [28]:
# Log loss of each effectiveness unbalanced, type split classifier, plus the
# overall log loss obtained by weighting every per-type value by its number of predictions

total_log_loss, total_predictions = 0, 0
for dt in discourse_types:
    pred_dt = y_effectiveness_unbalanced_type_split_feat_pred[dt]
    n_pred_dt = len(pred_dt)

    ll = log_loss(
        effectiveness_unbalanced_type_split_clfs[dt].classes_,
        pred_dt,
        y_unbalanced_type_split_feat_val[dt],
    )

    # ll is a per-sample mean, so scale it back up before summing across types
    total_log_loss += (ll * n_pred_dt)
    total_predictions += n_pred_dt
    print(dt, "Log Loss: ", ll)

print("Log loss accross all predictions: ", total_log_loss / total_predictions)

Claim Log Loss:  0.8677560164203595
Concluding Statement Log Loss:  0.8255081967781289
Counterclaim Log Loss:  0.7657091506063284
Evidence Log Loss:  1.0038510831357623
Lead Log Loss:  0.8638954601493577
Position Log Loss:  0.7231432980088076
Rebuttal Log Loss:  0.8179144209620549
Log loss accross all predictions:  0.8860153205605533


In [29]:
# Log loss of each effectiveness and type balanced classifier on the effectiveness and
# type balanced validation data, plus the overall log loss obtained by weighting every
# per-type value by its number of predictions

total_log_loss, total_predictions = 0, 0
for dt in discourse_types:
    pred_dt = y_type_and_effectiveness_balanced_feat_pred[dt]
    n_pred_dt = len(pred_dt)

    ll = log_loss(
        type_and_effectiveness_balanced_clfs[dt].classes_,
        pred_dt,
        y_type_and_effectivess_balanced_feat_val[dt],
    )

    # ll is a per-sample mean, so scale it back up before summing across types
    total_log_loss += (ll * n_pred_dt)
    total_predictions += n_pred_dt
    print(dt, "Log Loss: ", ll)

print("Log loss accross all predictions: ", total_log_loss / total_predictions)

Claim Log Loss:  0.9777727675222816
Concluding Statement Log Loss:  0.9138941093147707
Counterclaim Log Loss:  0.7559243775557885
Evidence Log Loss:  0.9702979163488463
Lead Log Loss:  0.9130097639329984
Position Log Loss:  0.9175974071069756
Rebuttal Log Loss:  0.837826481372549
Log loss accross all predictions:  0.9477819981935136


## Conclusions

* Results are pretty bad (around the statistics-based prediction baseline, although significantly better than guessing at random), which is not very surprising for a classifier this simple
* Accuracy and log loss don't agree on which training dataset produces the best classifiers, so we'll try submitting all of them to Kaggle