# Logistic Regression Kaggle Submission Notebook
In this notebook we simply attempt to:
* Read in the test data from the input file
* Deserialize pre-trained vocabularies and classifiers
* Perform feature extraction on the test data using the vocabularies
* Make predictions on test data using the classifiers 
* Write it out into a test submission file

## Read in the test data

In [1]:
import pandas as pd

input_df = pd.read_csv("../input/feedback-prize-effectiveness/test.csv")
input_df.head()

# Data Validation
# Check for any missing values
assert input_df.isna().any().any() == False

# Check for any empty strings
assert input_df.eq("").any().any() == False

## Deserialize the pre-trained vocabularies and classifiers

In [2]:
import json # For vocabularies de-serialization

vocabs = None
with open('./data/vocab_v2.json', 'r', encoding ='utf8') as json_file:
    vocabs = json.load(json_file)
    
type(vocabs)

dict

In [3]:
import joblib # For classifiers de-serialization

classifiers = joblib.load('./data/log_regression_classifiers_v2.joblib')
type(classifiers)

dict

## Feature Extraction

In [4]:
import nltk # Natural Language Tool Kit
# nltk.download('punkt') # Needed for nltk.word_tokenize, hopefully already done on Kaggle
# nltk.download('stopwords') # Needed for nltk.corpus.stopwords, hopefully already done on Kaggle
from nltk.stem import PorterStemmer # Needed for stemming

# We start with lowercasing and tokenizing 'disourse_text'
input_df['discourse_tokenized'] = input_df['discourse_text'].transform(lambda d: nltk.word_tokenize(d.lower()))
input_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_tokenized
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,"[making, choices, in, life, can, be, very, dif..."
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,"[seeking, multiple, opinions, can, help, a, pe..."
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,"[it, can, decrease, stress, levels]"
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,"[a, great, chance, to, learn, something, new]"
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,"[can, be, very, helpful, and, beneficial, .]"


In [5]:
# Next we prepare the punctuation to be removed from the discource
import string
punctuation = set(string.punctuation)

print("Punctuation to be removed from the discourse text:\n", punctuation)

Punctuation to be removed from the discourse text:
 {']', '{', '$', '.', ':', '-', ';', '?', ',', '!', '[', '*', '@', '(', '^', '%', '&', '`', '>', '<', '=', '#', '}', '+', '\\', '/', ')', '_', '~', '"', "'", '|'}


In [6]:
# Now we will remove the punctuation from the sanitized discourse
input_df['discourse_no_punct'] = input_df['discourse_tokenized'].transform(lambda d: list(filter(lambda w: w not in punctuation, d)))
input_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_tokenized,discourse_no_punct
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,"[making, choices, in, life, can, be, very, dif...","[making, choices, in, life, can, be, very, dif..."
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,"[seeking, multiple, opinions, can, help, a, pe...","[seeking, multiple, opinions, can, help, a, pe..."
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,"[it, can, decrease, stress, levels]","[it, can, decrease, stress, levels]"
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,"[a, great, chance, to, learn, something, new]","[a, great, chance, to, learn, something, new]"
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,"[can, be, very, helpful, and, beneficial, .]","[can, be, very, helpful, and, beneficial]"


In [7]:
# We will not do spell-checking as it's too computationaly heavy

# Then we prepare the list of the stop words (meaningless words) to remove from the discourse
stopwords_english = set() # set(nltk.corpus.stopwords.words('english'))
stopwords_english = stopwords_english.union({'scientist', 'do', 'as', 'emotional', '``', 'at', 'and', 'than', 'this', 'get', 'into', 'face', 'us', 'day', 'having', 'others', 'your', "''", 'were', 'any', 'be', 'go', 'could', 'state', 'cause', 'know', 'out', 'every', 'that', 'there', 'have', 'voter', 'just', 'most', 'many', 'give', 'has', 'with', 'then', 'time', 'reason', 'also', 'why', 'an', 'want', 'mars', 'or', 'sport', 'a', 'elect', 'alien', 'people', 'after', 'system', 'is', 'summer', 'it', 'driverless', 'planet', 'when', 'all', 'not', 'by', 'they', 'how', 'on', 'our', 'what', 'doing', 'to', 'person', 'where', 'find', 'thing', 'getting', 'for', 'did', 'more', 'can', 'only', 'emotions', 'see', 'no', 'like', "'s", 'use', 'who', 'make', 'technological', 'lot', 'from', 'of', 'around', 'class', 'attend', 'better', 'same', 'importing', 'ability', "n't", 'vote', 'drive', 'now', 'home', 'someone', 'other', 'my', 'point', "'m", '...', 'up', 'complete', 'different', 'was', 'the', 'over', 'taking', 'grade', 'able', 'so', 'work', 'need', 'are', 'way', 'them', 'even', 'making', 'school', 'show', 'import', 'something', 'difference', 'am', 'because', 'scientists', 'one', 'technology', 'in', 'emotion', 'car', 'create', 'distance', 'take', 'driving', 'picture', 'picturing', 'needing', 'important', 'each'}) # Some of our own additions

print("Stop(meaningless) words to be removed from the discourse text:\n", stopwords_english)

Stop(meaningless) words to be removed from the discourse text:
 {'every', 'has', 'up', 'drive', 'distance', 'an', 'to', 'get', 'where', 'importing', 'mars', 'is', 'picture', 'work', 'planet', 'just', 'was', 'lot', 'picturing', 'show', 'and', 'ability', 'did', 'something', 'all', 'each', 'able', 'import', 'what', 'system', 'car', 'then', 'most', 'need', 'any', 'they', 'more', 'your', 'how', 'better', 'one', '...', 'the', 'different', 'emotion', 'it', 'find', 'taking', 'having', 'make', 'other', 'take', 'us', 'complete', 'over', 'be', 'create', 'elect', 'home', 'making', 'also', 'see', 'with', 'point', 'vote', 'at', 'that', 'thing', 'like', 'only', 'person', 'were', 'others', 'into', 'could', 'out', 'do', 'now', 'no', 'a', 'people', 'our', 'know', 'cause', 'someone', 'scientists', 'want', 'by', 'driving', 'can', 'technology', 'many', 'way', 'than', 'of', 'give', 'on', 'needing', 'why', 'same', 'day', 'state', 'class', 'grade', 'because', 'sport', 'important', 'in', "n't", '``', 'for', 'w

In [8]:
# Now we will remove the stop-words from the sanitized discourse
input_df['discourse_no_stopwords'] = input_df['discourse_no_punct'].transform(lambda d: list(filter(lambda w: w not in stopwords_english, d)))
input_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_tokenized,discourse_no_punct,discourse_no_stopwords
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,"[making, choices, in, life, can, be, very, dif...","[making, choices, in, life, can, be, very, dif...","[choices, life, very, difficult, often, ask, a..."
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,"[seeking, multiple, opinions, can, help, a, pe...","[seeking, multiple, opinions, can, help, a, pe...","[seeking, multiple, opinions, help, choice]"
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,"[it, can, decrease, stress, levels]","[it, can, decrease, stress, levels]","[decrease, stress, levels]"
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,"[a, great, chance, to, learn, something, new]","[a, great, chance, to, learn, something, new]","[great, chance, learn, new]"
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,"[can, be, very, helpful, and, beneficial, .]","[can, be, very, helpful, and, beneficial]","[very, helpful, beneficial]"


In [9]:
# Next we will perform the stemming of words in the sanitized discourse
stemmer = PorterStemmer()
input_df['discourse_stemmed'] = input_df['discourse_no_stopwords'].transform(lambda d: list(map(lambda w: stemmer.stem(w), d)))
input_df.head()

Unnamed: 0,discourse_id,essay_id,discourse_text,discourse_type,discourse_tokenized,discourse_no_punct,discourse_no_stopwords,discourse_stemmed
0,a261b6e14276,D72CB1C11673,Making choices in life can be very difficult. ...,Lead,"[making, choices, in, life, can, be, very, dif...","[making, choices, in, life, can, be, very, dif...","[choices, life, very, difficult, often, ask, a...","[choic, life, veri, difficult, often, ask, adv..."
1,5a88900e7dc1,D72CB1C11673,Seeking multiple opinions can help a person ma...,Position,"[seeking, multiple, opinions, can, help, a, pe...","[seeking, multiple, opinions, can, help, a, pe...","[seeking, multiple, opinions, help, choice]","[seek, multipl, opinion, help, choic]"
2,9790d835736b,D72CB1C11673,it can decrease stress levels,Claim,"[it, can, decrease, stress, levels]","[it, can, decrease, stress, levels]","[decrease, stress, levels]","[decreas, stress, level]"
3,75ce6d68b67b,D72CB1C11673,a great chance to learn something new,Claim,"[a, great, chance, to, learn, something, new]","[a, great, chance, to, learn, something, new]","[great, chance, learn, new]","[great, chanc, learn, new]"
4,93578d946723,D72CB1C11673,can be very helpful and beneficial.,Claim,"[can, be, very, helpful, and, beneficial, .]","[can, be, very, helpful, and, beneficial]","[very, helpful, beneficial]","[veri, help, benefici]"


In [10]:
# We will only retain the "discourse_id", "discourse_type", "discourse_stemmed" columns for the prediction making
input_df = input_df[["discourse_id", "discourse_type", "discourse_stemmed"]]
input_df.head()

Unnamed: 0,discourse_id,discourse_type,discourse_stemmed
0,a261b6e14276,Lead,"[choic, life, veri, difficult, often, ask, adv..."
1,5a88900e7dc1,Position,"[seek, multipl, opinion, help, choic]"
2,9790d835736b,Claim,"[decreas, stress, level]"
3,75ce6d68b67b,Claim,"[great, chanc, learn, new]"
4,93578d946723,Claim,"[veri, help, benefici]"


In [11]:
# Feature extraction, we will use effectiveness unbalanced and type split data collected vocabulary
X_unbalanced_type_split_feat = []

for row in input_df.itertuples():
    dt, discourse_stemmed = row[2], row[3]
    X_unbalanced_type_split_feat.append([dt, 1.0, \
                sum([(vocabs["unbalanced_type_split"][dt]["Adequate"][w] if w in vocabs["unbalanced_type_split"][dt]["Adequate"] else 0) for w in discourse_stemmed]), \
                sum([(vocabs["unbalanced_type_split"][dt]["Effective"][w] if w in vocabs["unbalanced_type_split"][dt]["Effective"] else 0) for w in discourse_stemmed]), \
                sum([(vocabs["unbalanced_type_split"][dt]["Ineffective"][w] if w in vocabs["unbalanced_type_split"][dt]["Ineffective"] else 0) for w in discourse_stemmed])])

X_unbalanced_type_split_feat[:5]

[['Lead', 1.0, 2740, 2844, 671],
 ['Position', 1.0, 478, 322, 53],
 ['Claim', 1.0, 111, 111, 5],
 ['Claim', 1.0, 626, 642, 114],
 ['Claim', 1.0, 1003, 553, 175]]

## Prediction making

In [12]:
y_unbalanced_type_split_preds = []

for X in X_unbalanced_type_split_feat:
    dt, x = X[0], X[1:]
    
    y_unbalanced_type_split_preds.append(classifiers["unbalanced_type_split"][dt].predict_proba([x]))

y_unbalanced_type_split_preds[:5]

[array([[0.47895261, 0.42113205, 0.09991534]]),
 array([[0.66077578, 0.25418015, 0.08504407]]),
 array([[0.6258065 , 0.2156754 , 0.15851809]]),
 array([[0.58763463, 0.27915415, 0.13321121]]),
 array([[0.6388164 , 0.22898584, 0.13219776]])]

In [13]:
import numpy as np

output_df = input_df[["discourse_id"]]

y_output = []
for i in range(len(X_unbalanced_type_split_feat)):
    dt = X_unbalanced_type_split_feat[i][0]
    y_output.append([ \
        y_unbalanced_type_split_preds[i][0][list(classifiers["unbalanced_type_split"][dt].classes_).index("Adequate")], \
        y_unbalanced_type_split_preds[i][0][list(classifiers["unbalanced_type_split"][dt].classes_).index("Effective")], \
        y_unbalanced_type_split_preds[i][0][list(classifiers["unbalanced_type_split"][dt].classes_).index("Ineffective")]])

output_df[["Adequate", "Effective", "Ineffective"]] = pd.DataFrame(np.array(y_output), columns=["Adequate", "Effective", "Ineffective"])
output_df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  output_df[["Adequate", "Effective", "Ineffective"]] = pd.DataFrame(np.array(y_output), columns=["Adequate", "Effective", "Ineffective"])


Unnamed: 0,discourse_id,Adequate,Effective,Ineffective
0,a261b6e14276,0.478953,0.421132,0.099915
1,5a88900e7dc1,0.660776,0.25418,0.085044
2,9790d835736b,0.625807,0.215675,0.158518
3,75ce6d68b67b,0.587635,0.279154,0.133211
4,93578d946723,0.638816,0.228986,0.132198


## Output

In [15]:
# Make sure output directory exists
import os
path = './data/output/'
os.makedirs(path, exist_ok=True)

# Write out the output_df into a sample output file.
output_df.to_csv("./data/output/submission.csv", index=False)