In [42]:
import pandas as pd
import re
import nltk

from pathlib import Path

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score

from nltk.corpus import wordnet
from nltk import pos_tag, word_tokenize
from nltk.stem import WordNetLemmatizer

In [54]:
## download required NLTK data files to a specific directory
# TODO: make a separate script that does this only once


NLTK_DIR = Path("../data/nltk_data")
NLTK_DIR.mkdir(parents=True, exist_ok=True)
nltk.data.path.append(str(NLTK_DIR))

resources = [
    "punkt",
    "stopwords",
    "wordnet",
    "omw-1.4",
    "averaged_perceptron_tagger_eng",
    "averaged_perceptron_tagger",
]

for resource in resources:
    nltk.download(resource, download_dir=str(NLTK_DIR))

print(f"NLTK data downloaded to {NLTK_DIR.resolve()}")

NLTK data downloaded to /Users/alinaponomareva/Documents/UZH/ETSP/ESTP_project/data/nltk_data


[nltk_data] Downloading package punkt to ../data/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to ../data/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to ../data/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to ../data/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     ../data/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     ../data/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [55]:
## preprocessing function for NLTK tokenizer + lemmatizer

from nltk.corpus import stopwords
STOP_WORDS = set(stopwords.words('english'))

def get_wordnet_pos(tag):
    """Map POS tag to first character lemmatize() accepts"""
    if tag.startswith('J'):
        return wordnet.ADJ
    elif tag.startswith('V'):
        return wordnet.VERB
    elif tag.startswith('N'):
        return wordnet.NOUN
    elif tag.startswith('R'):
        return wordnet.ADV
    else:
        return wordnet.NOUN


def nltk_preprocess(text):
    # make sure we get useful tokens
    token_pattern = re.compile(r"(?u)\b[^\W\d_]{2,}\b")  # yoinked from TfidfVectorizer
    txt = "" if not isinstance(text, str) else text.lower()
    words = token_pattern.findall(txt)
    pos_tags = pos_tag(words)
    lemmatizer = WordNetLemmatizer()
    lemmatized_words = [lemmatizer.lemmatize(word, get_wordnet_pos(tag)) for word, tag in pos_tags]
    # remove stopwords after lemmatization
    lemmatized_words = [w for w in lemmatized_words if w not in STOP_WORDS]
    return lemmatized_words

In [56]:
## get data and prepare train split and dev split

# get the combined dataset
df = pd.read_csv("../data/processed/text_combined.csv", index_col=0)

# get only train part of the dataset
df_train = df[df["split"] == "train"].reset_index(drop=True)

# get only dev part of the dataset
df_dev = df[df["split"] == "dev"].reset_index(drop=True)

# get the necessary columns out of df_train
X = df_train["text"]
y_depr = df_train["target_depr"]

print(df_train.shape)
display(df_train.head())
display(X[:3])
display(y_depr[:3])
X.index.equals(y_depr.index)

# get the necessary columns out of df_dev
X_val = df_dev["text"]
y_depr_val = df_dev["target_depr"]

print(df_dev.shape)
display(df_dev.head())
display(X_val[:3])
display(y_depr_val[:3])
X_val.index.equals(y_depr_val.index)

(163, 5)


Unnamed: 0,participant_id,text,target_depr,target_ptsd,split
0,302,just move around a little bit when you're fini...,0,0,train
1,303,wow okay when you're finished when she's done ...,0,0,train
2,304,so we'll just move around a little bit tonight...,0,0,train
3,305,okay looks good so we can just move around a l...,0,0,train
4,307,looking at you all right okay so now let's mak...,0,0,train


0    just move around a little bit when you're fini...
1    wow okay when you're finished when she's done ...
2    so we'll just move around a little bit tonight...
Name: text, dtype: object

0    0
1    0
2    0
Name: target_depr, dtype: int64

(56, 5)


Unnamed: 0,participant_id,text,target_depr,target_ptsd,split
0,300,so I'm going to interview in Spanish okay good...,0,0,dev
1,301,yeah there's also on Craigslist so that's why ...,0,0,dev
2,306,okay looks like we're good. Let's move around ...,0,0,dev
3,317,equipment okay how was okay thanks are you oka...,0,1,dev
4,320,perfect okay and okay and then I will let you ...,0,1,dev


0    so I'm going to interview in Spanish okay good...
1    yeah there's also on Craigslist so that's why ...
2    okay looks like we're good. Let's move around ...
Name: text, dtype: object

0    0
1    0
2    0
Name: target_depr, dtype: int64

True

In [57]:
# create pipeline

pipeline = Pipeline ([
    ('vectorizer',TfidfVectorizer(
        tokenizer=nltk_preprocess, #TODO: try different tokenizers in gridsearch
        preprocessor=None,
        lowercase=False,
        token_pattern=None,
        ngram_range=(1,2),
        min_df=2,
        max_df=0.9
    )),
    ('classifier',LogisticRegression(max_iter=2000, class_weight='balanced', solver='liblinear'))
])

In [58]:
# train 

from joblib import parallel_backend

# Use threads to avoid pickling issues with custom tokenizer
with parallel_backend("threading", n_jobs=-1):
    scores = cross_val_score(pipeline, X, y_depr, cv=3, n_jobs=-1)
print('Cross-validation scores', scores)
print('Average cross-validation score', scores.mean())

Cross-validation scores [0.76363636 0.77777778 0.77777778]
Average cross-validation score 0.773063973063973


In [62]:
# fit on full train
pipeline.fit(X, y_depr)

# evaluate on dev set
from sklearn.metrics import classification_report
y_pred = pipeline.predict(X_val)

# Detailed Report on standard metrics
report_dict = classification_report(y_depr_val, y_pred, output_dict=True)
report_df = pd.DataFrame(report_dict).transpose()
numeric_cols = ['precision', 'recall', 'f1-score']

print("Classification report on dev set:")
print(report_df)

Classification report on dev set:
              precision    recall  f1-score    support
0              0.800000  1.000000  0.888889  44.000000
1              1.000000  0.083333  0.153846  12.000000
accuracy       0.803571  0.803571  0.803571   0.803571
macro avg      0.900000  0.541667  0.521368  56.000000
weighted avg   0.842857  0.803571  0.731380  56.000000
