In [1]:
# Mount Google Drive at /content/drive so the SMM4H TSV files stored under
# /content/drive/MyDrive can be read by the loading cell below.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install spacy
!python -m spacy download en_core_web_sm
!pip install lexical-diversity
!pip install scipy
!pip install scikit-learn
!pip install tqdm

Collecting en-core-web-sm==3.7.1
  Using cached https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.7.1/en_core_web_sm-3.7.1-py3-none-any.whl (12.8 MB)
[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


In [4]:
import pandas as pd
# Tab-separated SMM4H 2024 Task 5 splits stored on the mounted Drive.
TRAIN_TSV_PATH = "/content/drive/MyDrive/SMM4H 2024/SMM4H-2024-Task5-Training.tsv"
DEV_TSV_PATH = "/content/drive/MyDrive/SMM4H 2024/SMM4H-2024-Task5-Validation.tsv"

df_train = pd.read_csv(TRAIN_TSV_PATH, sep="\t")
df_dev = pd.read_csv(DEV_TSV_PATH, sep="\t")

In [5]:
# Preview the training split; the rendered table elides the middle rows.
df_train

Unnamed: 0,tweet_id,text,label
0,1364778927105933315,"Psych Med Twitter, wondering if you can help. ...",0
1,1130719434526511104,@dmx_biographer @HellaChillAF I wouldn’t fuck ...,0
2,1184221813733253120,Harley is autistic.... she has the mental capa...,1
3,1435958387020341248,I just made my son cry before going to school ...,1
4,1112559025927999488,Yooooooo my dad really just said “I’d rather h...,0
...,...,...,...
7393,1232778727186223108,Just got all kinds of dirty looks for ordering...,1
7394,1174782065532780544,"anyways- if they did cause autism, i’d rather ...",0
7395,1052612940438728704,2/4 trouble at work and I could have lost my j...,0
7396,1243556662964862976,@LawyerChamber $LgiLder22 I have severe asthma...,1


In [6]:
# Preview the validation split; the rendered table elides the middle rows.
df_dev

Unnamed: 0,tweet_id,text,label
0,1265323726570225669,Mom Finds Secret to Homeschooling her Autistic...,0
1,1254180961622900738,I hope our child has my calmness and NOT Andre...,0
2,800336385399885824,Blind Girl With Autism Mimics Whitney Houston ...,0
3,1107948318150152192,"@amyschumer watching #growing, my 2yo son has ...",1
4,1438320313511473154,@robbystarbuck @AmericanAir would rather a mot...,0
...,...,...,...
384,1479160193808273413,my 7yo is being tested for adhd and struggles ...,0
385,1084086161667252225,@attachedmrsL the first reply on the tweet tho...,0
386,1404612356819374080,There's a mom who recorded her autistic son &a...,0
387,1458807608052174856,@itsadollthxng Lmfao girl !!! I’m upset fr cau...,1


In [7]:
# Re-serialise both splits as JSON Lines (one record per row) for the
# feature-extraction step. Absolute paths are used so the files land exactly
# where main() reads them from ("/content/..."), regardless of the kernel's
# current working directory.
df_train.to_json('/content/SMM4H-5_train_data.jsonl', orient='records', lines=True)
df_dev.to_json('/content/SMM4H-5_dev_data.jsonl', orient='records', lines=True)

In [8]:
import json
from copy import deepcopy
import pickle
import spacy
import en_core_web_sm
from lexical_diversity import lex_div as ld
from scipy.stats import pointbiserialr
from scipy.sparse._csr import csr_matrix
from scipy.sparse import save_npz
from sklearn.feature_extraction.text import TfidfVectorizer
from tqdm.auto import tqdm
import os
# Shared spaCy pipeline, loaded once and reused by the style-feature and
# lexical-diversity preprocessing functions.
NLP = spacy.load("en_core_web_sm")

# Latin abbreviations that style_features_processing keeps as literal text
# instead of replacing them with a POS tag / word shape.
LATIN = [
    "i.e.",
    "e.g.",
    "etc.",
    "c.f.",
    "et",
    "al.",
]

# Names of lexical_diversity (`ld`) functions. Each name is resolved with
# getattr(ld, name) and doubles as the key under which the score is stored.
FEATS = [
    "ttr",
    "root_ttr",
    "log_ttr",
    "maas_ttr",
    "msttr",
    "mattr",
    "hdd",
    "mtld",
    "mtld_ma_wrap",
    "mtld_ma_bid",
]
def jsonl_read(file_path: str) -> list:
    """Load a JSON Lines file into a list of decoded records.

    Args:
        file_path: Path of the ``.jsonl`` file to read.

    Returns:
        One decoded JSON value per non-blank line, in file order.

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If a non-blank line is not valid JSON.
    """
    # Explicit UTF-8 so decoding does not depend on the platform's locale.
    with open(file_path, 'r', encoding="utf-8") as f:
        # Blank lines (e.g. a trailing empty line) carry no record; skipping
        # them avoids a JSONDecodeError on otherwise valid files.
        return [json.loads(line) for line in f if line.strip()]
def jsonl_write(data: list, file_path: str) -> None:
    """Overwrite `file_path` with `data`, one JSON document per line.

    Args:
        data: Iterable of JSON-serialisable records.
        file_path: Destination path; existing content is replaced.
    """
    with open(file_path, 'w') as sink:
        # The generator is consumed lazily, so records are serialised one at a
        # time while the file is open.
        sink.writelines(json.dumps(record) + "\n" for record in data)
def jsonl_write_lines(entry: dict, file_path: str) -> None:
    """Append a single record to `file_path` as one JSON line.

    Args:
        entry: JSON-serialisable record.
        file_path: Destination path; opened in append mode, so existing
            content is kept and the file is created if missing.
    """
    with open(file_path, 'a') as sink:
        # print() supplies the trailing newline that terminates the record.
        print(json.dumps(entry), file=sink)
def style_features_processing(entry: dict) -> tuple:
    """Turn a record's text into two space-joined "style" token strings.

    Punctuation, stop words and the abbreviations listed in LATIN are kept as
    their literal text in both outputs; every other token is replaced by its
    spaCy POS tag in the first string and by its spaCy shape in the second.

    Args:
        entry: Record with a "text" key.

    Returns:
        (pos_string, shape_string).
    """
    pos_parts, shape_parts = [], []
    for token in NLP(entry["text"]):
        keep_verbatim = token.is_punct or token.is_stop or token.text in LATIN
        pos_parts.append(token.text if keep_verbatim else token.pos_)
        shape_parts.append(token.text if keep_verbatim else token.shape_)
    return " ".join(pos_parts), " ".join(shape_parts)
def log_counts(texts: list) -> tuple:
    """Fit a term-frequency vectorizer on `texts` and vectorise them.

    The vectorizer is configured with case preserved, unigrams and bigrams,
    IDF weighting disabled and sub-linear term-frequency scaling enabled.

    Args:
        texts: Documents to fit on.

    Returns:
        (fitted vectorizer, matrix produced by fit_transform on `texts`).
    """
    # NOTE(review): no tokenizer/token_pattern is passed, so scikit-learn's
    # default tokenisation applies - confirm it keeps the punctuation and
    # one-character tokens emitted by style_features_processing.
    tf_vectorizer = TfidfVectorizer(
        lowercase=False,
        ngram_range=(1, 2),
        use_idf=False,
        sublinear_tf=True,
    )
    term_matrix = tf_vectorizer.fit_transform(texts)
    return tf_vectorizer, term_matrix
def preprocess(text: str, mode: str="spacy") -> list:
    """Tokenise `text` for the lexical-diversity measures.

    Args:
        text: Raw text.
        mode: "spacy" yields "<lemma>_<POS>" strings for every spaCy token
            whose POS is not PUNCT, SYM or SPACE; "lemmatize" delegates to
            ld.flemmatize; any other value delegates to ld.tokenize.

    Returns:
        The token list produced by the selected strategy.
    """
    if mode == "lemmatize":
        return ld.flemmatize(text)
    if mode != "spacy":
        return ld.tokenize(text)
    skipped_pos = ("PUNCT", "SYM", "SPACE")
    return [f"{tok.lemma_}_{tok.pos_}" for tok in NLP(text) if tok.pos_ not in skipped_pos]
def lex_div_feats_extraction(entry: dict, preprocess_mode: str="spacy", features: list=FEATS) -> dict:
    """Compute lexical-diversity scores for a record and store them on it.

    Args:
        entry: Record with a "text" key. It is mutated in place: one key per
            requested feature is added (or overwritten).
        preprocess_mode: Forwarded to preprocess() to choose the tokenisation.
        features: Names of functions in the `ld` module to apply to the
            preprocessed tokens.

    Returns:
        The same `entry` object, now carrying the computed scores.

    Raises:
        KeyError: If `entry` has no "text" key.
        AttributeError: If a feature name is not an attribute of `ld`.
    """
    tokens = preprocess(entry["text"], preprocess_mode)
    for feature in features:
        # Feature names double as ld function names and as result keys.
        entry[feature] = getattr(ld, feature)(tokens)
    return entry
def features_evaluation(dataset: list) -> None:
    """Print the point-biserial correlation between each feature and the label.

    For every name in FEATS that is present in the first record, the feature
    values of all records are correlated with the "label" values and the
    coefficient and p-value are printed, both rounded to two decimals.

    Args:
        dataset: Records carrying a "label" key and the feature keys. An empty
            dataset prints nothing.
    """
    # Nothing to correlate; also avoids an IndexError on dataset[0] below.
    if not dataset:
        return
    labels = [entry["label"] for entry in dataset]
    for feature in FEATS:
        # Only the first record is inspected to decide whether a feature exists.
        if feature not in dataset[0]:
            continue
        feat_values = [entry[feature] for entry in dataset]
        point_biserial_corr, p_value = pointbiserialr(labels, feat_values)
        print(f"{feature}: {round(point_biserial_corr, 2)} (p = {round(p_value, 2)})")
def get_texts_diversities(in_file_name: str, out_file_name: str) -> None:
    """Compute lexical-diversity features for a JSONL file and write them out.

    Each input record is passed through lex_div_feats_extraction; the result
    is written to `out_file_name` as one JSON line with the "text" key left out.

    Args:
        in_file_name: JSONL file whose records have a "text" key.
        out_file_name: Destination JSONL file; replaced if it already exists.
    """
    data = jsonl_read(in_file_name)
    # Open once in write mode so re-running the cell replaces the output
    # instead of appending duplicate records to a previous run's file.
    with open(out_file_name, "w") as out_file:
        for entry in tqdm(data):
            computed = lex_div_feats_extraction(entry)
            # Build a fresh dict without the raw text rather than deleting the
            # key from the record itself.
            record = {key: value for key, value in computed.items() if key != "text"}
            out_file.write(json.dumps(record) + "\n")
def _style_texts_from_file(file_path: str) -> tuple:
    """Return (pos_texts, shape_texts) for every record of a JSONL file."""
    pos_texts, shape_texts = [], []
    for entry in tqdm(jsonl_read(file_path)):
        pos, shape = style_features_processing(entry)
        pos_texts.append(pos)
        shape_texts.append(shape)
    return pos_texts, shape_texts


def _fit_and_save_counts(train_texts: list, dev_texts: list, prefix: str, output_dir: str) -> None:
    """Fit log_counts on train texts, transform dev texts, and save all artefacts under `prefix`."""
    vectorizer, train_tf = log_counts(train_texts)
    # The dev split is only transformed, never fitted on.
    dev_tf = vectorizer.transform(dev_texts)
    with open(os.path.join(output_dir, f"{prefix}_vectorizer.pkl"), "wb") as file:
        pickle.dump(vectorizer, file)
    save_npz(os.path.join(output_dir, f"{prefix}_tf.npz"), train_tf)
    save_npz(os.path.join(output_dir, f"{prefix}_tf_dev.npz"), dev_tf)


def get_texts_styles(in_file_name: str, in_dev_file_name: str, output_dir: str = "/content/") -> None:
    """Extract POS and word-shape style features for the train and dev splits.

    Both files are converted to POS strings and shape strings with
    style_features_processing. For each of the two views a vectorizer is
    fitted on the training texts and applied to the dev texts, and the
    following files are written to `output_dir`:
    ``{pos,shape}_vectorizer.pkl``, ``{pos,shape}_tf.npz`` and
    ``{pos,shape}_tf_dev.npz``.

    Args:
        in_file_name: Training JSONL file (records need a "text" key).
        in_dev_file_name: Dev JSONL file with the same record layout.
        output_dir: Directory that receives the artefacts; created if missing.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_dir, exist_ok=True)
    pos_data, shape_data = _style_texts_from_file(in_file_name)
    dev_pos_data, dev_shape_data = _style_texts_from_file(in_dev_file_name)
    _fit_and_save_counts(pos_data, dev_pos_data, "pos", output_dir)
    _fit_and_save_counts(shape_data, dev_shape_data, "shape", output_dir)
def main():
    """Run style-feature extraction, then lexical-diversity extraction per split.

    Reads the train/dev JSONL files from /content/ and writes every artefact
    back to /content/.
    """
    inputs = {
        "train": "/content/SMM4H-5_train_data.jsonl",
        "dev": "/content/SMM4H-5_dev_data.jsonl",
    }
    outputs = {
        "train": "/content/" + "train_diversities.jsonl",
        "dev": "/content/" + "dev_diversities.jsonl",
    }
    get_texts_styles(inputs["train"], inputs["dev"])
    # Train first, then dev.
    for split in ("train", "dev"):
        get_texts_diversities(inputs[split], outputs[split])
# Run the full feature-extraction pipeline when this cell/script is executed directly.
if __name__ == "__main__":
    main()

  0%|          | 0/7398 [00:00<?, ?it/s]

  0%|          | 0/389 [00:00<?, ?it/s]

  0%|          | 0/7398 [00:00<?, ?it/s]

  0%|          | 0/389 [00:00<?, ?it/s]