# Env-Claims


### Installing dependencies


In [12]:
from sklearn.svm import LinearSVC
from sklearn.feature_extraction.text import TfidfTransformer, CountVectorizer
from nltk.corpus import stopwords
from sklearn.pipeline import Pipeline
from sklearn import metrics
from sklearn.model_selection import cross_val_score
import numpy as np
from sklearn.model_selection import train_test_split
from scipy.stats import sem
import json
import os
from sklearn.model_selection import StratifiedKFold

In [25]:
# Data Helpers function
def get_dataset_splits():
    train_fn = "/content/train.jsonl"
    dev_fn = "/content/dev.jsonl"
    test_fn = "/content/test.jsonl"

    def load_data(file_path):
        with open(file_path) as f:
            data = [json.loads(line) for line in f]
            X = [item["text"] for item in data]
            y = [item["label"] for item in data]
        return X, y

    X_train, y_train = load_data(train_fn)
    X_validation, y_validation = load_data(dev_fn)
    X_test, y_test = load_data(test_fn)

    return X_train, y_train, X_validation, y_validation, X_test, y_test

def get_cv_splits():
    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()
    X, y = np.array(X_train + X_validation + X_test), np.array(y_train + y_validation + y_test)

    skf = StratifiedKFold(n_splits=5, random_state=42, shuffle=True)
    skf.get_n_splits(X, y)

    for i, (train_index, test_index) in enumerate(skf.split(X, y)):
        X_train, y_train = X[train_index], y[train_index]
        X_test, y_test = X[test_index], y[test_index]
        yield X_train, y_train, X_test, y_test

def round_float(number):
    return str(round(number, 3) * 100)

* Created a load_data function to handle the common pattern of loading data from a file.


## SVM Model

In [19]:
#setting Train, Validations splits

X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()

In [26]:
def evaluate(gold, predictions):
    # Calculate and round evaluation metrics
    pr = round_float(metrics.precision_score(gold, predictions))
    rc = round_float(metrics.recall_score(gold, predictions))
    f1 = round_float(metrics.f1_score(gold, predictions))
    acc = round_float(metrics.accuracy_score(gold, predictions))

    # Return metrics as a formatted string
    return " & ".join((pr, rc, f1, acc))


def evaluate_all(gold, preds):
    # Calculate precision, recall, F1 score, and accuracy for each prediction
    pr = [metrics.precision_score(g, p) for g, p in zip(gold, preds)]
    rc = [metrics.recall_score(g, p) for g, p in zip(gold, preds)]
    f1 = [metrics.f1_score(g, p) for g, p in zip(gold, preds)]
    acc = [metrics.accuracy_score(g, p) for g, p in zip(gold, preds)]

    # Calculate mean and standard error for each metric
    mean_pr, sem_pr = np.mean(pr), sem(pr).round(3) * 100
    mean_rc, sem_rc = np.mean(rc), sem(rc).round(3) * 100
    mean_f1, sem_f1 = np.mean(f1), sem(f1).round(3) * 100
    mean_acc, sem_acc = np.mean(acc), sem(acc).round(3) * 100

    # Format the output string
    out_str = (
        f"{round_float(mean_pr)} \\pm {sem_pr} & "
        f"{round_float(mean_rc)} \\pm {sem_rc} & "
        f"{round_float(mean_f1)} \\pm {sem_f1} & "
        f"{round_float(mean_acc)} \\pm {sem_acc}"
    )

    return out_str


In [27]:
def create_table_1():
    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()

    for split in ["train", "dev", "test", "all"]:
        out_str = f"{split} & "

        if split == "train":
            out_str += f"{len(X_train)} & "
            out_str += f"{np.mean([len(i.split()) for i in X_train]).round(1)} & "
            out_str += f"{np.mean(y_train).round(2)}"

        elif split == "dev":
            out_str += f"{len(X_validation)} & "
            out_str += f"{np.mean([len(i.split()) for i in X_validation]).round(1)} & "
            out_str += f"{np.mean(y_validation).round(2)}"

        elif split == "test":
            out_str += f"{len(X_test)} & "
            out_str += f"{np.mean([len(i.split()) for i in X_test]).round(1)} & "
            out_str += f"{np.mean(y_test).round(2)}"

        elif split == "all":
            X_all = X_train + X_validation + X_test
            y_all = y_train + y_validation + y_test
            out_str += f"{len(X_all)} & "
            out_str += f"{np.mean([len(i.split()) for i in X_all]).round(1)} & "
            out_str += f"{np.mean(y_all).round(2)}"

        out_str += r" \\ \hline"
        print(out_str)

create_table_1()


train & 2117 & 24.5 & 0.25 \\ \hline
dev & 265 & 24.4 & 0.25 \\ \hline
test & 265 & 24.2 & 0.25 \\ \hline
all & 2647 & 24.5 & 0.25 \\ \hline


In [29]:

def baselines():
    # Majority
    out_str = "majority & "
    all_preds, all_labels = [], []

    # Cross-validation
    for X_train, y_train, X_test, y_test in get_cv_splits():
        all_labels.extend(y_test)
        preds = [0] * len(y_test)
        all_preds.extend(preds)

    out_str += evaluate(all_labels, all_preds) + " & "

    # Dev set
    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()
    preds = [0] * len(y_validation)
    out_str += evaluate(y_validation, preds) + " & "

    # Test set
    preds = [0] * len(y_test)
    out_str += evaluate(y_test, preds) + r" \\"

    print(out_str)

    # Random
    out_str = "random & "
    all_preds, all_labels = [], []

    for X_train, y_train, X_test, y_test in get_cv_splits():
        all_labels.extend(y_test)
        preds = np.random.randint(0, 2, size=len(y_test), dtype=int)
        all_preds.extend(preds)

    out_str += evaluate(all_labels, all_preds) + " & "

    # Dev set
    preds = np.random.randint(0, 2, size=len(y_validation), dtype=int)
    out_str += evaluate(y_validation, preds) + " & "

    # Test set
    preds = np.random.randint(0, 2, size=len(y_test), dtype=int)
    out_str += evaluate(y_test, preds) + r" \\"

    print(out_str)

baselines()


majority & 0.0 & 0.0 & 0.0 & 74.9 & 0.0 & 0.0 & 0.0 & 75.1 & 0.0 & 0.0 & 0.0 & 74.7 \\
random & 26.700000000000003 & 53.1 & 35.5 & 51.6 & 23.599999999999998 & 43.9 & 30.7 & 50.6 & 26.400000000000002 & 52.6 & 35.199999999999996 & 51.2 \\


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


In [30]:
def tf_idf_baseline():
    classifier = LinearSVC(max_iter=50000)
    parameters = {
        'vect__max_features': [10000, 20000, 40000],
        'clf__C': [0.1, 1, 10],
        'clf__loss': ('hinge', 'squared_hinge')
    }

    out_str = "TF-IDF SVM & "

    # Cross-validation
    all_preds, all_labels = [], []
    for X_train, y_train, X_test, y_test in get_cv_splits():
        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3), min_df=5)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        text_clf.fit(X_train, y_train)
        all_preds.extend(text_clf.predict(X_test))
        all_labels.extend(y_test)

    out_str += evaluate(all_labels, all_preds) + " & "

    # Dev set and test set
    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()

    for split, X, y in zip(["dev", "test"], [X_validation, X_test], [y_validation, y_test]):
        text_clf = Pipeline([
            ('vect', CountVectorizer(stop_words='english', ngram_range=(1, 3), min_df=5)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        text_clf.fit(X_train, y_train)
        preds = text_clf.predict(X)
        out_str += evaluate(y, preds) + " & "

    print(out_str)

tf_idf_baseline()


TF-IDF SVM & 71.1 & 65.9 & 68.4 & 84.7 & 67.7 & 63.6 & 65.60000000000001 & 83.39999999999999 & 68.10000000000001 & 70.1 & 69.1 & 84.2 & 


In [31]:
def character_n_gram_baseline():
    classifier = LinearSVC(max_iter=50000)
    parameters = {
        'vect__max_features': [10000, 20000, 40000],
        'clf__C': [0.1, 1, 10],
        'clf__loss': ('hinge', 'squared_hinge')
    }

    out_str = "Character n-gram SVM & "

    # Cross-validation
    all_preds, all_labels = [], []
    for X_train, y_train, X_test, y_test in get_cv_splits():
        text_clf = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, 10), token_pattern=r"(?u)\b\w+\b", analyzer='char', min_df=5)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        text_clf.fit(X_train, y_train)
        all_preds.extend(text_clf.predict(X_test))
        all_labels.extend(y_test)

    out_str += evaluate(all_labels, all_preds) + " & "

    # Dev set and test set
    X_train, y_train, X_validation, y_validation, X_test, y_test = get_dataset_splits()

    for split, X, y in zip(["dev", "test"], [X_validation, X_test], [y_validation, y_test]):
        text_clf = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, 10), token_pattern=r"(?u)\b\w+\b", analyzer='char', min_df=10)),
            ('tfidf', TfidfTransformer()),
            ('clf', classifier),
        ])
        text_clf.fit(X_train, y_train)
        preds = text_clf.predict(X)
        out_str += evaluate(y, preds) + " & "

    print(out_str)

character_n_gram_baseline()




Character n-gram SVM & 76.8 & 63.6 & 69.6 & 86.0 & 69.19999999999999 & 68.2 & 68.7 & 84.5 & 75.0 & 67.2 & 70.89999999999999 & 86.0 & 
