In [25]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.dummy import DummyClassifier
from xgboost import XGBClassifier
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import VarianceThreshold, SelectFromModel
from sklearn.base import clone
import numpy as np

In [26]:
stop_words = ["i", "you", "me", "they", "he", "him", "it", "she", "her"]

In [27]:
captions= pd.read_csv("captions.csv",sep=";")

In [40]:
def bow(textArray):
	vector = TfidfVectorizer(use_idf=True,
                                    smooth_idf=True,
                                    ngram_range=(1, 2), stop_words=stop_words)
	vector.fit(np.concatenate((textArray,captions.text)))
	bow = vector.transform(textArray)
	

	return bow.toarray(), vector


def csv_to_bow(PATH, sep=";"):
	data = pd.read_csv(PATH, sep)
	values, vector = bow(data.text)
	df_bow = pd.DataFrame(values, columns=vector.get_feature_names())
	df_bow["class"] = data["class"]

	return df_bow, vector

In [55]:
def removeFeatures(x,y, selector):
	features = list(selector.fit(x,y).get_support(indices=True))
	x = x.iloc[:, features]

	return x, features

In [56]:
def train(data, model):
	x = data.drop(columns=["class"])
	y = data["class"]
	x, features = removeFeatures(x,y,SelectFromModel(clone(model), max_features=20))
	train_x, test_x, train_y, test_y = train_test_split(
		x, y, stratify=y, test_size=0.2, random_state=67)
	model.fit(train_x, train_y)
	print(model.score(test_x, test_y))	
	return model, features
	

In [61]:
def test_sentences(text, y, model, transformer, features):
	x = transformer.transform(text)
	x = x[:, features]
	print(f"Orig. {list(y)}")
	print(f"Pred. {model.predict(x)}")
	print(model.score(x, y))

In [57]:
uncertainty, uncertainty_transformer = csv_to_bow("./incerteza.csv")
uncertainty_model, uncertainty_features = train(uncertainty, LinearSVC())

0.9090909090909091


In [63]:
hyperbole, hyperbole_transformer = csv_to_bow("./hyperbole.csv")
hyperbole_model, hyperbole_features = train(hyperbole, LinearSVC())

0.9047619047619048


In [65]:
metonymy, metonymy_transformer = csv_to_bow("./metonymy.csv")
metonymy_model, metonymy_features = train(metonymy, LinearSVC())

0.7272727272727273


In [64]:
antithesis, antithesis_transformer = csv_to_bow("./antithesis.csv")
antithesis_model, antithesis_features = train(antithesis, LinearSVC())

0.7857142857142857


In [62]:
test_sentences(captions.text, captions.uncertainty,uncertainty_model, uncertainty_transformer, uncertainty_features)

Orig. [1, 1, 1, 1, 1, 1, 1]
Pred. [1 1 0 1 1 1 0]
0.7142857142857143


In [66]:
test_sentences(captions.text, captions.hyperbole,hyperbole_model, hyperbole_transformer, hyperbole_features)

Orig. [1, 1, 1, 1, 1, 1, 1]
Pred. [1 1 1 1 0 0 0]
0.5714285714285714


In [68]:
test_sentences(captions.text, captions.metonymy, metonymy_model, metonymy_transformer, metonymy_features)

Orig. [1, 1, 1, 1, 1, 1, 1]
Pred. [0 0 0 0 0 0 0]
0.0


In [69]:
test_sentences(captions.text, captions.antithesis, antithesis_model, antithesis_transformer, antithesis_features)

Orig. [1, 1, 1, 1, 0, 0, 1]
Pred. [1 1 1 1 1 1 1]
0.7142857142857143


In [35]:
def replace_with_suffix(suffix, string):
	words = string.split(" ")
	words_suffix = [word if not word.endswith(suffix) else suffix for word in words ]
	return " ".join(words_suffix)

#def join_bigram(first, second, string):
