In [1]:
from sklearn import datasets
from sklearn.metrics import accuracy_score, roc_auc_score
from sklearn.model_selection import train_test_split

from frlearn.base import probabilities_from_scores, select_class
from frlearn.classifiers import FRNN
from frlearn.feature_preprocessors import RangeNormaliser
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import PorterStemmer

In [2]:
def indescernibility(matrix,y):
    """Partition the rows of ``matrix`` into indiscernibility classes.

    Two rows fall into the same class when they hold identical values in
    every column of the table.

    Parameters
    ----------
    matrix : array-like, 2-D
        Anything ``pd.DataFrame`` accepts; one object per row.
    y : ignored
        Kept only so existing callers keep working. This argument was always
        overwritten with the column list before being read, so it has never
        influenced the result.

    Returns
    -------
    list of list
        One list of row labels per class, ordered by pandas' sorted group keys.
        An empty table yields ``[]``.
    """
    df = pd.DataFrame(matrix)
    columns = list(df.columns)
    if not columns:
        # groupby() rejects an empty key list. With no attribute to tell rows
        # apart, all rows (if any) form one single class.
        return [df.index.tolist()] if len(df) else []
    # Iterating the GroupBy already hands over each group's frame, so there is
    # no need to look every group up a second time through get_group(key).
    return [group.index.tolist() for _, group in df.groupby(columns)]

def encoding_discourse_type(x):
    """Translate a discourse-type label into its integer class code.

    The code of a label is its position in ``ordered_labels``. A value that
    equals none of the labels produces ``None``.
    """
    ordered_labels = (
        "Lead",
        "Position",
        "Evidence",
        "Claim",
        "Concluding Statement",
        "Counterclaim",
        'Rebuttal',
    )
    for code, label in enumerate(ordered_labels):
        if x == label:
            return code
    return None
    
def stemming_stopwords_removing(df):
    """Clean every ``discourse_text`` entry: letters only, lower-case, no stop words, stemmed.

    Parameters
    ----------
    df : mapping with a ``"discourse_text"`` entry (typically a DataFrame)
        The texts are taken in storage order, so the frame may carry any
        index (for example after filtering or a train/test split).

    Returns
    -------
    list of str
        One space-joined string of stemmed tokens per input text.
    """
    # Built once: neither the stemmer nor the stop-word set depends on the row.
    stemmer = PorterStemmer()
    stop_words = set(stopwords.words("english"))

    corpus = []
    # Iterate by position rather than by label: `series[i]` is a label lookup
    # and fails (or picks the wrong row) when the index is not 0..n-1.
    for text in list(df["discourse_text"]):
        tokens = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        kept = [stemmer.stem(token) for token in tokens if token not in stop_words]
        corpus.append(' '.join(kept))
    return corpus

# Count the total number of occurrences of every token across the corpus.
def get_total_index_words(corpus):
    """Tally whitespace-separated tokens over all documents in ``corpus``.

    Returns a dict ``token -> occurrences`` whose keys appear in the order the
    tokens were first encountered.
    """
    tally = {}
    for document in corpus:
        for token in document.split():
            tally[token] = tally.get(token, 0) + 1
    return tally
    
def get_values(dataset, threshold=1):
    """Build the term-count and term-weight tables for ``dataset`` and pick keywords.

    The weight of a keyword in a document is its count in that document
    divided by its total count over the whole corpus. A keyword is "valid"
    when its weight reaches ``threshold`` in at least one document.

    Parameters
    ----------
    dataset : object accepted by ``stemming_stopwords_removing``
        Must support ``len()``; one document per row.
    threshold : number, default 1
        Minimum weight a keyword needs in some document to be kept.

    Returns
    -------
    tuple
        ``(total_keywords, total_index_words, matrix, weighted_matrix,
        valid_index, valid_index_words)`` where

        * ``total_keywords`` - vocabulary, in first-seen order;
        * ``total_index_words`` - dict keyword -> corpus-wide count;
        * ``matrix`` - float16 array (documents x keywords) of counts;
        * ``weighted_matrix`` - float16 array of the weights;
        * ``valid_index`` - ascending column numbers of the valid keywords;
        * ``valid_index_words`` - the keywords at those columns.
    """
    df = dataset

    total_corpus = stemming_stopwords_removing(df)
    total_index_words = get_total_index_words(total_corpus)
    total_keywords = list(total_index_words.keys())

    # keyword -> column, so every token costs one dict probe instead of a
    # linear scan through the vocabulary list.
    column_of = {word: col for col, word in enumerate(total_keywords)}

    # Occurrences of each keyword in each document.
    matrix = np.zeros((len(df), len(total_keywords)), np.float16)
    for row, document in enumerate(total_corpus):
        for token in document.split():
            matrix[row, column_of[token]] += 1

    # Divide every column by that keyword's corpus-wide total. The division is
    # done in float32 so large totals are not rounded before use; the result
    # is stored as float16 to keep the original return dtype.
    totals = np.array([total_index_words[word] for word in total_keywords],
                      dtype=np.float32)
    weighted_matrix = (matrix.astype(np.float32) / totals).astype(np.float16)

    # Columns whose weight reaches the threshold in at least one document.
    # flatnonzero yields them in ascending order, which makes the result
    # deterministic (a set round-trip gives no ordering guarantee).
    reaches_threshold = (weighted_matrix >= threshold).any(axis=0)
    valid_index = np.flatnonzero(reaches_threshold).tolist()

    valid_index_words = [total_keywords[col] for col in valid_index]

    return total_keywords,total_index_words,matrix, weighted_matrix, valid_index, valid_index_words

def get_test_matrix(dataset,total_sample_keywords,threshold=1):
    """Project ``dataset`` onto an existing vocabulary and weight the counts.

    Tokens that are not in ``total_sample_keywords`` are skipped. The weight
    of a keyword in a document is its count in that document divided by its
    total count over ``dataset``; keywords that never occur in ``dataset``
    keep a weight of 0.

    Parameters
    ----------
    dataset : object accepted by ``stemming_stopwords_removing``
        Must support ``len()``; one document per row.
    total_sample_keywords : sequence of str
        Vocabulary defining the columns. If a keyword is repeated, its first
        position receives the counts.
    threshold : number, default 1
        Minimum weight a keyword needs in some document to be kept.

    Returns
    -------
    tuple
        ``(total_keywords, total_index_words, matrix, weighted_matrix,
        valid_index, valid_index_words)``; ``total_keywords`` is the
        ``total_sample_keywords`` object itself, ``total_index_words`` counts
        the tokens of ``dataset``, both matrices are float32, and
        ``valid_index`` lists the kept columns in ascending order.
    """
    df = dataset

    total_corpus = stemming_stopwords_removing(df)
    total_index_words = get_total_index_words(total_corpus)
    total_keywords = total_sample_keywords

    # keyword -> column. setdefault keeps the first position of a repeated
    # keyword, which is what list.index() would have returned.
    column_of = {}
    for col, word in enumerate(total_keywords):
        column_of.setdefault(word, col)

    matrix=np.zeros((len(df),len(total_keywords)),np.float32)
    print(len(df),len(total_keywords))

    # Count only tokens that belong to the vocabulary. An explicit lookup
    # replaces the bare `except:` that used to hide every kind of error.
    for row, document in enumerate(total_corpus):
        for token in document.split():
            col = column_of.get(token)
            if col is not None:
                matrix[row, col] += 1

    # A keyword absent from this corpus has an all-zero column; dividing it by
    # 1 leaves it at zero, matching the old "skip on missing key" behaviour.
    totals = np.array([total_index_words.get(word, 1) for word in total_keywords],
                      dtype=np.float32)
    weighted_matrix = matrix / totals

    # Columns whose weight reaches the threshold in at least one document.
    reaches_threshold = (weighted_matrix >= threshold).any(axis=0)
    valid_index = np.flatnonzero(reaches_threshold).tolist()

    valid_index_words = [total_keywords[col] for col in valid_index]

    return total_keywords,total_index_words,matrix, weighted_matrix, valid_index, valid_index_words


In [3]:

class RoughSetsReducer:
    """Rough-set attribute reduction over a table of integer-coded attributes.

    Attributes and objects are addressed with 1-based numbers throughout;
    0 is used as padding inside the class matrices returned by ``indisc``.
    """

    def __size(self, x):
        """Return ``(rows, cols)``, treating a 1-D array as one single row."""
        return (1, x.shape[0]) if x.ndim == 1 else x.shape

    def indisc(self, a, x):
        """Calculate the indiscernibility relation induced by attributes ``a``.

        Parameters
        ----------
        a : 1-D array
            1-based column numbers of ``x`` on which objects are compared.
        x : ndarray
            Data table, one object per row.

        Returns
        -------
        ndarray of int, shape (n_classes, n_objects)
            Each row holds the 1-based numbers of the objects of one class;
            all other cells are 0.
        """

        def codea(a, x, b):
            # Fold the first `a` values of `x` into one base-`b` number, so
            # rows with equal values receive equal codes.
            # NOTE(review): with b=10, attribute values >= 10 can give two
            # different rows the same code - confirm inputs stay within 0..9.
            yy = 0
            for i in range(0, a):
                yy += (x[i] * b**(a-(i+1)))

            return yy

        p, q = self.__size(x)
        ap, aq = self.__size(a)
        z = [e for e in range(1, q+1)]
        # Keep only the columns named in `a` (converted to 0-based positions).
        tt = np.setdiff1d(z, a)
        tt_ind = np.setdiff1d(z, tt)-1
        if x.ndim == 1:
            x = x[tt_ind]
        else:
            x = x[:, tt_ind]
        y = x
        v = [codea(aq, y, 10) for i in range(0, p)] if y.ndim == 1 \
            else [codea(aq, y[i, :], 10) for i in range(0, p)]
        y = np.transpose(v)
        if y.shape[0] == 1 and len(y.shape) == 1:
            I, yy = [1], [y]
            y = np.hstack((y, I))
            b, k, l = [y], [1], [1]
        else:
            ax = 1 if y.ndim > 1 else 0
            # Sort the codes; `I` remembers which object each sorted code
            # came from, `l` numbers the distinct codes (= classes).
            yy = np.sort(y, axis=ax)
            I = y.argsort(axis=ax)
            y = np.hstack((yy, I))
            b, k, l = np.unique(yy, return_index=True, return_inverse=True)
        y = np.hstack((l, I))
        m = np.max(l)
        aa = np.zeros((m+1, p), dtype=int)
        for ii in range(0, m+1):
            for j in range(0, p):
                if l[j] == ii:
                    aa[ii, j] = I[j]+1
        return aa

    def rslower(self, y, a, T):
        """Calculate the lower approximation of ``y``.

        Collects the objects of every indiscernibility class (w.r.t.
        attributes ``a`` of table ``T``) that lies entirely inside ``y``.

        Returns
        -------
        ndarray of int
            1-based object numbers; empty when no class fits inside ``y``.
        """
        z = self.indisc(a, T)
        # Start from an ndarray: a plain list has no .astype(), which made an
        # empty approximation crash instead of returning an empty result.
        w = np.array([])
        p, q = self.__size(z)
        for u in range(0, p):
            zz = np.setdiff1d(z[u, :], 0)  # drop the 0 padding
            if np.isin(zz, y).all():
                w = np.hstack((w, zz))
        return w.astype(dtype=int)

    def rsupper(self, y, a, T):
        """Calculate the upper approximation of ``y``.

        Collects the objects of every indiscernibility class (w.r.t.
        attributes ``a`` of table ``T``) that shares at least one object
        with ``y``.

        Returns
        -------
        ndarray of int
            1-based object numbers; empty when no class touches ``y``.
        """
        z = self.indisc(a, T)
        # ndarray rather than list so the empty case still supports .astype().
        w = np.array([])
        p, q = self.__size(z)
        for u in range(0, p):
            zz = np.setdiff1d(z[u, :], 0)  # drop the 0 padding
            zzz = np.intersect1d(zz, y)
            if len(zzz) > 0:
                w = np.hstack((w, zz))
        return w.astype(dtype=int)


    def __pospq(self, p, q):
        """Measure how much of partition ``p`` is contained in partition ``q``.

        ``p`` and ``q`` are class matrices as returned by ``indisc``.

        Returns
        -------
        (float, ndarray)
            The ratio of collected objects to the number of columns of ``p``,
            and the objects of every ``p``-class that fits entirely inside
            some ``q``-class.
        """
        pm, pn = self.__size(p)
        qm, qn = self.__size(q)
        num = 0
        pp, qq = [[]] * pm, [[]] * qm
        for i in range(0, pm):
            pp[i] = np.unique(p[i, :])
        for j in range(0, qm):
            qq[j] = np.unique(q[j, :])
        b = []
        for i in range(0, qm):
            for j in range(0, pm):
                if np.isin(pp[j], qq[i]).all():
                    num += 1
                    b = np.hstack((b, pp[j]))
        bb = np.unique(b)
        # NOTE(review): when no class fits, dd is forced to 1 and the ratio
        # becomes 1/pn rather than 0 - confirm this is intended.
        if bb.size == 0:
            dd = 1
        else:
            _, dd = self.__size(bb)
        # The padding value 0 is not an object, so it is not counted.
        y = float(dd - 1)/pn if 0 in bb else float(dd)/pn
        b = np.setdiff1d(bb, 0)
        return y, b

    def core(self, C, D):
        """Extract the core set from C to D.

        An attribute belongs to the core when leaving it out changes the
        ratio computed by ``__pospq`` for the decision column.

        Parameters
        ----------
        C : 2-D ndarray of condition attributes, one object per row.
        D : 2-D ndarray holding the decision attribute as a single column.

        Returns
        -------
        ndarray
            1-based numbers of the core attributes (possibly empty).
        """
        x = np.hstack((C, D))
        c = np.array(range(1, C.shape[1]+1))
        d = np.array([C.shape[1]+1])
        cp, cq = self.__size(c)
        q = self.indisc(d, x)
        pp = self.indisc(c, x)
        b, w = self.__pospq(pp, q)
        a, k, kk, p = ([[]] * cq for i in range(4))
        y = []
        for u in range(0, cq):
            ind = u+1
            a[u] = np.setdiff1d(c, ind)
            p[u] = self.indisc(a[u], x)
            k[u], kk[u] = self.__pospq(p[u], q)
            if k[u] != b:
                y = np.hstack((y, ind))
        return np.array(y)

    def __sgf(self, a, r, d, x):
        """Return the change in the ``__pospq`` ratio when ``a`` is added to ``r``."""
        pr = self.indisc(r, x)
        q = self.indisc(d, x)
        b = np.hstack((r, a))
        pb = self.indisc(b, x)
        p1, _ = self.__pospq(pb, q)
        p2, _ = self.__pospq(pr, q)
        return p1 - p2

    def reduce(self, C, D):
        """Return the set of irreducible attributes.

        Starts from the core, adds attributes until the ratio computed for
        the full attribute set is reached, then tries to drop attributes
        that are not needed to keep that ratio.

        Parameters
        ----------
        C : 2-D ndarray of condition attributes, one object per row.
        D : 2-D ndarray holding the decision attribute as a single column.

        Returns
        -------
        ndarray
            1-based numbers of the selected attributes.
        """

        def redu2(i, re, c, d, x):
            # Try to drop the attribute at position `i` of `re`; keep the
            # smaller set only if the ratio for the full set is preserved.
            yre = re
            re1, re2 = self.__size(re)
            q = self.indisc(d, x)
            p = self.indisc(c, x)
            pos_cd, _ = self.__pospq(p, q)
            y, j = None, None
            for qi in range(i, re2):
                re = np.setdiff1d(re, re[qi])
                red = self.indisc(re, x)
                pos_red, _ = self.__pospq(red, q)
                if np.array_equal(pos_cd, pos_red):
                    y = re
                    j = i
                    break
                else:
                    y = yre
                    j = i + 1
                    break
            return y, j

        x = np.hstack((C, D))
        c = np.array(range(1, C.shape[1]+1))
        d = np.array([C.shape[1]+1])
        y = self.core(C, D)
        q = self.indisc(d, x)
        p = self.indisc(c, x)
        pos_cd, _ = self.__pospq(p, q)
        re = y
        red = self.indisc(y, x)
        pos_red, _ = self.__pospq(red, q)
        while pos_cd != pos_red:
            cc = np.setdiff1d(c, re)
            c1, c2 = self.__size(cc)
            yy = [0] * c2
            for i in range(0, c2):
                yy[i] = self.__sgf(cc[i], re, d, x)
            cd = np.setdiff1d(c, y)
            d1, d2 = self.__size(cd)
            for i in range(d2, c2, -1):
                yy[i] = []
            ii = np.argsort(yy)
            for v1 in range(c2-1, -1, -1):
                v2 = ii[v1]
                re = np.hstack((re, cc[v2]))
                red = self.indisc(re, x)
                pos_red, _ = self.__pospq(red, q)
        re1, re2 = self.__size(re)
        core = y
        for qi in range(re2-1, -1, -1):
            if re[qi] in core:
                y = re
                break
            re = np.setdiff1d(re, re[qi])
            red = self.indisc(re, x)
            pos_red, _ = self.__pospq(red, q)
            if np.array_equal(pos_cd, pos_red):
                y = re
        y1, y2 = self.__size(y)
        j = 0
        for i in range(0, y2):
            y, j = redu2(j, y, c, d, x)
        return y



In [4]:
# import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin
# from scikit_roughsets.roughsets import RoughSetsReducer


class RoughSetsSelector(BaseEstimator, SelectorMixin):
    """Feature selector that keeps the attributes chosen by ``RoughSetsReducer``.

    After ``fit`` the boolean attribute ``mask_`` marks, per column of ``X``,
    whether that feature was selected.
    """

    def _get_support_mask(self):
        """Return the boolean feature mask computed by ``fit``."""
        return self.mask_

    def fit(self, X, y=None):
        """Run the rough-set reduction of ``X`` with respect to ``y``.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_features)
            Integer-valued attributes without missing values.
        y : array-like, shape (n_samples,) or (n_samples, 1)
            Integer-valued decision attribute without missing values.
            Required, despite the ``None`` default in the signature.

        Returns
        -------
        self

        Raises
        ------
        TypeError
            If ``y`` is not given.
        ValueError
            If ``X`` or ``y`` contain missing or non-integer values, or if the
            reducer selects no feature at all.
        """
        if y is None:
            # Same exception type np.isnan(None) used to raise, but with a
            # message that says what is actually wrong.
            raise TypeError("y is required: rough sets reduction needs the decision attribute")

        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 1:
            # The reducer stacks X and y side by side, so y must be a column.
            y = y.reshape(-1, 1)

        # Missing values are not supported yet!
        if np.isnan(X).any():
            raise ValueError("X must not contain any missing values")
        if np.isnan(y).any():
            raise ValueError("y must not contain any missing values")
        # Check that X and Y contains only integer values
        if not np.all(np.equal(np.mod(X, 1), 0)):
            raise ValueError("X must contain only integer values")
        if not np.all(np.equal(np.mod(y, 1), 0)):
            raise ValueError("y must contain only integer values")

        reducer = RoughSetsReducer()
        selected_ = reducer.reduce(X, y)
        # The reducer numbers attributes from 1, so compare against 1..n_features.
        feature_numbers = np.arange(1, X.shape[1] + 1)
        self.mask_ = np.isin(feature_numbers, selected_)

        # The mask always has one entry per feature, so "nothing selected"
        # means no True entry - testing the mask's size could never detect it.
        if not self.mask_.any():
            raise ValueError("No features were selected by rough sets reducer")
        return self

In [5]:
# Load the discourse table; later cells read its "discourse_text" and
# "discourse_type" columns.
df1=pd.read_csv("rough.csv")

df1

Unnamed: 0.1,Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...,...
135,135,354946A1CA46,1.623079e+12,2657.0,2824.0,Opponents whose work depends on a cell phone l...,Counterclaim,Counterclaim 2,450 451 452 453 454 455 456 457 458 459 460 46...
136,136,1D35A6980E7F,1.622824e+12,388.0,424.0,"Although it tends to be distracting,",Counterclaim,Counterclaim 1,73 74 75 76 77 78
137,137,1D35A6980E7F,1.622824e+12,1777.0,1920.0,Texting while driving is a really problematic ...,Counterclaim,Counterclaim 2,328 329 330 331 332 333 334 335 336 337 338 33...
138,138,40CC76613B2D,1.623002e+12,1120.0,1216.0,To say that there is a time devices should be ...,Counterclaim,Counterclaim 1,191 192 193 194 195 196 197 198 199 200 201 20...


In [6]:
# Discourse-type label of every row of df1.
df1_classes=df1["discourse_type"].values

# Stop-word removal followed by Porter stemming: one cleaned string per row.
df1_corpus = stemming_stopwords_removing(df1)

# Stemmed token -> total number of occurrences over the whole corpus.
df1_index_words = get_total_index_words(df1_corpus)

# Vocabulary in first-seen order; every keyword becomes one attribute column.
df1_keywords = list(df1_index_words.keys())

total_sample_keywords=df1_keywords.copy()

# keyword -> column, so each token costs one dict probe instead of two scans
# (`in` plus `.index`) through the vocabulary list.
keyword_column = {word: col for col, word in enumerate(total_sample_keywords)}

# Term-count attribute table: one row per document, one column per keyword.
attributes=np.zeros((len(df1),len(total_sample_keywords)))

for row, document in enumerate(df1_corpus):
    for token in document.split():
        col = keyword_column.get(token)
        if col is not None:
            attributes[row, col] += 1

In [7]:
# Display the term-count table (numpy abbreviates large arrays on its own).
attributes

array([[1., 1., 1., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 1., 0., 0.],
       [0., 0., 0., ..., 0., 1., 1.]])

In [8]:
# np.array() copies the table; the cell then shows the length of its first
# row, i.e. the number of keyword columns.
attributes=np.array(attributes)
len(attributes[0])

778

In [9]:
# Integer-encode the discourse type of every row into a new column.
df1["encoded_dicourse_type"]=df1["discourse_type"].apply(encoding_discourse_type)
y1=df1["encoded_dicourse_type"].values

# NOTE(review): the literal below immediately replaces the labels derived from
# df1 above. It looks like a pasted copy of that output - confirm the two
# agree, then keep only one source of truth.
y1=np.array([0, 1, 2, 2, 3, 2, 2, 3, 2, 4, 1, 3, 2, 4, 0, 1, 3, 2, 5, 6, 4, 0,
       1, 3, 3, 3, 3, 2, 3, 2, 3, 2, 4, 0, 1, 3, 2, 3, 2, 3, 2, 4, 0, 3,
       3, 3, 1, 3, 2, 3, 2, 3, 2, 4, 1, 3, 3, 3, 2, 2, 3, 2, 4, 0, 1, 3,
       2, 3, 2, 2, 4, 1, 3, 2, 3, 2, 3, 2, 4, 0, 1, 3, 2, 3, 2, 0, 1, 3,
       2, 3, 2, 4, 1, 0, 3, 2, 5, 2, 4, 0, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6,
       6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5, 5,
       5, 5, 5, 5, 5, 5, 5, 5],np.int64)

In [10]:
# Hold out part of the data for testing. The split is stratified on the class
# labels and seeded so it is the same on every run; no test size is passed,
# so the library default applies.
X_train, X_test, y_train, y_test = train_test_split(
    attributes,
    y1,
    stratify=y1,
    random_state=0,
)





In [14]:
# Configure FRNN with range normalisation as its only preprocessor, build a
# model from the training split, then score the held-out documents.
# NOTE(review): presumably `scores` holds one column per class - confirm
# against the frlearn documentation.
clf = FRNN(preprocessors=(RangeNormaliser(), ))
model = clf(X_train, y_train)
scores = model(X_test)

In [15]:
# Convert scores to probabilities and calculate the AUROC.
probabilities = probabilities_from_scores(scores)
auroc = roc_auc_score(y_test, probabilities, multi_class='ovo')
print('AUROC:', auroc)

# Select classes with the highest scores and calculate the accuracy.
classes = select_class(scores)

accuracy = accuracy_score(y_test, classes)
# NOTE(review): the recorded run prints an accuracy of about 0.14 next to an
# AUROC of about 0.79, and almost every prediction is class 1. The scores rank
# classes reasonably but the arg-max collapses onto one class - worth
# investigating before trusting the accuracy figure.
print('accuracy:', accuracy)

AUROC: 0.7863095238095239
accuracy: 0.14285714285714285


In [16]:
# Predicted class per test document, for comparison with y_test.
classes

array([1, 1, 1, 1, 1, 1, 1, 5, 1, 1, 1, 1, 1, 1, 1, 1, 1, 6, 1, 1, 1, 1,
       1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1], dtype=int64)

In [17]:
# True class per test document.
y_test

array([3, 6, 3, 4, 2, 1, 0, 5, 3, 2, 2, 1, 4, 0, 2, 5, 3, 6, 3, 2, 5, 5,
       2, 3, 3, 6, 4, 5, 6, 2, 2, 6, 1, 5, 3], dtype=int64)