In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import PorterStemmer

In [2]:

class RoughSetsReducer:

    def __size(self, x):
        return (1, x.shape[0]) if x.ndim == 1 else x.shape

    '''
    Calculates indiscernibility relation
    '''
    def indisc(self, a, x):

        def codea(a, x, b):
            yy = 0
            print(a,len(x),b)
            for i in range(0, a):
                yy += (x[i] * (b**(a-(i+1))%10000000000000))

            return yy

        p, q = self.__size(x)
        ap, aq = self.__size(a)
        z = [e for e in range(1, q+1)]
        tt = np.setdiff1d(z, a)
        tt_ind = np.setdiff1d(z, tt)-1
        if x.ndim == 1:
            x = x[tt_ind]
        else:
            x = x[:, tt_ind]
        y = x
        v = [codea(aq, y, 10) for i in range(0, p)] if y.ndim == 1 \
            else [codea(aq, y[i, :], 10) for i in range(0, p)]
        y = np.transpose(v)
        if y.shape[0] == 1 and len(y.shape) == 1:
            I, yy = [1], [y]
            y = np.hstack((y, I))
            b, k, l = [y], [1], [1]
        else:
            ax = 1 if y.ndim > 1 else 0
            yy = np.sort(y, axis=ax)
            I = y.argsort(axis=ax)
            y = np.hstack((yy, I))
            b, k, l = np.unique(yy, return_index=True, return_inverse=True)
        y = np.hstack((l, I))
        m = np.max(l)
        aa = np.zeros((m+1, p), dtype=int)
        for ii in range(0, m+1):
            for j in range(0, p):
                if l[j] == ii:
                    aa[ii, j] = I[j]+1
        return aa

    '''
    Calculates lower approximation set of y
    '''
    def rslower(self, y, a, T):
        z = self.indisc(a, T)
        w = []
        p, q = self.__size(z)
        for u in range(0, p):
            zz = np.setdiff1d(z[u, :], 0)
            if np.in1d(zz, y).all():
                w = np.hstack((w, zz))
        return w.astype(dtype=int)

    '''
    Calculates upper approximation set of y
    '''
    def rsupper(self, y, a, T):
        z = self.indisc(a, T)
        w = []
        p, q = self.__size(z)
        for u in range(0, p):
            zz = np.setdiff1d(z[u, :], 0)
            zzz = np.intersect1d(zz, y)
            if len(zzz) > 0:
                w = np.hstack((w, zz))
        return w.astype(dtype=int)


    def __pospq(self, p, q):
        pm, pn = self.__size(p)
        qm, qn = self.__size(q)
        num = 0
        pp, qq = [[]] * pm, [[]] * qm
        for i in range(0, pm):
            pp[i] = np.unique(p[i, :])
        for j in range(0, qm):
            qq[j] = np.unique(q[j, :])
        b = []
        for i in range(0, qm):
            for j in range(0, pm):
                if np.in1d(pp[j], qq[i]).all():
                    num += 1
                    b = np.hstack((b, pp[j]))
        bb = np.unique(b)
        if bb.size == 0:
            dd = 1
        else:
            _, dd = self.__size(bb)
        y = float(dd - 1)/pn if 0 in bb else float(dd)/pn
        b = np.setdiff1d(bb, 0)
        return y, b

    '''
    Extract core set from C to D
    '''
    def core(self, C, D):
        x = np.hstack((C, D))
        c = np.array(range(1, C.shape[1]+1))
        d = np.array([C.shape[1]+1])
        cp, cq = self.__size(c)
        q = self.indisc(d, x)
        pp = self.indisc(c, x)
        b, w = self.__pospq(pp, q)
        a, k, kk, p = ([[]] * cq for i in range(4))
        y = []
        for u in range(0, cq):
            ind = u+1
            a[u] = np.setdiff1d(c, ind)
            p[u] = self.indisc(a[u], x)
            k[u], kk[u] = self.__pospq(p[u], q)
            if k[u] != b:
                y = np.hstack((y, ind))
        return np.array(y)

    def __sgf(self, a, r, d, x):
        pr = self.indisc(r, x)
        q = self.indisc(d, x)
        b = np.hstack((r, a))
        pb = self.indisc(b, x)
        p1, _ = self.__pospq(pb, q)
        p2, _ = self.__pospq(pr, q)
        return p1 - p2

    '''
    Return the set of irreducible attributes
    '''
    def reduce(self, C, D):

        def redu2(i, re, c, d, x):
            yre = re
            re1, re2 = self.__size(re)
            q = self.indisc(d, x)
            p = self.indisc(c, x)
            pos_cd, _ = self.__pospq(p, q)
            y, j = None, None
            for qi in range(i, re2):
                re = np.setdiff1d(re, re[qi])
                red = self.indisc(re, x)
                pos_red, _ = self.__pospq(red, q)
                if np.array_equal(pos_cd, pos_red):
                    y = re
                    j = i
                    break
                else:
                    y = yre
                    j = i + 1
                    break
            return y, j

        x = np.hstack((C, D))
        c = np.array(range(1, C.shape[1]+1))
        d = np.array([C.shape[1]+1])
        y = self.core(C, D)
        q = self.indisc(d, x)
        p = self.indisc(c, x)
        pos_cd, _ = self.__pospq(p, q)
        re = y
        red = self.indisc(y, x)
        pos_red, _ = self.__pospq(red, q)
        while pos_cd != pos_red:
            cc = np.setdiff1d(c, re)
            c1, c2 = self.__size(cc)
            yy = [0] * c2
            for i in range(0, c2):
                yy[i] = self.__sgf(cc[i], re, d, x)
            cd = np.setdiff1d(c, y)
            d1, d2 = self.__size(cd)
            for i in range(d2, c2, -1):
                yy[i] = []
            ii = np.argsort(yy)
            for v1 in range(c2-1, -1, -1):
                v2 = ii[v1]
                re = np.hstack((re, cc[v2]))
                red = self.indisc(re, x)
                pos_red, _ = self.__pospq(red, q)
        re1, re2 = self.__size(re)
        core = y
        for qi in range(re2-1, -1, -1):
            if re[qi] in core:
                y = re
                break
            re = np.setdiff1d(re, re[qi])
            red = self.indisc(re, x)
            pos_red, _ = self.__pospq(red, q)
            if np.array_equal(pos_cd, pos_red):
                y = re
        y1, y2 = self.__size(y)
        j = 0
        for i in range(0, y2):
            y, j = redu2(j, y, c, d, x)
        return y

    


In [3]:
# import numpy as np
from sklearn.base import BaseEstimator
from sklearn.feature_selection._base import SelectorMixin
# from scikit_roughsets.roughsets import RoughSetsReducer


class RoughSetsSelector(BaseEstimator, SelectorMixin):

    def _get_support_mask(self):
        return self.mask_

    def fit(self, X, y=None):
        # Missing values are not supported yet!
        if np.isnan(X).any():
            raise ValueError("X must not contain any missing values")
        if np.isnan(y).any():
            raise ValueError("y must not contain any missing values")
        # Check that X and Y contains only integer values
        if not np.all(np.equal(np.mod(X, 1), 0)):
            raise ValueError("X must contain only integer values")
        if not np.all(np.equal(np.mod(y, 1), 0)):
            raise ValueError("y must contain only integer values")

        reducer = RoughSetsReducer()
        selected_ = reducer.reduce(X, y)
        B_unique_sorted, B_idx = np.unique(np.array(range(X.shape[1])), return_index=True)
        B_unique_sorted = B_unique_sorted + 1  # Shift elements by one, as RS index array starts by one
        self.mask_ = np.in1d(B_unique_sorted, selected_, assume_unique=True)

        if self.mask_.size == 0:
            raise ValueError("No features were selected by rough sets reducer")
        return self

In [4]:
def indiscernibility(matrix,y):
    df=pd.DataFrame(matrix)
    # this will return the list of columns     
    y=list(df.columns)
    grouped_df=df.groupby(y)
    
    ind_R=list(list())
    for key, item in grouped_df:
#         print(grouped_df.get_group(key), "\n",grouped_df.get_group(key).index ,"\n\n")
        lis=[]
        for i in grouped_df.get_group(key).index:
            lis.append(i)
        ind_R.append(list(lis))
    return ind_R

def encoding_discourse_type(x):
    if x=="Lead":
        return 0
    if x=="Position":
        return 1
    if x=="Evidence":
        return 2
    if x=="Claim":
        return 3
    if x=="Concluding Statement":
        return 4
    if x=="Counterclaim":
        return 5
    if x=='Rebuttal':
        return 6
    
def stemming_stopwords_removing(df):
    corpus=[]
    for i in range(len(df)):
        review=re.sub('[^a-zA-Z]',' ',df["discourse_text"][i])
        review=review.lower()
        review=review.split()
        ps=PorterStemmer()
        all_stopwords=stopwords.words("english")
        review=[ps.stem(word) for word in review if not word in set(all_stopwords)]
        review=' '.join(review)
        corpus.append(review)
    return corpus

# storing the total occurrence.......
def get_total_index_words(corpus):
    index_word={}
    for i in corpus:
        s=i.split()
        for j in s:
            if j not in index_word:
                index_word[j]=1
            else:
                index_word[j]+=1
    return index_word
    
def get_values(dataset,threshold=1):
    
    # taking sample of 20 documents for lead category....
    df = dataset

    total_corpus = stemming_stopwords_removing(df)
    # print(total_corpus)

    # getting total index words and their count in the taken sample as a dict
    total_index_words = get_total_index_words(total_corpus)
    # print(len(lead_index_words))

    # Creating a list of total keywords before filtering..
    total_keywords = list(total_index_words.keys())

    # Creating a matrix of width equals len(lead_keywords)
    matrix=np.zeros((len(df),len(total_keywords)))

    
    # Storing occurrence of each term in each document respectively
    for i in range(len(total_corpus)):
        s = total_corpus[i].split()
        for h in s:
            j = total_keywords.index(h)
            matrix[i,j] += 1


    # Storing their weights....
    weighted_matrix = np.copy(matrix)
    
    for i in range(len(df)):
        for j in range(len(total_keywords)):
            weighted_matrix[i,j] = weighted_matrix[i,j] / total_index_words[total_keywords[j]]
    #            print(weighted_matrix[i,j])



    # FILTERING WEIGHTS with a threshold.......
    valid_index = []
    for i in range(len(df)):
        for j in range(len(total_keywords)):
            if weighted_matrix[i,j] >= threshold:
                valid_index.append(j)

    # removing duplicates and storing them in a list.......    
    valid_index = list(set(valid_index))


    # # Storing the final keywords.... 
    valid_index_words = []
    for i in range(len(valid_index)):
        valid_index_words.append(total_keywords[valid_index[i]])
    # print(valid_lead_index_words)

    return total_keywords,total_index_words,matrix, weighted_matrix, valid_index, valid_index_words

def get_test_matrix(dataset,total_sample_keywords,threshold=1):
      # taking sample of 20 documents for lead category....
    df = dataset

    total_corpus = stemming_stopwords_removing(df)
    # print(total_corpus)

    # getting total index words and their count in the taken sample as a dict
    total_index_words = get_total_index_words(total_corpus)


    # Creating a list of total keywords before filtering..
    total_keywords = total_sample_keywords
    # print(lead_keywords)

    # Creating a matrix of width equals len(lead_keywords)
    matrix=np.zeros((len(df),len(total_keywords)))
    print(len(df),len(total_keywords))

    
    # Storing occurrence of each term in each document respectively
    for i in range(len(total_corpus)):
        s = total_corpus[i].split()
        for h in s:
            try:
                j = total_keywords.index(h)
                matrix[i,j] += 1
            except:
                continue
            
    # Storing their weights....
    weighted_matrix = np.copy(matrix)
    
    for i in range(len(df)):
        for j in range(len(total_keywords)):
            try:
                weighted_matrix[i,j] = weighted_matrix[i,j] / total_index_words[total_keywords[j]]
            except:
                continue


    # FILTERING WEIGHTS with a threshold.......
    valid_index = []
    for i in range(len(df)):
        for j in range(len(total_keywords)):
            if weighted_matrix[i,j] >= threshold:
                valid_index.append(j)

    # removing duplicates and storing them in a list.......    
    valid_index = list(set(valid_index))
    


    # # Storing the final keywords.... 
    valid_index_words = []
    for i in range(len(valid_index)):
        valid_index_words.append(total_keywords[valid_index[i]])
    # print(valid_lead_index_words)

    return total_keywords,total_index_words,matrix, weighted_matrix, valid_index, valid_index_words


In [5]:
train=pd.read_csv("train.csv")
train_head=train.head(5000)
train_tail=train.tail(5000)
train=pd.concat([train_head,train_tail])
train.reset_index(inplace=True)
train

Unnamed: 0,index,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...,...
9995,144288,4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
9996,144289,4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
9997,144290,4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
9998,144291,4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [6]:
# position=train[train["discourse_type"]=="Position"].copy()
# position.reset_index(inplace=True)

# evidence=train[train["discourse_type"]=="Evidence"].copy()
# evidence.reset_index(inplace=True)

# claim=train[train["discourse_type"]=="Claim"].copy()
# claim.reset_index(inplace=True)

# counter_claim=train[train["discourse_type"]=="Counterclaim"].copy()
# counter_claim.reset_index(inplace=True)

# rebuttal=train[train["discourse_type"]=="Rebuttal"].copy()
# rebuttal.reset_index(inplace=True)

# lead=train[train["discourse_type"]=="Lead"].copy()
# lead.reset_index(inplace=True)

# concluding_statement=train[train["discourse_type"]=="Concluding Statement"].copy()
# concluding_statement.reset_index(inplace=True)

In [7]:
# len(concluding_statement)

In [8]:
# Now i will send my training data to get its list of final keywords after eliminating useles words from our list
# using weighted matrix .....

train_total_keywords,train_total_index_words, train_matrix, train_weighted_matrix, train_valid_index, train_final_keywords=get_values(train)

In [9]:

total_sample_keywords=train_final_keywords.copy()
    
total_sample_keywords=list(set(total_sample_keywords))

len(total_sample_keywords)
# this is the final set of sample keywords that we collected from 3500 different texts from all the 7 classes

2381

In [10]:
# Making a dictionary to store index of the keywords in "total_sample_keywords"......
keywords_index={}
for i in range(len(total_sample_keywords)):
    keywords_index[total_sample_keywords[i]]=i
# print(keywords_index)

index_keywords={}
for i in keywords_index:
    index_keywords[keywords_index[i]]=i
# index_keywords

In [11]:
train_attr=np.zeros((len(train),len(total_sample_keywords)),np.float32)
# Storing occurrence of each term in each document respectively
total_train_corpus=stemming_stopwords_removing(train)
for i in range(len(total_train_corpus)):
    s = total_train_corpus[i].split()
    for h in s:
        if h in train_final_keywords:
            try:
                j = total_sample_keywords.index(h)
                train_attr[i,j] += 1
            except:
                continue
                
# train_attr
  
# U can also check your accuracy whith the below code...
# for i in range(len(total_test_corpus)):
#     s = total_test_corpus[i].split()
#     for h in s:
#         try:
#             j = total_sample_keywords.index(h)
#             test_attributes[i,j] += 1
#         except:
#             continue
            

In [12]:
# Now find indiscernibility of ur attributes table........
decision_attr=train["discourse_type"].values
ind=indiscernibility(train_attr,decision_attr)
# print(ind)

#### PHASE-2

In [14]:
# NOW REMOVING IDENTICAL COLUMNS FROM OUR MATRIX INPLACE.......
print("shape of old training attribute",train_attr.shape)
grp=np.column_stack(train_attr)
temp=pd.DataFrame(grp)
# print(temp.shape)

temp_grp=indiscernibility(temp,list(temp.columns))
# print(temp_grp)

unique_columns=[]
for i in range(len(temp_grp)):
    unique_columns.append(temp_grp[i][0])
    
unique_columns.sort()
print()
# print("Unique columns:",unique_columns)
print("\nNo of unique columns:",len(unique_columns))

unique_col_table=list(list())
for i in unique_columns:
    unique_col_table.append(list(grp[i]))
unique_col_table=np.array(unique_col_table)
# print("\nunique_col_table:")
print(unique_col_table)
print("\n unique_col_table shape",unique_col_table.shape)

new_train_attributes=np.column_stack(unique_col_table)


print("\nnew train attributes after preserving only one repetitve element:\n",new_train_attributes)
print("\nshape of new train attributes:",new_train_attributes.shape)

repetitive_columns=list(set(list(range(len(train_attr[0]))))-set(unique_columns))
print("number of repeated columns are: ", len(repetitive_columns))

shape of old training attribute (10000, 2381)


No of unique columns: 1552
[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

 unique_col_table shape (1552, 10000)

new train attributes after preserving only one repetitve element:
 [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]]

shape of new train attributes: (10000, 1552)
number of repeated columns are:  829


In [15]:
# Now you have unique columns and preserved only one copy of each repeated elements..
# that is you have the unique column's word.....now make a new index

# Now we will check if ind changed or not
new_ind=indiscernibility(new_train_attributes,decision_attr)
# print(ind)
# print("\n",new_ind)
# so ind of the relation didnt change....

new_sample_keywords=[]
for i in unique_columns:
    new_sample_keywords.append(index_keywords[i])
# print(new_sample_keywords)


# Making a dictionary to store index of the keywords in "total_sample_keywords"......
new_keywords_index={}
for i in range(len(new_sample_keywords)):
    new_keywords_index[new_sample_keywords[i]]=i
   
# print(new_keywords_index)

new_index_keywords={}
for i in new_keywords_index:
    new_index_keywords[new_keywords_index[i]]=i
# new_index_keywords


In [18]:
decision_attr=train["discourse_type"]
decision_attr=decision_attr.apply(lambda x:encoding_discourse_type(x))
decision_attr=decision_attr.values
decision_attr

array([0, 1, 2, ..., 1, 2, 4], dtype=int64)

In [19]:
# Calculation fo lower approximation starts here...
groups_index={}
grp=0 # iam following grp index from 0...
for i in new_ind:
  for j in range(len(i)):
    groups_index[i[j]]=grp
  grp+=1
# print(groups_index)


{0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, 8: 0, 9: 0, 10: 0, 11: 0, 12: 0, 13: 0, 15: 0, 18: 0, 19: 0, 20: 0, 21: 0, 22: 0, 23: 0, 24: 0, 25: 0, 26: 0, 27: 0, 28: 0, 29: 0, 30: 0, 33: 0, 34: 0, 35: 0, 36: 0, 38: 0, 39: 0, 40: 0, 41: 0, 42: 0, 43: 0, 44: 0, 45: 0, 46: 0, 47: 0, 48: 0, 49: 0, 51: 0, 52: 0, 53: 0, 54: 0, 56: 0, 57: 0, 58: 0, 60: 0, 61: 0, 62: 0, 63: 0, 64: 0, 65: 0, 66: 0, 67: 0, 68: 0, 69: 0, 70: 0, 72: 0, 74: 0, 76: 0, 78: 0, 85: 0, 86: 0, 87: 0, 88: 0, 89: 0, 90: 0, 91: 0, 92: 0, 95: 0, 96: 0, 97: 0, 99: 0, 101: 0, 102: 0, 103: 0, 104: 0, 106: 0, 108: 0, 109: 0, 110: 0, 111: 0, 112: 0, 115: 0, 116: 0, 117: 0, 118: 0, 119: 0, 120: 0, 121: 0, 122: 0, 123: 0, 124: 0, 126: 0, 127: 0, 128: 0, 129: 0, 130: 0, 131: 0, 133: 0, 135: 0, 137: 0, 139: 0, 140: 0, 141: 0, 142: 0, 143: 0, 144: 0, 145: 0, 146: 0, 147: 0, 148: 0, 149: 0, 150: 0, 151: 0, 153: 0, 155: 0, 156: 0, 157: 0, 158: 0, 159: 0, 160: 0, 161: 0, 163: 0, 164: 0, 165: 0, 166: 0, 167: 0, 168: 0, 169: 0, 171: 0,

In [20]:

# Step -1: Find all the rows under one category....
def get_class(grp,y):
  lead_index=[False]*len(y)
  for i in range(len(y)):
    if y[i]==grp:
      lead_index[i]=True

  return lead_index

def get_lower_approximation(grp,ind,y):
  lead_index=get_class(grp,y)
  lower_approximation=[]
  for i in ind:
    temp=True
    for j in range(len(i)):
      if not lead_index[i[j]]:
        temp=False
        break
    if temp:
      for j in range(len(i)):
        lower_approximation.append(i[j])
  return lower_approximation

def get_upper_approximation(grp,ind,y):
  lead_index=get_class(grp,y)
  upper_approximation=[]
  for i in ind:
    temp=False
    for j in range(len(i)):
      if lead_index[i[j]]:
        temp=True
        break
    if temp:
      for j in range(len(i)):
        upper_approximation.append(i[j])
  return upper_approximation
  

In [21]:
# Finding lower apporximation for all the different classes we have and storing it in a matrix....
y=decision_attr
def get_lower_approximation_matrix(ind,decision_attr):
  lower_approx_matrix=list(list())
  y1=np.unique(y)
  for i in y1:
    lower_approx_matrix.append(list(get_lower_approximation(i,ind,y)))
  return lower_approx_matrix

lower_approx_matrix=get_lower_approximation_matrix(ind,y)

# print("lower approximation matrix is:",lower_approx_matrix)

    

# Finding upper apporximation for all the different classes we have and storing it in a matrix....
def get_upper_approximation_matrix(ind,y):
  upper_approx_matrix=list(list())
  y1=np.unique(y)
  for i in y1:
    upper_approx_matrix.append(list(get_upper_approximation(i,ind,y)))
  return upper_approx_matrix

upper_approx_matrix=get_upper_approximation_matrix(ind,y)

# print("upper approximation matrix is:",upper_approx_matrix)

# Defining uuniverse again as U:
U=list(range(0,len(y)))
universe=set(U)


# Finding boundary region for each classes....  "MAYBE" region
# Now from here on we will use set() datastructure in python and its predefined operations for further calculation....

def get_boundary_region_matrix(upper_approx_matrix,lower_approx_matrix):
  boundary_reg_matrix=list(list())
  for i in range(len(upper_approx_matrix)):
    upper_set=set(upper_approx_matrix[i])
    lower_set=set(lower_approx_matrix[i])
    boundary_reg_matrix.append(list(upper_set-lower_set))
  return boundary_reg_matrix

boundary_reg_matrix=get_boundary_region_matrix(upper_approx_matrix,lower_approx_matrix)

# print("boundary region matrix is:",boundary_reg_matrix)
# now note boundary region for any class is zero which means that they have equal lower and upper approximation


# Finding outside region for each class.... "NO" region
def get_outside_region_matrix(universe,upper_approx_matrix):
  outside_reg_matrix=list(list())
  for i in range(len(upper_approx_matrix)):
    upper_set=set(upper_approx_matrix[i])
    outside_reg_matrix.append(list(universe-upper_set))
  return outside_reg_matrix

outside_reg_matrix=get_outside_region_matrix(universe,upper_approx_matrix)
# print("outside region matrix is:",outside_reg_matrix)


lower approximation matrix is: [[3649, 4052, 9454, 993, 1729, 2642, 1694, 4728, 2885, 2040, 3148, 4995, 4492, 8372, 4168, 712, 1846, 1821, 2496, 4858, 2956, 9125, 1023, 1547, 2242, 3589, 277, 8225, 370, 6051, 6226, 4820, 1837, 4970, 8105, 9977, 2811, 125, 5085, 1685, 3526, 2486, 7955, 5993, 9114, 3840, 2950, 3719, 7296, 7101, 3790, 4539, 7672, 4073, 2897, 2385, 7362, 3121, 3877, 9810, 8845, 8619, 766, 9787, 3130, 5378, 6449, 4476, 1434, 8912, 1454, 838, 9231, 2052, 5590, 4309, 4768, 7741, 674, 1007, 2717, 9365, 5987, 4624, 7343, 4003, 7964, 1650, 7761, 5369, 1564, 154, 6139, 1555, 5286, 4828, 9991, 5265, 4040, 6644, 3371, 3089, 3283, 6473, 5887, 1959, 9637, 8935, 6720, 5422, 5411, 4064, 3505, 79, 2400, 6685, 7931, 2471, 1106, 8315, 865, 1046, 9797, 940, 2830, 9723, 8722, 8974, 2166, 6079, 684, 6303, 6433, 14, 8116, 93, 8363, 4023, 7236, 5899, 6516, 235, 3221, 6321, 5818, 4059], [1891, 1355, 7580, 3031, 9638, 80, 8894, 3780, 71, 3891, 1249, 4829, 5141, 9684, 5770, 1525, 1531, 4971, 9820

outside region matrix is: [[8195, 16, 17, 31, 32, 37, 50, 55, 59, 71, 73, 75, 77, 80, 81, 82, 83, 84, 94, 98, 8290, 100, 8291, 105, 8298, 107, 8299, 8301, 113, 114, 8313, 8320, 8321, 8322, 132, 134, 136, 138, 8330, 8336, 8338, 8340, 8342, 8343, 152, 8352, 162, 170, 8369, 178, 8381, 202, 8394, 8406, 223, 242, 8435, 8437, 253, 256, 8452, 8454, 268, 8472, 8481, 290, 8483, 8485, 8488, 302, 8501, 8506, 8507, 8509, 320, 8512, 8514, 8519, 8524, 8529, 338, 8540, 8542, 8557, 367, 8559, 8580, 8583, 8591, 8593, 8605, 417, 419, 8616, 8617, 430, 436, 8629, 443, 8636, 8642, 461, 462, 8654, 464, 472, 479, 8676, 488, 504, 505, 8702, 522, 526, 8725, 8730, 541, 8733, 544, 8739, 8740, 8741, 553, 555, 557, 559, 564, 8764, 574, 580, 582, 595, 8794, 605, 609, 610, 8805, 615, 616, 620, 622, 8815, 624, 625, 631, 633, 634, 8825, 636, 8843, 8847, 8853, 663, 664, 665, 8857, 8858, 669, 8866, 680, 8873, 683, 8883, 697, 8892, 8894, 8895, 8896, 8897, 8898, 707, 711, 8903, 730, 739, 8933, 8934, 8937, 746, 8939, 8941,

In [26]:
def get_set(approx_matrix,attributes):
    res=list(set())
    temp=[]
    for i in approx_matrix:
        for j in i:
            temp=np.where(attributes[j]!=0)
            temp=np.unique(temp)
        res.append(set(temp))
    return res

attr_lower_approx=get_set(lower_approx_matrix,new_train_attributes)
attr_upper_approx=get_set(upper_approx_matrix,new_train_attributes)
attr_boundary_approx=get_set(boundary_reg_matrix,new_train_attributes)
attr_outside_approx=get_set(outside_reg_matrix,new_train_attributes)

In [27]:
# REMOVING INCONSISTENCY TUPLES FROM OUR MATRICES using accuracy method...
# STEP-1: Find the inconsistent rows.....
# NOTE: Deletion of rows is done inplace..so runnnig this cell more than once will give undesirable result

# !--NotebookApp.iopub_data_rate_limit=1.0e10

def get_inconsistent_rows(ind,y):
  inconsistent_rows=list(list())

  for i in ind:
    if len(i)>1:
      temp=y[i[0]]
      for j in i:
        if y[j]!=temp:
          inconsistent_rows.append(list(i))
  return inconsistent_rows
      

inconsistent_rows=get_inconsistent_rows(ind,y)
# print(inconsistent_rows)

def find_accuracy(inconsistent_rows,lower_approx_matrix):
  highest_accuracy=-1
  accuracies=list(list())
  if len(inconsistent_rows)>0:
    # print(inconsistent_rows)
    
    for i in inconsistent_rows:
      lis=[]
      for j in i:
        cat=y[j]
        lower=lower_approx_matrix[cat]
        accuracy=len(lower)/len(U)
        lis.append(accuracy)
      accuracies.append(list(lis))
  return accuracies

accuracies=find_accuracy(inconsistent_rows,attr_lower_approx)

# now note this.. len(inconsistent_rows) and redundant rows will be same...
# so each element  in inconsistent_rows has its accuracy stored in accuracies...in the same format that of inconsistent_rows

  
def find_redundant_rows(accuracies,inconsistent_rows):
  redundant_rows=[]
  for i in range(len(accuracies)):
    max_accuracy=max(accuracies[i])
    for j in range(len(accuracies[i])):
      if accuracies[i][j]!=max_accuracy:
        redundant_rows.append(inconsistent_rows[i][j])
  return redundant_rows
redundant_rows=find_redundant_rows(accuracies,inconsistent_rows)
# print(len(redundant_rows))
# print(redundant_rows)


# Removing redundant rows if any...

if len(redundant_rows)>0:
  new_train_attributes=np.delete(new_train_attributes,redundant_rows,axis=0)

print("Done......")






In [34]:
# DELETEING SAME ROWS FROM decision_attr
if len(redundant_rows)>0:
  decision_attr=np.delete(decision_attr,redundant_rows,axis=0)

In [28]:
table_selected=new_train_attributes

In [35]:
decision_attr.shape

(2223,)

In [29]:
# new_train_attributes.shape

(2223, 1552)

In [31]:
universe=set(list(range(len(table_selected))))
len(universe)

2223

In [37]:
# NOW FIND NEW IND and STORE IT in "ind"
y=decision_attr
ind=indiscernibility(table_selected,decision_attr)

In [38]:
# NOW FIND LOWER APPROX. MATRIX, UPPER APPROX. MATRIX, BOUNDARY REGION, OUTSIDE REGION FOR NEW REDUCED MATRIX
lower_approx_matrix=get_lower_approximation_matrix(ind,decision_attr)
# print("New lower approximation matrix is:",lower_approx_matrix)

upper_approx_matrix=get_upper_approximation_matrix(ind,decision_attr)
# print("New upper approximation matrix is:",upper_approx_matrix)

boundary_reg_matrix=get_boundary_region_matrix(upper_approx_matrix,lower_approx_matrix)
# print("New boundary region matrix is:",boundary_reg_matrix)


outside_reg_matrix=get_outside_region_matrix(universe,upper_approx_matrix)
# print("New outside region matrix is:",outside_reg_matrix)

In [42]:
# NOW AGAIN FIND ATTRIBUTES RELATED TO LOWER,UPPER and outside region
attr_lower_approx=get_set(lower_approx_matrix,table_selected)
attr_upper_approx=get_set(upper_approx_matrix,table_selected)
attr_boundary_approx=get_set(boundary_reg_matrix,table_selected)
attr_outside_approx=get_set(outside_reg_matrix,table_selected)

In [43]:
# Now in above matrices we have rows which are for sure belongs toa particular rows..now i will use all the rows and get all
# the term index which are absolutely neccessary for each object to belong to a particular class.

same=[]
diff=[]
for i in range(len(boundary_reg_matrix)):
    if len(boundary_reg_matrix[i])==0:
        same.append(i)
    else:
        diff.append(i)
print(same)
print(diff)

[0, 1, 2, 3, 4, 5, 6]
[]


In [45]:
# LETS CHECK THE ACCURACY WITHIN THE DATASET USING ONLY LOWER APPROXIMATION:

res=[-1]*len(y)
for i in range(len(lower_approx_matrix)):
    for j in lower_approx_matrix[i]:
        res[j]=i
        
# print(res)
def accuracy(res,y):
    matched=0
    unmatched=0
    for i in range(len(res)):
        if res[i]==y[i]:
            matched+=1
    unmatched=len(res)-matched
    return matched,unmatched
success,failed=accuracy(res,y)
print(success/len(y))

1.0


In [54]:
test=train.tail(5000).copy()
test.reset_index(inplace=True)

In [55]:
len(new_sample_keywords)

1552

In [56]:
test_total_keywords,test_total_index_words, test_matrix, test_weighted_matrix, test_valid_index, test_final_keywords=get_test_matrix(test,new_sample_keywords,threshold=1)

5000 1552


In [None]:
test_weighted_matrix

In [59]:
test["encoded_dicourse_type"]=test["discourse_type"].copy()
test["encoded_dicourse_type"]=test["encoded_dicourse_type"].apply(lambda x: encoding_discourse_type(x))
test_y1=test["encoded_dicourse_type"].values
test_y1

array([4, 0, 1, ..., 1, 2, 4], dtype=int64)

In [64]:
for row in test_weighted_matrix:
    region_approx=list(list())
    highest_overlapping=list(list())
    x=set(list(np.unique(np.where(row!=0))))
    print(x)
    break

set()
