In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import re
import nltk
# nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from nltk.stem.porter import PorterStemmer

In [2]:
def encoding_discourse_type(x):
    """Translate a discourse-type label into its integer code.

    The seven known labels map to 0..6 in the order listed below.
    Any other value falls through and yields ``None``.
    """
    label_codes = (
        ("Lead", 0),
        ("Position", 1),
        ("Evidence", 2),
        ("Claim", 3),
        ("Concluding Statement", 4),
        ("Counterclaim", 5),
        ("Rebuttal", 6),
    )
    # Compare with == (rather than hashing) so any input type is accepted.
    for label, code in label_codes:
        if x == label:
            return code
    return None
    
def stemming_stopwords_removing(df):
    """Clean every ``discourse_text`` entry of ``df`` and return the results.

    For each row, in positional order, the text is stripped of every
    character that is not an ASCII letter, lower-cased, split on whitespace,
    filtered against the NLTK English stop-word list and stemmed with the
    Porter stemmer. The surviving stems are re-joined with single spaces.

    Parameters
    ----------
    df : object supporting ``df["discourse_text"]`` and iteration over it
        In this notebook a pandas DataFrame.

    Returns
    -------
    list of str
        One cleaned string per row, in the same order as the rows.
    """
    # Built once: both are identical for every row and every word.
    ps = PorterStemmer()
    all_stopwords = set(stopwords.words("english"))

    corpus = []
    # Iterate over the values themselves instead of looking rows up by the
    # labels 0..n-1, so frames with a filtered/non-default index also work.
    for text in df["discourse_text"]:
        words = re.sub('[^a-zA-Z]', ' ', text).lower().split()
        stems = [ps.stem(word) for word in words if word not in all_stopwords]
        corpus.append(' '.join(stems))
    return corpus

# storing the total occurrence.......
def get_total_index_words(corpus):
    """Count token occurrences over all documents in ``corpus``.

    Each document is split on whitespace. The returned dict maps every token
    to the number of times it appears in the whole corpus, with keys in
    first-seen order.
    """
    counts = {}
    for document in corpus:
        for token in document.split():
            counts[token] = counts.get(token, 0) + 1
    return counts
    
def get_values(dataset, threshold=1, sample_size=20):
    """Build term-count and term-weight matrices for a sample of ``dataset``.

    The first ``sample_size`` rows are cleaned with
    ``stemming_stopwords_removing`` and every distinct stem becomes one
    column. A stem's weight in a document is its count in that document
    divided by its total count over the sample, so a weight of 1 means the
    stem occurs in that document only.

    Parameters
    ----------
    dataset : object with a ``head(n)`` method (a pandas DataFrame here)
        Source rows; must be accepted by ``stemming_stopwords_removing``.
    threshold : number, default 1
        A stem is kept when its weight reaches ``threshold`` in at least one
        document. (Previously accepted but ignored; 1 was always used.)
    sample_size : int, default 20
        Number of leading rows to analyse.

    Returns
    -------
    tuple
        ``(total_keywords, total_index_words, matrix, weighted_matrix,
        valid_index, valid_index_words)`` where

        * ``total_keywords``    - list of distinct stems, first-seen order
        * ``total_index_words`` - dict stem -> total count in the sample
        * ``matrix``            - float16 array (documents x stems) of counts
        * ``weighted_matrix``   - float16 array of the weights
        * ``valid_index``       - ascending column indices that pass the filter
        * ``valid_index_words`` - the stems at those columns
    """
    df = dataset.head(sample_size)

    total_corpus = stemming_stopwords_removing(df)
    total_index_words = get_total_index_words(total_corpus)
    total_keywords = list(total_index_words.keys())

    # Constant-time stem -> column lookup instead of list.index per token.
    column_of = {word: col for col, word in enumerate(total_keywords)}

    # Per-document occurrence counts.
    matrix = np.zeros((len(total_corpus), len(total_keywords)), np.float16)
    for row, document in enumerate(total_corpus):
        for word in document.split():
            matrix[row, column_of[word]] += 1

    # Divide every column by that stem's total count. The division is done in
    # float32 and the result stored as float16, like the count matrix.
    totals = np.array(
        [total_index_words[word] for word in total_keywords], dtype=np.float32
    )
    weighted_matrix = (matrix.astype(np.float32) / totals).astype(np.float16)

    # Keep the columns whose weight reaches the threshold in any document.
    # The comparison is done in float64 so the threshold itself is not rounded.
    passes = (weighted_matrix.astype(np.float64) >= threshold).any(axis=0)
    valid_index = np.flatnonzero(passes).tolist()
    valid_index_words = [total_keywords[col] for col in valid_index]

    return total_keywords,total_index_words,matrix, weighted_matrix, valid_index, valid_index_words


In [3]:
# Load the training data from train.csv (resolved against the working directory).
df=pd.read_csv("train.csv")
# Bare expression so the notebook renders the frame as the cell output.
df

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
144288,4C471936CD75,1.618153e+12,2234.0,3203.0,if I'm not sure what college I want to attend...,Evidence,Evidence 2,386 387 388 389 390 391 392 393 394 395 396 39...
144289,4C471936CD75,1.618153e+12,3221.0,4509.0,seeking multiple opinions before making a har...,Evidence,Evidence 3,576 577 578 579 580 581 582 583 584 585 586 58...
144290,4C471936CD75,1.618025e+12,4510.0,4570.0,it is better to seek multiple opinions instead...,Position,Position 1,828 829 830 831 832 833 834 835 836 837 838
144291,4C471936CD75,1.618025e+12,4570.0,4922.0,The impact of asking people to help you make a...,Evidence,Evidence 4,839 840 841 842 843 844 845 846 847 848 849 85...


In [4]:
# Split the training frame into one sub-frame per discourse type.
# The names on the left line up one-to-one with the labels in the tuple.
(lead, position, evidence, claim,
 counter_claim, rebuttal, concluding_statement) = (
    df[df["discourse_type"].eq(label)]
    for label in (
        "Lead",
        "Position",
        "Evidence",
        "Claim",
        "Counterclaim",
        "Rebuttal",
        "Concluding Statement",
    )
)

In [5]:
# Display the rows whose discourse_type is "Concluding Statement".
concluding_statement

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
9,423A1CA112E2,1.622628e+12,1891.0,2027.0,The news always updated when people do somethi...,Concluding Statement,Concluding Statement 1,355 356 357 358 359 360 361 362 363 364 365 36...
13,A8445CABFECE,1.622576e+12,1031.0,1243.0,"In conclusion, drivers should not able to work...",Concluding Statement,Concluding Statement 1,177 178 179 180 181 182 183 184 185 186 187 18...
20,6B4F7A0165B9,1.622644e+12,1682.0,1906.0,The creation of telecommunication devices was ...,Concluding Statement,Concluding Statement 1,296 297 298 299 300 301 302 303 304 305 306 30...
32,E05C7F5C1156,1.622839e+12,2878.0,3513.0,"In conclusion, people shouldn't use cellphones...",Concluding Statement,Concluding Statement 1,478 479 480 481 482 483 484 485 486 487 488 48...
41,50B3435E475B,1.622645e+12,1914.0,2047.0,"Over all texting and driving is dangerous, has...",Concluding Statement,Concluding Statement 1,328 329 330 331 332 333 334 335 336 337 338 33...
...,...,...,...,...,...,...,...,...
144251,8F4B595CF9E7,1.617987e+12,2907.0,3155.0,Seeking out multiple people for good advice is...,Concluding Statement,Concluding Statement 1,533 534 535 536 537 538 539 540 541 542 543 54...
144260,6B5809C83978,1.618239e+12,1804.0,2113.0,By gathering different opinions it will help p...,Concluding Statement,Concluding Statement 1,315 316 317 318 319 320 321 322 323 324 325 32...
144269,408A7D3D2EEC,1.618325e+12,1680.0,2052.0,"In conclusion, asking advice can help someone ...",Concluding Statement,Concluding Statement 1,295 296 297 298 299 300 301 302 303 304 305 30...
144283,AFEC37C2D43F,1.617803e+12,2907.0,3140.0,The odds are in your favor; and the on the off...,Concluding Statement,Concluding Statement 1,505 506 507 508 509 510 511 512 513 514 515 51...


In [6]:
# lead.to_csv("lead.csv")
# position.to_csv("position.csv")
# evidence.to_csv("evidence.csv")
# claim.to_csv("claim.csv")
# counter_claim.to_csv("counter_claim.csv")
# rebuttal.to_csv("rebuttal.csv")
# concluding_statement.to_csv("concluding_statement.csv")

In [7]:
def _read_or_export(frame, path):
    """Return the CSV at ``path`` as a DataFrame.

    If the file does not exist yet, ``frame`` is written to ``path`` first
    (index included, as ``DataFrame.to_csv`` does by default) and then read
    back, so the result has the same layout either way.
    """
    try:
        return pd.read_csv(path)
    except FileNotFoundError:
        frame.to_csv(path)
        return pd.read_csv(path)


# Reload every per-class frame from its CSV file. The export cell above is
# commented out, so on a clean checkout the files are created here from the
# in-memory frames instead of failing with FileNotFoundError.
position=_read_or_export(position,"position.csv")
evidence=_read_or_export(evidence,"evidence.csv")
claim=_read_or_export(claim,"claim.csv")
counter_claim=_read_or_export(counter_claim,"counter_claim.csv")
rebuttal=_read_or_export(rebuttal,"rebuttal.csv")
lead=_read_or_export(lead,"lead.csv")
concluding_statement=_read_or_export(concluding_statement,"concluding_statement.csv")

In [8]:
# Run the keyword extraction once per discourse type. Every call returns
# (keywords, keyword counts, count matrix, weight matrix, kept column
# indices, kept keywords) for that class's sample.
lead_total_keywords,lead_total_index_words, lead_matrix, lead_weighted_matrix, lead_valid_index, lead_final_keywords=get_values(lead)

claim_total_keywords,claim_total_index_words, claim_matrix, claim_weighted_matrix, claim_valid_index, claim_final_keywords=get_values(claim)

counter_claim_total_keywords,counter_claim_total_index_words, counter_claim_matrix, counter_claim_weighted_matrix, counter_claim_valid_index, counter_claim_final_keywords=get_values(counter_claim)

evidence_keywords,evidence_index_words, evidence_matrix, evidence_weighted_matrix, evidence_valid_index, evidence_final_keywords=get_values(evidence)

position_total_keywords,position_total_index_words, position_matrix, position_weighted_matrix, position_valid_index, position_final_keywords=get_values(position)

# Fixed: this call used to pass `position`, so the rebuttal_* results were a
# copy of the position ones.
rebuttal_keywords,rebuttal_index_words, rebuttal_matrix, rebuttal_weighted_matrix, rebuttal_valid_index, rebuttal_final_keywords=get_values(rebuttal)

concluding_statement_keywords,concluding_statement_index_words, concluding_statement_matrix, concluding_statement_weighted_matrix, concluding_statement_valid_index, concluding_statement_final_keywords=get_values(concluding_statement)

In [9]:
# Per-class keyword lists, in the order they are concatenated below.
classes = [
    lead_final_keywords,
    claim_final_keywords,
    counter_claim_final_keywords,
    evidence_final_keywords,
    position_final_keywords,
    rebuttal_final_keywords,
    concluding_statement_final_keywords,
]

# Flatten them into one list; duplicates across classes are still present.
total_sample_keywords = [
    keyword for class_keywords in classes for keyword in class_keywords
]

In [10]:
# Drop duplicate keywords. Sorting makes the order - and therefore the column
# order of the attribute matrix built from this list - identical on every run;
# a bare list(set(...)) can change order between kernel restarts because
# string hashing is randomised per process.
total_sample_keywords=sorted(set(total_sample_keywords))

In [11]:
# Working sample: the first 100 training rows plus the first 20 Rebuttal and
# the first 20 Counterclaim rows, added to give those two classes more data
# points. pd.concat replaces DataFrame.append, which pandas 2.0 removed.
df1 = pd.concat(
    [df.head(100), rebuttal.head(20), counter_claim.head(20)],
    ignore_index=True,
)

# Drop the old row-index column that came along from the CSV round trip.
df1 = df1.drop(columns=["Unnamed: 0"])

# Attribute table: one row per sampled document, one column per keyword.
# Sized from the sample itself rather than a hard-coded 140.
attributes=np.zeros((len(df1),len(total_sample_keywords)))

df1

Unnamed: 0,id,discourse_id,discourse_start,discourse_end,discourse_text,discourse_type,discourse_type_num,predictionstring
0,423A1CA112E2,1.622628e+12,8.0,229.0,Modern humans today are always on their phone....,Lead,Lead 1,1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 1...
1,423A1CA112E2,1.622628e+12,230.0,312.0,They are some really bad consequences when stu...,Position,Position 1,45 46 47 48 49 50 51 52 53 54 55 56 57 58 59
2,423A1CA112E2,1.622628e+12,313.0,401.0,Some certain areas in the United States ban ph...,Evidence,Evidence 1,60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75
3,423A1CA112E2,1.622628e+12,402.0,758.0,"When people have phones, they know about certa...",Evidence,Evidence 2,76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 9...
4,423A1CA112E2,1.622628e+12,759.0,886.0,Driving is one of the way how to get around. P...,Claim,Claim 1,139 140 141 142 143 144 145 146 147 148 149 15...
...,...,...,...,...,...,...,...,...
135,354946A1CA46,1.623079e+12,2657.0,2824.0,Opponents whose work depends on a cell phone l...,Counterclaim,Counterclaim 2,450 451 452 453 454 455 456 457 458 459 460 46...
136,1D35A6980E7F,1.622824e+12,388.0,424.0,"Although it tends to be distracting,",Counterclaim,Counterclaim 1,73 74 75 76 77 78
137,1D35A6980E7F,1.622824e+12,1777.0,1920.0,Texting while driving is a really problematic ...,Counterclaim,Counterclaim 2,328 329 330 331 332 333 334 335 336 337 338 33...
138,40CC76613B2D,1.623002e+12,1120.0,1216.0,To say that there is a time devices should be ...,Counterclaim,Counterclaim 1,191 192 193 194 195 196 197 198 199 200 201 20...


In [12]:
# Discourse type of every row of df1, as an array aligned with `attributes`.
df1_classes=df1["discourse_type"].values

# Remove stop words and apply Porter stemming to every sampled text.
df1_corpus = stemming_stopwords_removing(df1)

# Total occurrence count of every stem in the sample, as a dict.
df1_index_words = get_total_index_words(df1_corpus)

# The attribute columns are the filtered keywords gathered from all classes.
df1_keywords = total_sample_keywords

# keyword -> column index. setdefault keeps the first position of a repeated
# keyword, which is what list.index returned before.
keyword_column = {}
for col, word in enumerate(df1_keywords):
    keyword_column.setdefault(word, col)

# Start from a zeroed table so that re-running this cell does not add the
# counts on top of the previous run's counts.
attributes = np.zeros((len(df1_corpus), len(df1_keywords)))

# Count, per document, the occurrences of every known keyword; stems that are
# not keywords are skipped.
for row, document in enumerate(df1_corpus):
    for token in document.split():
        col = keyword_column.get(token)
        if col is not None:
            attributes[row, col] += 1



In [21]:
# Find rows of `attributes` that repeat an earlier row and, if any of them
# carries a different class than the row it repeats, drop the repeated rows
# from both `attributes` and `df1_classes`.
dict1=[]  # not populated in this cell

# `indices[k]` is the position of the first occurrence of unique row `u[k]`;
# `inverse[r]` is the position in `u` of row r.
u,indices,inverse=np.unique(attributes, return_index = True, return_inverse = True, axis=0)
inverse = np.ravel(inverse)  # defensive: guarantee a flat row -> group mapping

# For every row, the position of the first row with identical attributes.
# Must be computed before `indices` is sorted, while it still lines up with `u`.
first_occurrence = indices[inverse]

indices.sort()

# Every row that is not a first occurrence repeats an earlier row.
is_first = np.zeros(len(attributes), dtype=bool)
is_first[indices] = True
non_unique_indices = np.flatnonzero(~is_first).tolist()

# A repeated row is a conflict when its class differs from the class of the
# row it repeats. (The previous check compared every repeated row with the
# class of the first repeated row, whichever group that one belonged to.)
b = any(
    df1_classes[row] != df1_classes[first_occurrence[row]]
    for row in non_unique_indices
)

# If even one repeated row conflicts, remove all repeated rows. They are
# deleted in a single call: deleting one index at a time shifts the remaining
# rows, so later indices would hit the wrong rows or run past the end.
if b:
    attributes = np.delete(attributes, non_unique_indices, axis=0)
    df1_classes = np.delete(df1_classes, non_unique_indices, axis=0)
        
# print(len(df1_classes),len(non_unique_indices))
        

    
    


In [None]:
# NOW CALCULATE INDISCERNIBILITY..