## Data Science Session 3

**Assignment:**

Store some text in three files:

    •two of text -A and B -must be created by the same author -X;
    •the third text -C -is created by another author -Y. 
    
Create a program to compare:
•A and B
•A and C
•B and C 

and to recognize the authors of each

In [37]:
import pandas as pd
import sklearn as sk
import math
import sys
import nltk # stopwords("english")
#nltk.download()

 **Terminology**
 
 - t — term (word)
 - d — document (set of words)
 - N — count of corpus
 - corpus — the total document set
 
 

**Text links**

- Bill Clinton
    - https://edition.cnn.com/2020/08/18/politics/bill-clinton-speech-transcript-dnc/index.html
- Martin Fowler
    - https://martinfowler.com/agile.html
    - https://martinfowler.com/agile.html

**Functions**

In [38]:
# read text document
def read_text_document(file_name):
    try:
        with open(file_name, 'r') as f:
            text = f.readline()
        return text
    except IOError:
        print("Error opening or reading input file: ", file_name)
        sys.exit()


# delete characteres
def delete_unimportant_characters(corpus):
    cs = ['.', '"', ',', ';', '-', ':', '?', '!', '/']  # list of unimportant characters
    text = ""

    for elem in corpus:
        if elem == cs[0] or elem == cs[1] or elem == cs[2] or elem == cs[3] or elem == cs[4] or elem == cs[5] or elem == \
                cs[6] or elem == cs[7] or elem == cs[8]:
            text += " "
        elif elem.isdigit():  # removes all digits
            text += " "
        else:
            text += elem
    return ' '.join(text.split()).lower()  # removes whitespace and sets all words to lowercase



def computeTextFrequency(wordDict, doc):
    tfDict = {}
    corpusCount = len(doc)
    for word, count in wordDict.items():
        tfDict[word] = count / float(corpusCount)
    return tfDict


# IDF -  inverse of the document frequency which measures the informativeness of term t
#  lower occurrence - higher importance of the word
# idf(t) = N/df
# to avoid div by zero: idf(t) = log(N/(df + 1))
def computeIDF(docList):
    idf = {}
    N = len(docList)

    idf = dict.fromkeys(docList[0].keys(), 0)
    for word, wcount in idf.items():
        idf[word] = math.log10(N / (float(wcount) + 1))

    return (idf)

# calculate tf-idf as a measure for the importance of a word
# tf-idf(t, d) = tf(t, d) * log(N/(df + 1))
def computeTFIDF(tf, idfs):
    tfidf = {}
    for word, wcount in tf.items():
        tfidf[word] = wcount * idfs[word]
    return tfidf



# Calculate dot product of two vectors, divide it by the magnitudes to find the cos(angle between them)
# Use the result as a correlation coefficient
def cosine(vector1, vector2):
    # calculate nominator as a dot product
    intersect = set(vector1.keys()) & set(vector2.keys())
    numerator = sum([vector1[x] * vector2[x] for x in intersect])

    # calculate the denominator
    sum1 = sum([vector1[x] ** 2 for x in list(vector1.keys())])
    sum2 = sum([vector2[x] ** 2 for x in list(vector2.keys())])

    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator


def count_words_in_text(union_text, original_text):
    for word in original_text:
        union_text[word] += 1
    return union_text

In [39]:
# Read text file
bill_clinton = read_text_document('../data/bill_clinton.txt')
fowler_a = read_text_document('../data/martin_fowler_1.txt')
fowler_b = read_text_document('../data/martin_fowler_2.txt')

In [40]:
bill_clinton = delete_unimportant_characters(bill_clinton)
fowler_a = delete_unimportant_characters(fowler_a)
fowler_b = delete_unimportant_characters(fowler_b)

In [41]:
# Split each sentence
bill_clinton = bill_clinton.split(' ')
fowler_a = fowler_a.split(' ')
fowler_b = fowler_b.split(' ')

In [42]:
# removes regular words
from nltk.corpus import stopwords
stop_words = set(stopwords.words('english'))
bill_clinton = [w for w in bill_clinton if not w in stop_words]
fowler_a = [w for w in fowler_a if not w in stop_words]
fowler_b = [w for w in fowler_b if not w in stop_words]

In [43]:
# Combine martin fowler "a" with martin fowler "b" and bill "c"
total_of_abc = set(fowler_a).union(set(fowler_b)).union(set(bill_clinton))

In [44]:
# The fromkeys() method returns a dictionary with the specified keys and the specified value.
wordDictA = dict.fromkeys(total_of_abc, 0)
wordDictB = dict.fromkeys(total_of_abc, 0)
wordDictC = dict.fromkeys(total_of_abc, 0)
print(wordDictA)
print(wordDictB)
print(wordDictC)

{'virus': 0, 'distracting': 0, 'production': 0, 'renew': 0, "japan's": 0, 'choice': 0, 'advice': 0, 'authors': 0, 'new': 0, 'election': 0, 'specifics': 0, 'quickly': 0, 'surge': 0, 'practices': 0, 'needs': 0, 'collaboration': 0, 'many': 0, 'great': 0, 'responsibility': 0, 'rapidly': 0, 'media': 0, 'crisis': 0, 'allowing': 0, 'team': 0, 'despite': 0, 'information': 0, 'else': 0, 'triple': 0, 'percent': 0, 'began': 0, '(rather': 0, 'businesses': 0, 'year': 0, 'imposing': 0, 'center': 0, 'kids': 0, 'fundamental': 0, 'within': 0, 'make': 0, 'importantly': 0, 'would': 0, 'technology': 0, 'beat': 0, 'concentrate': 0, 'significant': 0, "community's": 0, 'watching': 0, 'organizing': 0, 'collapses': 0, 'still': 0, 'ability': 0, 'denying': 0, 'changes': 0, 'important': 0, 'inflame': 0, 'deaths': 0, 'managers': 0, 'hire': 0, 'leader': 0, 'decade': 0, 'asked': 0, 'demeaning': 0, 'defines': 0, 'writing': 0, 'blame': 0, 'thing': 0, 'get': 0, 'times': 0, 'tackling': 0, 'agile': 0, 'shrugged': 0, 'bus

In [45]:
# Count the words using a dictionary key-value pair
wordDictA = count_words_in_text(wordDictA, fowler_a)
wordDictB = count_words_in_text(wordDictB, fowler_b)
wordDictC = count_words_in_text(wordDictC, bill_clinton)
print(wordDictA)
print(wordDictB)
print(wordDictC)

{'virus': 0, 'distracting': 0, 'production': 1, 'renew': 0, "japan's": 0, 'choice': 0, 'advice': 0, 'authors': 0, 'new': 1, 'election': 0, 'specifics': 1, 'quickly': 1, 'surge': 0, 'practices': 2, 'needs': 2, 'collaboration': 1, 'many': 0, 'great': 0, 'responsibility': 0, 'rapidly': 2, 'media': 0, 'crisis': 0, 'allowing': 1, 'team': 2, 'despite': 0, 'information': 0, 'else': 0, 'triple': 0, 'percent': 0, 'began': 1, '(rather': 0, 'businesses': 0, 'year': 0, 'imposing': 0, 'center': 0, 'kids': 0, 'fundamental': 1, 'within': 1, 'make': 2, 'importantly': 1, 'would': 0, 'technology': 1, 'beat': 0, 'concentrate': 0, 'significant': 1, "community's": 0, 'watching': 0, 'organizing': 0, 'collapses': 0, 'still': 1, 'ability': 0, 'denying': 0, 'changes': 1, 'important': 0, 'inflame': 0, 'deaths': 0, 'managers': 1, 'hire': 0, 'leader': 0, 'decade': 1, 'asked': 0, 'demeaning': 0, 'defines': 0, 'writing': 2, 'blame': 0, 'thing': 0, 'get': 0, 'times': 0, 'tackling': 0, 'agile': 4, 'shrugged': 0, 'bus

In [46]:
# Two-dimensional, size-mutable, potentially heterogeneous tabular data.
df_ab = pd.DataFrame([wordDictA, wordDictB, wordDictC])
print(df_ab)

   virus  distracting  production  renew  japan's  choice  advice  authors  \
0      0            0           1      0        0       0       0        0   
1      0            0           0      0        0       0       0        1   
2      1            1           0      1        1       1       1        0   

   new  election  ...  says  tall  excellence  unemployment  planning  \
0    1         0  ...     0     0           0             0         1   
1    0         0  ...     0     0           1             0         0   
2    0         1  ...     1     1           0             2         0   

   whether  someone  easy  users  we're  
0        0        0     1      1      0  
1        0        0     0      0      0  
2        1        1     0      0      1  

[3 rows x 282 columns]


In [47]:
# how often can we find term t in document D ?
# running our sentences through the tf function:
text_frequency_a = computeTextFrequency(wordDictA, fowler_a)
text_frequency_b = computeTextFrequency(wordDictB, fowler_b)
text_frequency_c = computeTextFrequency(wordDictC, bill_clinton)

text_frequency = pd.DataFrame([text_frequency_a, text_frequency_b, text_frequency_c])
print(text_frequency)

      virus  distracting  production     renew   japan's    choice    advice  \
0  0.000000     0.000000    0.009804  0.000000  0.000000  0.000000  0.000000   
1  0.000000     0.000000    0.000000  0.000000  0.000000  0.000000  0.000000   
2  0.004926     0.004926    0.000000  0.004926  0.004926  0.004926  0.004926   

    authors       new  election  ...      says      tall  excellence  \
0  0.000000  0.009804  0.000000  ...  0.000000  0.000000    0.000000   
1  0.018182  0.000000  0.000000  ...  0.000000  0.000000    0.018182   
2  0.000000  0.000000  0.004926  ...  0.004926  0.004926    0.000000   

   unemployment  planning   whether   someone      easy     users     we're  
0      0.000000  0.009804  0.000000  0.000000  0.009804  0.009804  0.000000  
1      0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  0.000000  
2      0.009852  0.000000  0.004926  0.004926  0.000000  0.000000  0.004926  

[3 rows x 282 columns]


In [48]:
# idf = inverse document frequency
# inputting our sentences in the log file
idfs = computeIDF([wordDictA, wordDictB, wordDictC])

# running all sentences through the IDF:
# fowler vs fowler vs bill
#idf1 = computeTFIDF(text_frequency_a, idfs)
#idf2 = computeTFIDF(text_frequency_b, idfs)
#idf3 = computeTFIDF(text_frequency_c, idfs)

wordDictA = computeTFIDF(text_frequency_a, idfs)
wordDictB = computeTFIDF(text_frequency_b, idfs)
wordDictC = computeTFIDF(text_frequency_c, idfs)

# store in a dataframe
idf = pd.DataFrame([idf1, idf2])
print(idf)

   virus  distracting  production  renew  japan's  choice  advice   authors  \
0    0.0          0.0    0.004678    0.0      0.0     0.0     0.0  0.000000   
1    0.0          0.0    0.000000    0.0      0.0     0.0     0.0  0.008675   

        new  election  ...  says  tall  excellence  unemployment  planning  \
0  0.004678       0.0  ...   0.0   0.0    0.000000           0.0  0.004678   
1  0.000000       0.0  ...   0.0   0.0    0.008675           0.0  0.000000   

   whether  someone      easy     users  we're  
0      0.0      0.0  0.004678  0.004678    0.0  
1      0.0      0.0  0.000000  0.000000    0.0  

[2 rows x 282 columns]


In [49]:
# Similarity
simi_text1_text2 = cosine(wordDictA, wordDictB)
simi_text1_text3 = cosine(wordDictA, wordDictC)
simi_text2_text3 = cosine(wordDictB, wordDictC)
print(simi_text1_text2)
print(simi_text1_text3)
print(simi_text2_text3)

if simi_text1_text2 > simi_text1_text3 and simi_text1_text2 > simi_text2_text3:
    print("text 1 and 2 are from the same author")
elif simi_text1_text3 > simi_text1_text2 and simi_text1_text3 > simi_text2_text3:
    print("text 1 and 3 are from the same author")
else:
    print("text 2 and 3 are from the same author ")

0.2255335554089116
0.022633936510629653
0.0639198741805592
text 1 and 2 are from the same author
