# Example: Load data, transform to Bag-of-words and fit Logistic regression

In [1]:
import pickle
import numpy as np
import stop_words

# Folder that holds the annotated sample; point it at your local checkout (SPECIFY!)
load_loc = '../../data/DeepFactData/annotated/'
# NOTE: unpickling can execute arbitrary code -- only load files you trust.
with open(load_loc + "data_matrix_sample_programs.pickle", 'rb') as pickle_file:
    data = pickle.load(pickle_file)

In [2]:
# Names of the fields stored under data['data']; presumably the position of a
# name in this list is the column index of that field (verify against the file).
features = data['features']
# Bare last expression so the notebook displays the list
features

['start time',
 'end time',
 'program_id',
 'sentence_id',
 'sentence',
 'claim_idx',
 'claim']

The sentences form the data matrix (X), while the claims form the outcome vector (y).

In [3]:
# Look the columns up by name instead of relying on hard-coded positions.
sentence_col = data['features'].index('sentence')
claim_col = data['features'].index('claim')

# Copy the sentence column: assuming data['data'] is a NumPy array, plain
# slicing returns a view, so the in-place preprocessing done later on X would
# otherwise overwrite the raw sentences inside `data` and make the result
# depend on how many times the cells have been executed.
X = data['data'][:, sentence_col].copy()
claims = data['data'][:, claim_col]
N = len(X)

# Some examples
for i in [21, 23, 33, 40, 48, 49, 50]: #range(100):
    print('Sentence is:\n\t' + X[i])
    print('Claim is:\n\t' + str(claims[i]) + '\n')

# Now convert the claims to a binary indicator vector (1 is claim, 0 no claim)
y = np.asarray([claim is not None for claim in claims])

Sentence is:
	LA og DF, der sammen med K udgør det parlamentariske grundlag rasede mod partiet, mens Løkke opfordrede til en tænkepause.
Claim is:
	None

Sentence is:
	De har sat personfnidder over fornuftig borgerlig politik.
Claim is:
	None

Sentence is:
	At vi skal bruge så meget energi på at nå dertil hvor hun ville indrømme, at der måske manglede en lille stjerne men ellers så var alt retvisende, og alt var lagt frem.
Claim is:
	None

Sentence is:
	Ingen kan være i tvivl om, at kronen på Eva Kjers politiske værk det er den her landbrugspakke, som er hendes prestigeprojekt.
Claim is:
	None

Sentence is:
	Vi kan sagtens være åbne og ærlige og fremlægge det for befolkningen og overbevise dem om, at det er det rigtige.
Claim is:
	None

Sentence is:
	Det er ikke kun mig, der mener, tallene ikke har været forklaret.
Claim is:
	None

Sentence is:
	Du sagde noget forkert. Du sagde, at det kun var mig, der mente det.
Claim is:
	None



## Transform and fit

In [4]:
import re
def preprocessing_bow(s):
    """Prepare one sentence for bag-of-words tokenisation.

    Three textual rewrites are applied, in this order:

    1. every '%' becomes the word ' procent ' so the information survives
       tokenisation;
    2. every run of digits, commas, periods and hyphens is surrounded by
       spaces, which detaches numbers (and also stand-alone punctuation)
       from adjacent letters;
    3. every remaining '-' is replaced by a space.

    Parameters
    ----------
    s : str
        The raw sentence.

    Returns
    -------
    str
        The rewritten sentence. Extra whitespace is not collapsed.
    """
    # replace special character with word (so it doesn't get lost)
    s = s.replace('%', ' procent ')

    # Split numbers from words. The pattern is a raw string because '\d' in a
    # normal string literal is an invalid escape sequence; the capturing group
    # makes re.split keep the matched runs in its result.
    s = ' '.join(re.split(r'([\d,.-]+)', s))

    # Split all -
    return s.replace('-', ' ')

# Rewrite every sentence in place; X keeps its container type and length
for idx, sentence in enumerate(X):
    X[idx] = preprocessing_bow(sentence)

In [5]:
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Make a Bag-of-Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Ordering the fitted vocabulary (term -> column index) by its index yields
# the same list of terms on old and new versions alike.
words = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow)

# CM on training data (the model is scored on the samples it was fitted on,
# so these counts are an optimistic estimate)
C = confusion_matrix(y, ypred)
C

array([[7893,    5],
       [ 292,  276]])

Normalise the confusion matrix so that each entry is the fraction of all samples falling in that cell.

In [7]:
# Divide every cell by the grand total so the four entries sum to 1
C / C.sum()

array([[  9.32317505e-01,   5.90597685e-04],
       [  3.44909048e-02,   3.26009922e-02]])

## Most words only have one occurrence!

In [8]:
# Column sums of the document-term matrix: how often each term occurs overall
a = np.asarray(X_bow.sum(axis=0))

n_singletons = np.count_nonzero(a == 1)
print('Total words %i' % len(words))
print('Words with 1 occurence: %i' % n_singletons)

Total words 10383
Words with 1 occurence: 5363


## Try removing stop words

In [9]:
# Start from the packaged Danish stop-word list ...
stop_list = stop_words.get_stop_words('danish')
# ... but keep these two words as features, i.e. take them off the stop list
for kept_word in ('meget', 'mange'):
    stop_list.remove(kept_word)

In [10]:
# Make a Bag-of-Words, this time ignoring the stop words
vectorizer = CountVectorizer(stop_words=stop_list)
X_bow = vectorizer.fit_transform(X)
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Ordering the fitted vocabulary (term -> column index) by its index yields
# the same list of terms on old and new versions alike.
words = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow)

# CM on training data (the model is scored on the samples it was fitted on,
# so these counts are an optimistic estimate)
C = confusion_matrix(y, ypred)
C



array([[7894,    4],
       [ 326,  242]])

## Tokenize the words

In [11]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [12]:
# Show the first sentence raw, tokenised, and stemmed
s = X[0]
print(s)

# Tokenise once and reuse the tokens for both printouts
tokens = word_tokenize(s, language='danish')
print(tokens)
#print(sent_tokenize(s, language='danish'))

stemmer = nltk.stem.SnowballStemmer('danish')
print([stemmer.stem(token) for token in tokens])

Nu er det godt og vel  24  timer siden ,  at Lars Løkke bekendtgjorde at han måtte undersøge ,  om der er et grundlag for hans regering . 
['Nu', 'er', 'det', 'godt', 'og', 'vel', '24', 'timer', 'siden', ',', 'at', 'Lars', 'Løkke', 'bekendtgjorde', 'at', 'han', 'måtte', 'undersøge', ',', 'om', 'der', 'er', 'et', 'grundlag', 'for', 'hans', 'regering', '.']
['nu', 'er', 'det', 'godt', 'og', 'vel', '24', 'tim', 'sid', ',', 'at', 'lar', 'løk', 'bekendtgjord', 'at', 'han', 'måt', 'undersøg', ',', 'om', 'der', 'er', 'et', 'grundlag', 'for', 'han', 'regering', '.']


In [13]:
# Stem every token of each sentence, then glue the stems back together with
# single spaces so each sentence is one string again
stemmer = nltk.stem.SnowballStemmer('danish')
X_stem = [
    ' '.join(stemmer.stem(token) for token in word_tokenize(sentence, language='danish'))
    for sentence in X
]

In [14]:
# Inspect the first stemmed sentence
X_stem[0]

'nu er det godt og vel 24 tim sid , at lar løk bekendtgjord at han måt undersøg , om der er et grundlag for han regering .'

In [15]:
def stemmed_bow(X, vectorizer):
    """Stem every document in X and vectorise the result.

    Each document is tokenised with the Danish NLTK tokenizer, every token is
    reduced to its Danish Snowball stem, and the stems are joined with single
    spaces. The stemmed documents are then passed to
    ``vectorizer.fit_transform``, so ``vectorizer`` is fitted as a side effect.

    Parameters
    ----------
    X : iterable of str
        The documents to stem.
    vectorizer : object with a ``fit_transform`` method
        E.g. a ``CountVectorizer``.

    Returns
    -------
    Whatever ``vectorizer.fit_transform`` returns for the stemmed documents.
    """
    danish_stemmer = nltk.stem.SnowballStemmer('danish')

    def stem_document(document):
        # One document -> space-separated string of stems
        tokens = word_tokenize(document, language='danish')
        return ' '.join(danish_stemmer.stem(token) for token in tokens)

    stemmed_documents = [stem_document(document) for document in X]
    return vectorizer.fit_transform(stemmed_documents)
    
# Alternative kept for reference: swap in this line to drop the stop words too
#vectorizer = CountVectorizer(stop_words=stop_list)
# Vectorizer with default settings (no stop-word list passed)
vectorizer = CountVectorizer()

In [16]:
# Bag-of-Words on the stemmed sentences (this also fits `vectorizer`)
X_bow = stemmed_bow(X, vectorizer)
# CountVectorizer.get_feature_names() was removed in scikit-learn 1.2.
# Ordering the fitted vocabulary (term -> column index) by its index yields
# the same list of terms on old and new versions alike.
words = sorted(vectorizer.vocabulary_, key=vectorizer.vocabulary_.get)

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow)

# CM on training data (the model is scored on the samples it was fitted on,
# so these counts are an optimistic estimate)
C = confusion_matrix(y, ypred)
C

array([[7885,   13],
       [ 336,  232]])

In [17]:
# Column sums of the stemmed document-term matrix: total count of every stem
a = np.asarray(X_bow.sum(axis=0))

n_singletons = np.count_nonzero(a == 1)
print('Total words %i' % len(words))
print('Words with 1 occurence: %i' % n_singletons)

Total words 7038
Words with 1 occurence: 3297


In [18]:
# Flat boolean mask over the vocabulary: True for terms that occur more than once
keep_idx = (a > 1).ravel()

In [19]:
# Drop the columns of the terms that occur only once.
# NOTE: X_bow is overwritten here, so this cell cannot be re-run on its own;
# re-create X_bow (and keep_idx) first.
X_bow = X_bow[:, keep_idx]

# Keep the vocabulary aligned with the remaining columns
new_words = [word for i, word in enumerate(words) if keep_idx[i]]

len(new_words)

3741

In [20]:
# Refit the logit model on the pruned document-term matrix
logistic = linear_model.LogisticRegression().fit(X_bow, y)
ypred = logistic.predict(X_bow)

# CM on training data
C = confusion_matrix(y, ypred)
C

array([[7883,   15],
       [ 370,  198]])

In [21]:
# Sanity check: after pruning, no remaining term should have a total count of 1
a = np.asarray(X_bow.sum(axis=0))

n_singletons = np.count_nonzero(a == 1)
print('Total words %i' % len(new_words))
print('Words with 1 occurence: %i' % n_singletons)

Total words 3741
Words with 1 occurence: 0


## Lemmatization
Not easily done for the Danish language.

In [22]:
from nltk.corpus import wordnet as wn
# Check whether the language code 'dan' is among the languages WordNet reports
'dan' in wn.langs()

True

In [23]:
# Look up the synsets WordNet returns for the word 'hund' with lang='dan'
wn.synsets('hund',lang='dan')

[Synset('dog.n.01')]