# Example: Load data, transform to Bag-of-words and fit Logistic regression

In [1]:
import pickle
import numpy as np
import stop_words

# Folder containing the annotated pickle files -- change this to match your machine.
load_loc = '/home/jehi/Dropbox/DTU/DeepFactData/annotated/' #SPECIFY!
pickle_path = load_loc + "data_matrix_sample_programs.pickle"

# NOTE: unpickling can execute arbitrary code; only load files you trust.
with open(pickle_path, 'rb') as f:
    data = pickle.load(f)

In [2]:
# Names of the fields stored in the data set (presumably in the same order as
# the columns of data['data'] -- the next cell relies on that).
features = data['features']
features

['start time',
 'end time',
 'program_id',
 'sentence_id',
 'sentence',
 'claim_idx',
 'claim']

The sentences form the data matrix (X), while the claims form the outcome vector (y).

In [3]:
# Look the columns up by name instead of relying on hard-coded positions.
feature_names = list(data['features'])
sentence_col = feature_names.index('sentence')
claim_col = feature_names.index('claim')

# Copy the columns. Assuming data['data'] is a NumPy array, a column slice is a
# view, so the in-place preprocessing applied to X further down would otherwise
# silently rewrite the loaded data as well.
X = data['data'][:, sentence_col].copy()
y = data['data'][:, claim_col].copy()
N = len(X)

# Some examples
for i in [21, 23, 33, 40, 48, 49, 50]: #range(100):
    print('Sentence is:\n\t' + X[i])
    print('Claim is:\n\t' + str(y[i])+'\n')

# Now convert y to a binary indicator vector (True is claim, False is no claim)
y = np.asarray([claim is not None for claim in y])

Sentence is:
	Det er også kedeligt, at man taler med én stemme om klimaet.
Claim is:
	None

Sentence is:
	Men hvad er ved at ske?
Claim is:
	None

Sentence is:
	Vi lever ikke i Europas Forenede Stater endnu så hvorfor skal vi have en præsident?
Claim is:
	None

Sentence is:
	Vi har en formand for Kommissionen og for Parlamentet.
Claim is:
	None

Sentence is:
	Det kan jeg nu ikke huske. Det er lang tid siden men det her er en formand for statsministerklubben, og det er godt.
Claim is:
	None

Sentence is:
	Lad os tage et konkret eksempel.
Claim is:
	None

Sentence is:
	De har diskuteret finansiering af den mulige klimaaftale i et helt år uden at kunne blive enige.
Claim is:
	None



## Transform and fit

In [4]:
import re
def preprocessing_bow(s):
    """Prepare one sentence for bag-of-words tokenisation.

    Steps, in order:
      1. '%' is replaced by the word ' procent ' so it is not dropped.
      2. Every run of digits, commas, periods and hyphens is surrounded by
         spaces, which detaches numbers (and that punctuation) from adjacent
         words.
      3. Every remaining '-' is replaced by whitespace.

    Parameters
    ----------
    s : str
        The raw sentence.

    Returns
    -------
    str
        The sentence with the extra separating spaces inserted; repeated
        spaces are not collapsed.
    """
    # replace special character with word (so it doesnt get lost)
    s = s.replace('%', ' procent ')

    # Split numbers from words. The pattern is a raw string so that '\d'
    # reaches the regex engine instead of being an invalid string escape.
    s = ' '.join(re.split(r'([\d,.-]+)', s))

    # Split all -
    s = ' '.join(s.split('-'))
    return s

# Run the preprocessing over every sentence, writing the result back into X.
for i, sentence in enumerate(X):
    X[i] = preprocessing_bow(sentence)

In [5]:
from sklearn.metrics import confusion_matrix
from sklearn import linear_model
from sklearn.feature_extraction.text import CountVectorizer
import matplotlib.pyplot as plt
%matplotlib inline

In [6]:
# Make a Bag-of-Words
vectorizer = CountVectorizer()
X_bow = vectorizer.fit_transform(X)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow)

# CM on training data (the model is evaluated on the same data it was fitted on)
C = confusion_matrix(y, ypred)
C

array([[3225,    0],
       [ 116,  148]])

Get the confusion matrix as probabilities.

In [7]:
# Divide every cell by the total count so that the four entries sum to 1.
C / C.sum()

array([[ 0.92433362,  0.        ],
       [ 0.03324735,  0.04241903]])

## Most words only have one occurrence!

In [8]:
# Column sums of the bag-of-words matrix: total count of each word.
a = np.asarray(X_bow.sum(axis=0))

print('Total words %i' % len(words))
print('Words with 1 occurence: %i' % (a == 1).sum())

Total words 6162
Words with 1 occurence: 3405


## Try removing stop words

In [9]:
# Get a list of danish stop words, but keep 'meget' and 'mange' in the vocabulary.
# Build a new list rather than calling .remove() on the returned one: .remove()
# raises ValueError when the word is absent, which is what happens on a re-run
# of this cell if the library hands back the same (already modified) list object.
keep_as_features = {'meget', 'mange'}
stop_list = [word for word in stop_words.get_stop_words('danish')
             if word not in keep_as_features]

In [10]:
# Make a Bag-of-Words, ignoring the words in stop_list
vectorizer = CountVectorizer(stop_words=stop_list)
X_bow = vectorizer.fit_transform(X)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow)

# CM on training data (the model is evaluated on the same data it was fitted on)
C = confusion_matrix(y, ypred)
C



array([[3225,    0],
       [ 136,  128]])

## Tokenize the words

In [11]:
import nltk
from nltk.tokenize import word_tokenize, sent_tokenize

In [12]:
# Show the first sentence, its tokens, and the stemmed tokens.
s = X[0]
print(s)
tokens = word_tokenize(s, language='danish')
print(tokens)
#print(sent_tokenize(s, language='danish'))
stemmer = nltk.stem.SnowballStemmer('danish')
stems = [stemmer.stem(token) for token in tokens]
print(stems)

Glem alt om kommunalvalg ,  rævekager ,  de lange knives nat og lokalt fnidder . 
['Glem', 'alt', 'om', 'kommunalvalg', ',', 'rævekager', ',', 'de', 'lange', 'knives', 'nat', 'og', 'lokalt', 'fnidder', '.']
['glem', 'alt', 'om', 'kommunalvalg', ',', 'rævekag', ',', 'de', 'lang', 'kniv', 'nat', 'og', 'lokalt', 'fnid', '.']


In [13]:
# Stem every token of each paragraph and glue the stems back together with
# single spaces, giving one stemmed string per paragraph.
stemmer = nltk.stem.SnowballStemmer('danish')
X_stem = [
    ' '.join(stemmer.stem(token) for token in word_tokenize(text, language='danish'))
    for text in X
]

In [14]:
# Inspect the first stemmed paragraph.
X_stem[0]

'glem alt om kommunalvalg , rævekag , de lang kniv nat og lokalt fnid .'

In [15]:
def stemmed_bow(X, vectorizer):
    """Stem the texts in X and fit-transform them with the given vectorizer.

    Each text is tokenised with NLTK's Danish word tokenizer, every token is
    stemmed with the Danish Snowball stemmer, and the stems are re-joined with
    single spaces. The vectorizer is fitted on the stemmed texts as a side
    effect, and the result of its ``fit_transform`` call is returned.
    """
    danish_stemmer = nltk.stem.SnowballStemmer('danish')

    def stem_text(text):
        # Tokenise one text and join its stems into a single string.
        tokens = word_tokenize(text, language='danish')
        return ' '.join(danish_stemmer.stem(tok) for tok in tokens)

    return vectorizer.fit_transform([stem_text(text) for text in X])
    
# Alternative that additionally drops the words in stop_list:
#vectorizer = CountVectorizer(stop_words=stop_list)
# Vectorizer used for the stemmed bag-of-words (no stop-word filtering).
vectorizer = CountVectorizer()

In [16]:
# Bag-of-Words on the stemmed sentences (this also fits the vectorizer)
X_bow = stemmed_bow(X, vectorizer)
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement
# get_feature_names_out() when the installed version provides it.
if hasattr(vectorizer, 'get_feature_names_out'):
    words = list(vectorizer.get_feature_names_out())
else:
    words = vectorizer.get_feature_names()

# Fit the logit model
logistic = linear_model.LogisticRegression()
logistic.fit(X=X_bow,y=y)
ypred = logistic.predict(X_bow)

# CM on training data (the model is evaluated on the same data it was fitted on)
C = confusion_matrix(y, ypred)
C

array([[3222,    3],
       [ 135,  129]])

In [17]:
# Column sums of the stemmed bag-of-words matrix: total count of each stem.
a = np.asarray(X_bow.sum(axis=0))

print('Total words %i' % len(words))
print('Words with 1 occurence: %i' % (a == 1).sum())

Total words 4295
Words with 1 occurence: 2082


In [18]:
# Flat boolean mask over the vocabulary: True for words counted more than once.
keep_idx = np.ravel(a > 1)

In [19]:
# Keep only the columns (words) selected by keep_idx.
# NOTE: X_bow is overwritten here, so this cell should not be re-run on its
# own -- the mask would no longer match the reduced number of columns.
X_bow = X_bow[:, keep_idx]

# The matching vocabulary entries, in the same order as the kept columns.
new_words = [word for word, keep in zip(words, keep_idx) if keep]

len(new_words)

2213

In [20]:
# Refit the logistic regression on the reduced bag-of-words
# (fit() returns the estimator itself, so it can be chained).
logistic = linear_model.LogisticRegression().fit(X_bow, y)
ypred = logistic.predict(X_bow)

# Confusion matrix on the training data
C = confusion_matrix(y, ypred)
C

array([[3220,    5],
       [ 155,  109]])

In [21]:
# Column sums of the filtered bag-of-words matrix: total count of each kept word.
a = np.asarray(X_bow.sum(axis=0))

print('Total words %i' % len(new_words))
print('Words with 1 occurence: %i' % (a == 1).sum())

Total words 2213
Words with 1 occurence: 0


## Lemmatization
Not easily done for the Danish language.

In [22]:
from nltk.corpus import wordnet as wn
# Check whether Danish ('dan') is among the languages WordNet reports.
'dan' in wn.langs()

True

In [23]:
# Look up the synsets for the Danish word 'hund'.
wn.synsets('hund',lang='dan')

[Synset('dog.n.01')]