# Baseline using Logistic Regression

<b>Feature Extraction</b>

In [107]:
import os.path
from nltk.corpus import stopwords
import re
from collections import defaultdict

In [108]:
# Notebook-wide configuration: where the corpus lives, the name of the
# annotation file inside it, and the tags that delimit the body text of a document.
# os.path.join yields the same '..\data\DUC2001' value on Windows while no longer
# depending on '\d' and '\D' being left alone as unrecognised escape sequences.
data_root_dir = os.path.join('..', 'data', 'DUC2001')
annotation_file = 'annotations.txt'
txt_opn_tag = '<TEXT>'
txt_close_tag = '</TEXT>'

In [109]:
def get_cluster_and_its_files(data_root_dir,annotation_file):
    '''Map every cluster named in the annotation file to its document names.

    Each non-blank line is cut at its first ';'. The part before it is split on
    '@': the first field is the file name, the second field is the cluster name.

    Args:
        data_root_dir: directory that contains the annotation file.
        annotation_file: name of the annotation file inside data_root_dir.

    Returns:
        A defaultdict(list) of the form
        { cluster_1 : [file1,file2,file3....], cluster_2 : [file1,file2,file3....] }
        with file names kept in the order they appear in the annotation file.

    Raises:
        IOError / OSError: if the annotation file cannot be opened.
        IndexError: if a non-blank line has no '@' before its first ';'.
    '''
    clust_files = defaultdict(list)

    # `with` closes the handle even if a malformed line raises part-way through.
    with open(os.path.join(data_root_dir, annotation_file), 'r') as f:
        for line in f:
            # Blank lines carry no record; skipping them avoids an IndexError below.
            if not line.strip():
                continue

            fields = line.split(';')[0].split('@')
            file_name, clust_name = fields[0], fields[1]
            clust_files[clust_name].append(file_name)

    return clust_files
    
        

In [110]:
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(get_cluster_and_its_files(data_root_dir,annotation_file)['mad cow disease'])

['AP900322-0200', 'FBIS-41815', 'FBIS-45908', 'FT921-9310', 'FT931-3883', 'FT933-8272', 'FT941-575', 'LA042290-0104', 'LA060490-0083', 'WSJ910107-0139']


In [111]:
def get_text_from_doc(document_path,txt_opn_tag,txt_close_tag):
    '''Return the text between the first opening tag and the closing tag after it.

    Args:
        document_path: path of the document to read.
        txt_opn_tag: marker that precedes the wanted text (e.g. '<TEXT>').
        txt_close_tag: marker that follows the wanted text (e.g. '</TEXT>').

    Returns:
        The substring strictly between the two markers, whitespace untouched.

    Raises:
        IOError / OSError: if the document cannot be opened.
        ValueError: if the opening tag is missing, or no closing tag follows it.
    '''
    # `with` guarantees the handle is released even if read() fails.
    with open(document_path,'r') as f:
        content = f.read()

    start = content.index(txt_opn_tag) + len(txt_opn_tag)
    # Look for the closing tag only after the opening one, so a stray closing
    # tag earlier in the file cannot produce an empty or wrong slice.
    end = content.index(txt_close_tag, start)

    return content[start:end]
        

In [112]:
# Spot-check the text extractor on a single document.
file_name = 'AP830325-0143'
# os.path.join inserts the platform's separator instead of a hard-coded '\\'.
file_path = os.path.join(data_root_dir, file_name)
get_text_from_doc(file_path,txt_opn_tag,txt_close_tag)

"\n   Millions of gallons of crude oil that\nspilled when a tanker ran aground spread across a wildlife-rich\nstretch of ocean Saturday, and Alaska's chief environmental officer\ncriticized cleanup efforts as too slow.\n   The biggest oil spill in U.S. history created a slick about\nseven miles long and seven miles wide in Prince William Sound. The\nCoast Guard said only Reef Island and the western edge of Bligh\nIsland had been touched by the slick.\n   ``This situation, I think, was everyone's secret nightmare about\nwhat could happen with oil traffic in the sound,'' said Dennis\nKelso, commissioner of the Alaska Department of Environmental\nConservation.\n   Some 240,000 barrels _ about 10,080,000 gallons _ of crude oil\nfrom Alaska's North Slope spilled early Friday when the 987-foot\ntanker Exxon Valdez ran hard aground on Bligh Reef, about 25 miles\noutside Valdez, where it had taken on a total cargo of 1.2 million\nbarrels. Initial reports indicated 270,000 barrels had spilled.\

In [113]:
def tokenize_txt(text):
    '''Lower-case `text` and return its runs of word characters, in order.

    Everything matched by the regex class \\W (punctuation, whitespace, control
    characters) acts as a separator and is dropped, so an apostrophe splits a
    contraction into two tokens.

    Args:
        text: the string to tokenize.

    Returns:
        A list of non-empty lower-case tokens; an empty list when `text` holds
        no word characters.
    '''
    # Collecting maximal \w+ runs gives exactly the non-empty pieces of a split
    # on \W+, so no post-filtering of '' / '\n' / '\r' leftovers is needed.
    # The raw string avoids the invalid '\W' escape in a normal string literal.
    return re.findall(r'\w+', text.lower())
    

In [114]:
# Quick check of the tokenizer on a sentence with punctuation and an apostrophe.
tokenize_txt(text="What is this ?? Is this cool ? I don't know")

['what', 'is', 'this', 'is', 'this', 'cool', 'i', 'don', 't', 'know']

<b>Feature 1: Term frequency over the cluster (TF)</b>

In [115]:
def get_term_freqs(data_root_dir,annotation_file,stop_words=None) :
    '''Count how often each token occurs across the documents of every cluster.

    The counts are per cluster: the same word has an independent count in each
    cluster it appears in. The text delimiters are read from the module-level
    txt_opn_tag / txt_close_tag.

    Args:
        data_root_dir: directory holding the annotation file and the documents.
        annotation_file: name of the annotation file inside data_root_dir.
        stop_words: optional collection of lower-case tokens to leave out of
            the counts. None (the default) counts every token.

    Returns:
        A dict of the form {clust1 : {word1 : 2, word2 :3...},clust2 : {word1 : 2, word2 :3..} ......}
        where every inner mapping is a defaultdict(int).
    '''
    # Tokens are lower-cased by tokenize_txt, so stop words must be lower-case too.
    excluded = frozenset(stop_words) if stop_words else frozenset()

    clust_files = get_cluster_and_its_files(data_root_dir,annotation_file)

    clust_term_freq = defaultdict(defaultdict)

    # .items() is available on both Python 2 and 3 (iteritems() is Python 2 only).
    for clust,files in clust_files.items():
        term_freq = defaultdict(int)

        for doc in files:
            doc_path = os.path.join(data_root_dir, doc)
            txt = get_text_from_doc(doc_path,txt_opn_tag,txt_close_tag)

            for token in tokenize_txt(txt):
                if token not in excluded:
                    term_freq[token] += 1

        clust_term_freq[clust] = term_freq

    return clust_term_freq
            
            
            
    

In [116]:
# Inspect the per-token counts computed for one cluster.
get_term_freqs(data_root_dir=data_root_dir, annotation_file=annotation_file)['cattle disease']

defaultdict(<type 'int'>, {'all': 1, 'german': 4, '092': 1, 'existing': 1, 'per': 1, 'human': 1, 'still': 1, 'decisions': 1, 'its': 1, 'contaminated': 3, 'one': 1, 'had': 2, 'kretzschmar': 1, 'to': 6, 'jakob': 1, 'do': 1, 'non': 2, 'popularly': 1, 'march': 1, 'diseases': 1, 'than': 1, 'government': 1, 'very': 1, 'scientists': 1, 'possible': 1, 'cannot': 1, 'know': 1, 'not': 2, 'affect': 2, 'safeguards': 1, 'countries': 1, 'should': 1, 'medicines': 1, '50': 1, 'transmitted': 2, 'minimal': 1, 'ban': 2, 'university': 1, 'because': 1, 'humans': 4, 'bovine': 1, 'connections': 1, 'likely': 1, 'catching': 1, 'are': 1, 'encephalopathy': 1, 'eu': 2, 'further': 1, 'institutes': 1, 'agriculture': 1, 'britain': 2, 'concern': 1, 'universities': 1, 'project': 1, 'said': 3, 'imported': 3, 'for': 2, '1992': 1, 'recorded': 1, 'expressed': 1, 'research': 4, 'may': 1, 'gottingen': 1, 'health': 1, 'between': 1, 'new': 1, 'announced': 1, 'available': 1, 'be': 7, 'we': 1, 'pushing': 1, 'however': 1, 'switze

<b>Feature 2: Total number of documents in the dataset divided by the number of documents that contain the word (IDF)</b>

In [144]:
def get_doc_freqs(data_root_dir,annotation_file):
    '''Compute an inverse-document-frequency ratio for every token in the corpus.

    Every regular file directly inside data_root_dir, except the annotation
    file, is treated as one document. For each token the result is
    (number of documents) / (number of documents containing the token); no
    logarithm is applied. The text delimiters are read from the module-level
    txt_opn_tag / txt_close_tag.

    Args:
        data_root_dir: directory holding the documents.
        annotation_file: file name inside data_root_dir to exclude from the corpus.

    Returns:
        A defaultdict(float) mapping token -> ratio. Tokens never seen in the
        corpus read as 0.0.

    Raises:
        OSError: if data_root_dir cannot be listed.
    '''
    # List only the top level; walking the whole tree to read its first entry
    # is wasted work and fails with an opaque IndexError on a missing directory.
    docs = [entry for entry in os.listdir(data_root_dir)
            if entry != annotation_file
            and os.path.isfile(os.path.join(data_root_dir, entry))]

    # Number of documents each token occurs in. set() makes every document
    # count at most once per token.
    containing_docs = defaultdict(int)

    for doc in docs:
        doc_path = os.path.join(data_root_dir, doc)
        txt = get_text_from_doc(doc_path,txt_opn_tag,txt_close_tag)

        for token in set(tokenize_txt(txt)):
            containing_docs[token] += 1

    no_of_docs = len(docs)
    idf_dict = defaultdict(float)

    # .items() is available on both Python 2 and 3 (iteritems() is Python 2 only).
    for term,doc_count in containing_docs.items():
        idf_dict[term] = float(no_of_docs) / doc_count

    return idf_dict
        
        
    

In [146]:
# get_doc_freqs returns (number of docs) / (docs containing the term), so a rare
# term scores high and a term found in nearly every document scores close to 1.
doc_freqs = get_doc_freqs(data_root_dir,annotation_file)
# Parenthesised single-argument print behaves the same on Python 2 and Python 3.
print(doc_freqs['furazabol'])
print(doc_freqs['the'])

154.5
1.00324675325


In [None]:
# NOTE(review): bare string literal in a never-executed cell (In [None]); it would
# only echo 's' if run. Presumably leftover scratch; consider deleting the cell.
's'