In [None]:
# Experiments on entire text instead of just snippets

import pandas as pd
import glob
import numpy as np 
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.linear_model import SGDClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn import metrics
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier



In [None]:
# Get data
# Load every file matching '*tsv' in the working directory and stack them
# row-wise into one frame.
# glob makes no promise about ordering, so the paths are sorted: the row order
# of big_frame (and with it every seeded split further down) is then the same
# on every run and every machine.
species = sorted(glob.glob('*tsv'))
if not species:
    # Fail with an explicit message instead of the opaque error np.vstack
    # raises when it is handed an empty list.
    raise FileNotFoundError("no files matching '*tsv' in the current working directory")

np_array_list = []
for file_ in species:
    df = pd.read_csv(file_, index_col=None, header=0, sep="\t")
    # DataFrame.as_matrix() was removed in pandas 1.0; .values returns the same
    # ndarray and is available in old and new pandas alike.
    np_array_list.append(df.values)

# Stacking the bare arrays drops the per-file headers, so big_frame's columns
# are labelled by position (0, 1, 2, ...). np.vstack requires every file to
# have the same number of columns.
comb_np_array = np.vstack(np_array_list)
big_frame = pd.DataFrame(comb_np_array)


In [None]:
# NOTE(review): a column width of 1 looks unusually small; older pandas used
# -1 (now None) to mean "do not truncate" -- confirm which was intended.
pd.set_option('display.max_colwidth', 1)

# Keep only what sits between <body> and </body> in column 9.
# Rows where the pattern does not match come back as NaN; note that '.' does
# not match newlines with the default regex flags.
temp = big_frame[9].str.extract(r'<body>(.*)</body>', expand=False)

# Replace every highlighted span with the placeholder token =ANIMAL=.
# * regex=True is spelled out because pandas >= 2.0 treats the pattern as a
#   literal string by default, which would silently leave the markup in place.
# * The group is non-greedy so each highlighted span is replaced on its own;
#   a greedy '(.*)' would swallow all text between the first opening tag and
#   the last closing tag of a document that contains several highlights.
temp = temp.str.replace(
    r'<b style="color:red; font-size:14pt"><u><i>(.*?)</i></u></b>',
    '=ANIMAL=',
    regex=True,
)

# The remaining tags are fixed strings, so they are removed as literals
# (regex=False) in the same order as before.
for tag in (
    '<title>',
    '</title>',
    '<p>',
    '</p>',
    '<title xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">',
    '<p xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">',
):
    temp = temp.str.replace(tag, '', regex=False)

big_frame['text_clean'] = temp


big_frame.head()

In [None]:
# Baseline on the complete dataset: token counts -> tf-idf -> multinomial
# Naive Bayes, scored on a single 70/30 hold-out split.
text = big_frame['text_clean']
labels = big_frame[0]
print("length total: ", len(labels))

# train_test_split returns one (train, test) pair per input, in input order.
(labels_train, labels_test,
 text_train, text_test) = train_test_split(labels, text, test_size=0.3, random_state=1)
print("length train labels: ", len(labels_train))
print("length test labels: ", len(labels_test))

text_clf = Pipeline(steps=[
    ('vect', CountVectorizer()),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),
])
text_clf.fit(text_train, labels_train)

# Score the held-out documents and print per-class precision/recall/F1.
docs_test = text_test
predicted = text_clf.predict(docs_test)
print(metrics.classification_report(labels_test, predicted))


In [None]:
# Pipeline evaluated by this cell: word uni-/bi-gram tf-idf features feeding a
# linear model trained with SGD (hinge loss, L2 penalty).
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.7,
        sublinear_tf=True,
        ngram_range=(1, 2),
        analyzer='word',
        max_features=10000,
    )),
    ('clf', SGDClassifier(
        loss='hinge',
        penalty='l2',
        alpha=1e-3,
        random_state=42,
        max_iter=5,
        tol=None,
    )),
])

# Alternative pipelines, kept for reference; uncomment one to evaluate it instead.
#text_clf = Pipeline([('tfidf', TfidfVectorizer(min_df=5, max_df=0.7, sublinear_tf=True, ngram_range=(1,2), analyzer='word', max_features=10000)), ('clf-svm', LinearSVC(class_weight='balanced', C=1.0, verbose=True))])
#text_clf = Pipeline([('vect', CountVectorizer()), ('tfidf', TfidfTransformer()), ('clf', MultinomialNB()),])
#text_clf = Pipeline([('tfidf', TfidfVectorizer(min_df=5, max_df=0.7, sublinear_tf=True, ngram_range=(1,2), analyzer='word', max_features=10000)), ('clf', KNeighborsClassifier(n_neighbors=3, weights='uniform', algorithm='brute', leaf_size=30, p=2, metric='euclidean', metric_params=None, n_jobs=1))])
#text_clf = Pipeline([('tfidf', TfidfVectorizer(min_df=5, max_df=0.7, sublinear_tf=True, ngram_range=(1,2), analyzer='word', max_features=10000)), ('clf', DecisionTreeClassifier())])


from sklearn import metrics
from sklearn.model_selection import StratifiedKFold
import numpy as np

def customCrossValidation(X, y, clf=None):
    '''Run a 10-fold stratified cross validation and print the pooled metrics.

    For every fold the estimator is fitted on the training part and used to
    predict the test part. The true and predicted labels of all folds are
    pooled; a classification report (3 digits) and a confusion matrix over
    the pooled labels are printed.

    Parameters
    ----------
    X : indexable collection of samples (e.g. a pandas Series of documents).
    y : indexable collection of class labels aligned with ``X``.
    clf : optional estimator exposing ``fit`` and ``predict``. When omitted,
        the notebook-level ``text_clf`` pipeline is used, which is what this
        function always did before the parameter existed.

    Returns
    -------
    None -- the results are only printed.
    '''
    model = text_clf if clf is None else clf

    def take(data, positions):
        # StratifiedKFold yields *positional* indices. Plain ``series[idx]``
        # is label-based on pandas objects and only coincides with positions
        # for a default RangeIndex, so use .iloc whenever it is available.
        return data.iloc[positions] if hasattr(data, 'iloc') else data[positions]

    skf = StratifiedKFold(n_splits=10)
    y_test_folds = []
    y_predicted_folds = []
    for train_index, test_index in skf.split(X, y):
        model.fit(take(X, train_index), take(y, train_index))
        y_predicted_folds.append(model.predict(take(X, test_index)))
        y_test_folds.append(np.asarray(take(y, test_index)))

    y_test_overall = np.concatenate(y_test_folds)
    y_predicted_overall = np.concatenate(y_predicted_folds)

    # Build the confusion matrix once from the pooled labels. Summing per-fold
    # matrices is only valid when every fold happens to contain every class;
    # otherwise the per-fold matrices differ in shape (or in row/column
    # meaning) and the in-place addition fails or misaligns.
    cm = metrics.confusion_matrix(y_test_overall, y_predicted_overall)

    print (metrics.classification_report(y_test_overall, y_predicted_overall, digits=3))
    print (cm)
    
# Cross-validate the pipeline currently bound to text_clf on the cleaned text.
customCrossValidation(X=big_frame['text_clean'], y=big_frame[0])



In [None]:
# Same tf-idf features, classified with a brute-force 3-nearest-neighbour
# vote (uniform weights, euclidean distance).
text_clf = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.7,
        sublinear_tf=True,
        ngram_range=(1, 2),
        analyzer='word',
        max_features=10000,
    )),
    ('clf', KNeighborsClassifier(
        n_neighbors=3,
        weights='uniform',
        algorithm='brute',
        leaf_size=30,
        p=2,
        metric='euclidean',
        metric_params=None,
        n_jobs=1,
    )),
])
customCrossValidation(X=big_frame['text_clean'], y=big_frame[0])



In [None]:
# Same tf-idf features, classified with a decision tree.
text_clf = Pipeline([
    ('tfidf', TfidfVectorizer(
        min_df=5,
        max_df=0.7,
        sublinear_tf=True,
        ngram_range=(1, 2),
        analyzer='word',
        max_features=10000,
    )),
    # Tree fitting involves randomness, so the seed is pinned (same value as
    # the SGD pipeline uses) to make the reported scores reproducible.
    ('clf', DecisionTreeClassifier(random_state=42)),
])
customCrossValidation(big_frame['text_clean'], big_frame[0])
