In [4]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from nltk.corpus import stopwords


from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

In [5]:
def read_data(data_file,label_file):
    
    with open(data_file) as f:
        data = f.read().split('\n')
        
    with open(label_file) as f:
        labels = f.read().split('\n')
        
    return data,labels

In [6]:
dat,lab = read_data('baseline_hits.txt','baseline_hits_labels.txt')

In [7]:
# Pipeline and model building:

TF = TfidfVectorizer()
X = TF.fit_transform(dat)

# Logistic Regression

In [8]:
# Create training and test set
X_train, X_test, y_train, y_test = train_test_split(dat,lab,test_size=0.2,random_state=42)

# Stopwords in english
stop = stopwords.words('english')

pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop)),
    ('logreg', LogisticRegression())
])

#pipe.fit(X_train,y_train)
params = {"tfidf__lowercase":[True,False],
          "tfidf__max_df":[0.9,0.95,0.98,1.0],
          "tfidf__min_df":[1,2],
          "logreg__C":[0.001,0.01,0.1,1.0,10.0,100.0]}

searchLR = GridSearchCV(pipe,param_grid=params,n_jobs=-1,verbose=3,scoring='roc_auc',cv=4)

searchLR.fit(X_train,y_train)

Fitting 4 folds for each of 96 candidates, totalling 384 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    1.0s
[Parallel(n_jobs=-1)]: Done 316 tasks      | elapsed:    2.5s
[Parallel(n_jobs=-1)]: Done 384 out of 384 | elapsed:    2.7s finished


GridSearchCV(cv=4, error_score=nan,
             estimator=Pipeline(memory=None,
                                steps=[('tfidf',
                                        TfidfVectorizer(analyzer='word',
                                                        binary=False,
                                                        decode_error='strict',
                                                        dtype=<class 'numpy.float64'>,
                                                        encoding='utf-8',
                                                        input='content',
                                                        lowercase=True,
                                                        max_df=1.0,
                                                        max_features=None,
                                                        min_df=1,
                                                        ngram_range=(1, 1),
                                                        n

In [9]:
searchLR.best_params_

{'logreg__C': 10.0,
 'tfidf__lowercase': True,
 'tfidf__max_df': 0.9,
 'tfidf__min_df': 2}

In [10]:
searchLR.best_score_

0.5703125

### Next steps:
1. Take a look at the accuracy/precision for both the 0 and the 1 class
2. Try a Naive Bayes classifier
3. Try more text preprocessing than simply lower-case. We can make a function and use it in the pipeline as a tf-idf parameter (look into the "preprocessor" parameter: https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html)
4. run best-performing model on the test data.

### Build the best version of LR based on GridSearch above

In [97]:
TF_best = TfidfVectorizer(
    analyzer='word',
    binary=False,
    decode_error='strict',
    encoding='utf-8',
    input='content',
    lowercase=True,
    max_df=0.9,
    max_features=None,
    min_df=2,
    ngram_range=(1, 1),
    norm='l2',
    preprocessor=None,
    smooth_idf=True,
    stop_words= stop)
X_best = TF_best.fit_transform(dat)

X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best,lab,test_size=0.2,random_state=42)

LR_best = LogisticRegression(C=10)
LR_best.fit(X_train_best, y_train_best)

LR_best.score(X_test_best, y_test_best)

0.55

In [117]:
from sklearn.metrics import confusion_matrix

tn, fp, fn, tp = confusion_matrix(y_test_best, LR_best.predict(X_test_best)).ravel()

recall_best = tp/(tp+fn)
precision_best = tp/(tp+fp)
accuracy_best = (tp+tn)/(tp+tn+fp+fn)
f1_best = 2*((precision_best+recall_best)/(precision_best+recall_best))

print(f"Recall: {recall_best}\nPrecision: {precision_best}\nAccuracy: {accuracy_best}\nF1: {f1_best}")

Recall: 0.0
Precision: nan
Accuracy: 0.55
F1: nan


  


# Naive Bayes

In [100]:
from sklearn.naive_bayes import MultinomialNB

In [112]:
pipe = Pipeline([
    ('tfidf', TfidfVectorizer(stop_words=stop)),
    ('NB', MultinomialNB(alpha=0.001))
])

#pipe.fit(X_train,y_train)
params = {"tfidf__lowercase":[True,False],
          "tfidf__max_df":[0.9,0.95,0.98,1.0],
          "tfidf__min_df":[1,2],
          "NB__alpha": [0,0.0001,0.001,0.01,1],
          "NB__fit_prior": [True,False],
         }

searchNB = GridSearchCV(pipe,param_grid=params,n_jobs=-1,verbose=3,scoring='roc_auc',cv=4)

searchNB.fit(X_train,y_train);

Fitting 4 folds for each of 160 candidates, totalling 640 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done  16 tasks      | elapsed:    0.1s
[Parallel(n_jobs=-1)]: Done 640 out of 640 | elapsed:    2.2s finished


In [113]:
searchNB.best_params_

{'NB__alpha': 0.0001,
 'NB__fit_prior': True,
 'tfidf__lowercase': True,
 'tfidf__max_df': 0.9,
 'tfidf__min_df': 2}

In [114]:
searchNB.best_score_

0.56640625

In [115]:
searchNB.best_estimator_

Pipeline(memory=None,
         steps=[('tfidf',
                 TfidfVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.float64'>,
                                 encoding='utf-8', input='content',
                                 lowercase=True, max_df=0.9, max_features=None,
                                 min_df=2, ngram_range=(1, 1), norm='l2',
                                 preprocessor=None, smooth_idf=True,
                                 stop_words=['i', 'me', 'my', 'myself', 'we',
                                             'our', 'ours', 'ourselves', 'you',
                                             "you're", "you've", "you'll",
                                             "you'd", 'your', 'yours',
                                             'yourself', 'yourselves', 'he',
                                             'him', 'his', 'himself', 'she',
                   

### Build the best version of NB based on GridSearch above

In [116]:
TF_best = TfidfVectorizer(
    analyzer='word', binary=False,
    decode_error='strict',
    encoding='utf-8', input='content',
    lowercase=True, max_df=0.9, max_features=None,
    min_df=2, ngram_range=(1, 1), norm='l2',
    preprocessor=None, smooth_idf=True,
    stop_words=stop,
    strip_accents=None, sublinear_tf=False,
    token_pattern='(?u)\\b\\w\\w+\\b',
    tokenizer=None, use_idf=True,
    vocabulary=None
)
X_best = TF_best.fit_transform(dat)

X_train_best, X_test_best, y_train_best, y_test_best = train_test_split(X_best,lab,test_size=0.2,random_state=42)

NB_best = MultinomialNB(
    alpha=0.0001, 
    class_prior=None, 
    fit_prior=True
)

NB_best.fit(X_train_best, y_train_best)

NB_best.score(X_test_best, y_test_best)

0.5

In [118]:
tn, fp, fn, tp = confusion_matrix(y_test_best, NB_best.predict(X_test_best)).ravel()

recall_best = tp/(tp+fn)
precision_best = tp/(tp+fp)
accuracy_best = (tp+tn)/(tp+tn+fp+fn)
f1_best = 2*((precision_best+recall_best)/(precision_best+recall_best))

print(f"Recall: {recall_best}\nPrecision: {precision_best}\nAccuracy: {accuracy_best}\nF1: {f1_best}")

Recall: 0.0
Precision: 0.0
Accuracy: 0.5
F1: nan


  


In [6]:
a = np.array([1,2,3,4])
np.random.choice(a)

2