In [1]:
import pandas as pd
import numpy as np

import sklearn
from sklearn.naive_bayes  import MultinomialNB
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.metrics import f1_score

from feature_builder import process_dataset, add_text_embeddings, calculate_keyword_encoding
from hyperparameter_tuning import random_search

In [2]:
train_dataset = pd.read_csv('train.csv')

In [3]:
test_dataset = pd.read_csv('test.csv')

In [4]:
y = train_dataset.loc[:,'target']

In [100]:
logisticRegr = LogisticRegression()

In [90]:
param={'penalty': ['l1', 'l2', 'elasticnet', 'none'],
       'C': (0, 10, 0.1),
       'solver': ['lbfgs', 'newton-cg', 'liblinear', 'sag', 'saga'],
       'max_iter':[100, 1000, 2500, 5000],
       'multi_class': ['auto', 'ovr', 'multinomial']
      }

Primero pruebo con tf-idf

In [60]:
from sklearn.feature_extraction.text import TfidfVectorizer
v = TfidfVectorizer()
x_tfidf = v.fit_transform(train_dataset['text'])


In [61]:
x_train_tfidf, x_test_tfidf, y_train_tfidf, y_test_tfidf = train_test_split(x_tfidf, y, test_size = .33, random_state = 17)

In [91]:
random_search(x_train_tfidf,y_train_tfidf,logisticRegr,param)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    2.8s
[Parallel(n_jobs=4)]: Done 136 tasks      | elapsed:   27.1s
[Parallel(n_jobs=4)]: Done 193 out of 200 | elapsed:   29.3s remaining:    1.1s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:  3.2min finished



 Time taken: 0 hours 3 minutes and 13.66 seconds.

 Best f1 score with 5-folds and 40 combinations of hyperparameters:
0.7387110620486113

 Best hyperparameters:
{'solver': 'lbfgs', 'penalty': 'l2', 'multi_class': 'ovr', 'max_iter': 2500, 'C': 10}


In [93]:
logisticRegr.fit(x_train_tfidf, y_train_tfidf)

LogisticRegression(C=10, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='ovr', n_jobs=None, penalty='l2',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [94]:
y_pred_tfidf = logisticRegr.predict(x_test_tfidf)

In [95]:
f1_score(y_test_tfidf, y_pred_tfidf)

0.7512588116817723

0.7501272264631044 = verbose=1, solver='liblinear',random_state=0, C=5, penalty='l2',max_iter=1000

Ahora voy a probar con el process_dataset de feature_builder

In [12]:
x_processed = process_dataset(train_dataset)

Percentage of words covered in the embeddings = 0.4937444933920705


In [13]:
x_train_processed, x_test_processed, y_train_processed, y_test_processed = train_test_split(x_processed, y, test_size = .33, random_state = 17)

In [14]:
logisticRegr.fit(x_train_processed, y_train_processed)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)

In [15]:
y_pred_processed = logisticRegr.predict(x_test_processed)

In [16]:
f1_score(y_test_processed, y_pred_processed)

0.7131264916467781

In [5]:
x_embedd = train_dataset.copy()
add_text_embeddings(x_embedd, False, 'embeddings')
x_embedd.drop(['text', 'location', 'keyword', 'id', 'target'], axis=1, inplace=True)

Percentage of words covered in the embeddings = 0.4937444933920705


Me quede solo con los embeddings ya que el resto deteriora el algoritmo por alguna razón.

In [27]:
x_train_embedd, x_test_embedd, y_train_embedd, y_test_embedd = train_test_split(x_embedd, y, test_size = .30, random_state = 17)

In [103]:
random_search(x_train_embedd,y_train_embedd,logisticRegr,param)

Fitting 5 folds for each of 40 candidates, totalling 200 fits


[Parallel(n_jobs=4)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=4)]: Done  24 tasks      | elapsed:    3.5s
[Parallel(n_jobs=4)]: Done 120 tasks      | elapsed:   22.4s
[Parallel(n_jobs=4)]: Done 193 out of 200 | elapsed:   33.1s remaining:    1.2s
[Parallel(n_jobs=4)]: Done 200 out of 200 | elapsed:   36.6s finished



 Time taken: 0 hours 0 minutes and 37.02 seconds.

 Best f1 score with 5-folds and 40 combinations of hyperparameters:
0.7474323602109954

 Best hyperparameters:
{'solver': 'liblinear', 'penalty': 'l1', 'multi_class': 'auto', 'max_iter': 1000, 'C': 10}


In [28]:
logisticRegr = LogisticRegression(solver='liblinear', penalty='l1', multi_class='auto', max_iter=1000, C=1)

In [29]:
logisticRegr.fit(x_train_embedd, y_train_embedd)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [30]:
y_pred_embedd = logisticRegr.predict(x_test_embedd)

In [31]:
f1_score(y_test_embedd, y_pred_embedd)

0.7682539682539683

In [26]:
0.7645914396887159

0.7645914396887159

In [None]:
['mean_encode', 'url_count', 'number_count', 'space_percentage', 'unique_chars', 'word_density', 'capitals']

In [10]:
x = process_dataset(train_dataset, text_type=None)

In [32]:
x_2 = x[['mean_encode', 'url_count', 'number_count', 'space_percentage', 'unique_chars', 'word_density', 'capitals']]

In [24]:
logisticRegr = LogisticRegression(solver='liblinear', penalty='l1', multi_class='auto', max_iter=1000, C=1)

In [33]:
x_train, x_test, y_train, y_test = train_test_split(x_2, y, test_size = .33, random_state = 17)

In [34]:
logisticRegr.fit(x_train, y_train)

LogisticRegression(C=1, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=1000,
                   multi_class='auto', n_jobs=None, penalty='l1',
                   random_state=None, solver='liblinear', tol=0.0001, verbose=0,
                   warm_start=False)

In [36]:
y_pred = logisticRegr.predict(x_test)

In [37]:
f1_score(y_test, y_pred)

0.7045889101338432

In [None]:
0.7100760456273765