In [1]:
import pandas as pd
import numpy as np
from scipy.sparse import hstack

from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score
from sklearn.model_selection import GridSearchCV

from sklearn.feature_extraction.text import TfidfVectorizer

from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.linear_model import LogisticRegression

In [13]:
train = pd.read_csv('final_train.csv')

#Completo los NaN que se generan al importar del CSV
train.update(train[['hashtags', 'mentions', 'urls', 'clean_text', 'punctuation_signs', 'lemma_text', 'porter_stemmed_text', 'snowball_stemmed_text']].fillna(""))

test = train.loc[train['source'] == 'test']
train = train.loc[train['source'] == 'train']

In [4]:
def generateSubmission(predicts, model):
    submission = test[['id', 'target']]
    submission['target'] = predicts
    submission['target'] = submission['target'].astype(int)
    submission.to_csv(model + '_pred.csv', index=False)

In [5]:
train_numeric = train[['entities_count', 'words_count', 'punctuations_signs_count', 'hashtags_count', 'mentions_count', 'urls_count', 
           'stopwords_count', 'words_length_avg', 'punctuations_ratio', 'mentions_ratio', 'urls_ratio', 'stopwords_ratio',
           'special_entities_ratio', 'keyword_cv_mean_enc']]

In [6]:
def tdfIdfVectorization(columnName):
    vector = TfidfVectorizer(sublinear_tf=True)
    vector.fit(np.array(train[columnName]).ravel())
    
    return vector

### Prueba todo junto

In [7]:
text_vector = tdfIdfVectorization('lemma_text')
hashtags_vector = tdfIdfVectorization('hashtags')
mentions_vector = tdfIdfVectorization('mentions')
urls_vector = tdfIdfVectorization('urls')

In [8]:
x1 = text_vector.transform(np.array(train['lemma_text']).ravel())
x2 = hashtags_vector.transform(np.array(train['hashtags']).ravel())
x3 = mentions_vector.transform(np.array(train['mentions']).ravel())
x4 = urls_vector.transform(np.array(train['urls']).ravel())
XT = hstack((x1, x2, x3, x4))
y = np.array(train['target']).ravel()
X_train, X_test, y_train, y_test = train_test_split(XT, y, test_size=0.2, random_state=30)

In [17]:
train_numeric.columns[train_numeric.isna().any()].tolist()

[]

In [16]:
hstack(np.array(train_numeric).ravel(), XT)

TypeError: ufunc 'isnan' not supported for the input types, and the inputs could not be safely coerced to any supported types according to the casting rule ''safe''

In [None]:
## NaiveBayes
nb_model = MultinomialNB()
nb_model.fit(X_train, y_train)
predicts = nb_model.predict(X_test)
print("Score NB:", f1_score(y_test, predicts))

In [None]:
## Logistic Regression
lr_model = LogisticRegression(solver='liblinear')
lr_model.fit(X_train, y_train)
predicts = lr_model.predict(X_test)
print("Score LR:", f1_score(y_test, predicts))

In [None]:
#Predigo para lo real
x1_r = text_vector.transform(np.array(test['lemma_text']).ravel())
x2_r = hashtags_vector.transform(np.array(test['hashtags']).ravel())
x3_r = mentions_vector.transform(np.array(test['mentions']).ravel())
x4_r = urls_vector.transform(np.array(test['urls']).ravel())
XT_r = hstack((x1_r, x2_r, x3_r, x4_r))

In [None]:
subm = nb_model.predict(XT_r)

In [None]:
generateSubmission(subm, 'nb_full')

## RandomForest

In [None]:
def randomForestGridSearch():
    X_train, X_test, y_train, y_test = train_test_split(train_numeric.values, train['target'].values, test_size=0.2, random_state=530)
    
    rf_folds = 5
    rf_grid = dict(n_estimators=[3, 6, 9, 12], max_depth=[4, 8, 12], min_samples_split=[15, 30, 45, 60], min_samples_leaf=[15, 30, 45, 60], max_features=[5, 10, 13])
    clf = GridSearchCV(estimator=RandomForestClassifier(random_state=51), param_grid=rf_grid, n_jobs=-1, cv=rf_folds, scoring='f1', error_score=0)
    clf.fit(X_train, y_train)
    print("Mejores parametros encontrados:", clf.best_params_)

    rf_best_model = RandomForestClassifier(random_state=51, n_estimators=clf.best_params_['n_estimators'], max_features=clf.best_params_['max_features'], min_samples_leaf=clf.best_params_['min_samples_leaf'], min_samples_split=clf.best_params_['min_samples_split'], max_depth=clf.best_params_['max_depth'])
    rf_best_model.fit(X_train, y_train)
    preds = rf_best_model.predict(X_test)
    print("Score RF:", f1_score(y_test, preds))
    
    print(train_numeric.columns)
    print(rfModel.feature_importances_)
    
    return preds

In [None]:
print(train_numeric.columns)
print(rfModel.feature_importances_)

## Naive Bayes

In [None]:
def naiveBayesGridSearch(columnName):
    vector = tdfIdfVectorization(columnName)
    
    X = vector.transform(np.array(train[columnName]).ravel())
    y = np.array(train['target']).ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
    
    nb_model = MultinomialNB()
    nb_model.fit(X_train, y_train)
    predicts = nb_model.predict(X_test)
    print("Score NB:", f1_score(y_test, predicts))
    
    return predicts

In [None]:
naiveBayesGridSearch('lemma_text')

## Logistic Regression

In [None]:
def logisticRegressionGridSearch(columnName):
    vector = tdfIdfVectorization(columnName)
    
    X = vector.transform(np.array(train[columnName]).ravel())
    y = np.array(train['target']).ravel()
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)
    
    lr_model = LogisticRegression(solver='lbfgs')
    lr_model.fit(X_train, y_train)
    predicts = lr_model.predict(X_test)
    print("Score LR:", f1_score(y_test, predicts))
    
    return predicts